# Préparation des données

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("vgsales.csv", sep =",")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [4]:
for col in df :
    print(col, df[col].nunique())

Rank 16598
Name 11493
Platform 31
Year 39
Genre 12
Publisher 578
NA_Sales 409
EU_Sales 305
JP_Sales 244
Other_Sales 157
Global_Sales 623


In [5]:
df["Genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [6]:
df["Platform"].unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)

In [7]:
df.isnull().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

_On choisit de remplacer les NaN par "Unknown" et de supprimer les lignes avec des données inconnues (suppression de 1.6% des données)_

In [8]:
df["Publisher"].fillna("Unknown", inplace=True)

In [9]:
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [11]:
df["Year"] = df["Year"].astype(int)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16327 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16327 non-null  int64  
 1   Name          16327 non-null  object 
 2   Platform      16327 non-null  object 
 3   Year          16327 non-null  int32  
 4   Genre         16327 non-null  object 
 5   Publisher     16327 non-null  object 
 6   NA_Sales      16327 non-null  float64
 7   EU_Sales      16327 non-null  float64
 8   JP_Sales      16327 non-null  float64
 9   Other_Sales   16327 non-null  float64
 10  Global_Sales  16327 non-null  float64
dtypes: float64(5), int32(1), int64(1), object(4)
memory usage: 1.4+ MB


In [13]:
def creer_dico(nom_dico, colonne):
    for i, colonne in enumerate(colonne.unique()):
        nom_dico[i+1] = colonne

In [14]:
dico_platforms = {}
dico_publishers = {}
dico_genres = {}
creer_dico(dico_platforms, df["Platform"])
creer_dico(dico_publishers, df["Publisher"])
creer_dico(dico_genres, df["Genre"])

In [15]:
for key, value in dico_publishers.items() :
    df["Publisher"].loc[df["Publisher"] == value] = key
for key, value in dico_platforms.items() :
    df["Platform"].loc[df["Platform"] == value] = key
for key, value in dico_genres.items() :
    df["Genre"].loc[df["Genre"] == value] = key

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,1,2006,1,1,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,2,1985,2,1,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,1,2008,3,1,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,1,2009,1,1,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,3,1996,4,1,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,3,1989,5,1,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,4,2006,2,1,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,1,2006,6,1,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,1,2009,2,1,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,2,1984,7,1,26.93,0.63,0.28,0.47,28.31


In [17]:
autoincrement = np.array([i+1 for i in range(df.shape[0])])

In [18]:
df.insert(0,"id", autoincrement)

In [19]:
df.to_csv("videogames.csv", index=False)