In [1]:
# import libraries
import pandas as pd
import numpy as np 

In [2]:
# load the raw games table from the CSV file into a dataframe
df = pd.read_csv(filepath_or_buffer='games.csv')

In [3]:
# check size: (number of rows, number of columns)
df.shape

(16719, 16)

In [4]:
# brief look at columns, their data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


We can change data type for column 'Platform' to category and reduce memory usage

In [5]:
# store 'Platform' as a categorical column instead of generic objects
df['Platform'] = df['Platform'].astype('category')

In [6]:
# re-inspect dtypes after the Platform conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             16717 non-null  object  
 1   Platform         16719 non-null  category
 2   Year_of_Release  16450 non-null  float64 
 3   Genre            16717 non-null  object  
 4   Publisher        16665 non-null  object  
 5   NA_Sales         16719 non-null  float64 
 6   EU_Sales         16719 non-null  float64 
 7   JP_Sales         16719 non-null  float64 
 8   Other_Sales      16719 non-null  float64 
 9   Global_Sales     16719 non-null  float64 
 10  Critic_Score     8137 non-null   float64 
 11  Critic_Count     8137 non-null   float64 
 12  User_Score       10015 non-null  object  
 13  User_Count       7590 non-null   float64 
 14  Developer        10096 non-null  object  
 15  Rating           9950 non-null   object  
dtypes: category(1), float64(9), object(6)
me

Let's do the same for 'Genre'

In [7]:
# number of distinct genres (missing values are not counted)
df.Genre.nunique()

12

In [8]:
# store 'Genre' as a categorical column as well
df['Genre'] = df['Genre'].astype('category')

In [9]:
# re-inspect dtypes after the Genre conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             16717 non-null  object  
 1   Platform         16719 non-null  category
 2   Year_of_Release  16450 non-null  float64 
 3   Genre            16717 non-null  category
 4   Publisher        16665 non-null  object  
 5   NA_Sales         16719 non-null  float64 
 6   EU_Sales         16719 non-null  float64 
 7   JP_Sales         16719 non-null  float64 
 8   Other_Sales      16719 non-null  float64 
 9   Global_Sales     16719 non-null  float64 
 10  Critic_Score     8137 non-null   float64 
 11  Critic_Count     8137 non-null   float64 
 12  User_Score       10015 non-null  object  
 13  User_Count       7590 non-null   float64 
 14  Developer        10096 non-null  object  
 15  Rating           9950 non-null   object  
dtypes: category(2), float64(9), object(5)
me

Now we are not using as much memory. Let's see what the dataframe looks like.

In [10]:
# display the dataframe (the rendered output shows only the first and last rows)
df

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


It is not clear what the 'Rating' column means.

In [11]:
# check values for column Rating; dropna=False lists NaN as its own entry
df.Rating.value_counts(dropna=False)

NaN     6769
E       3991
T       2961
M       1563
E10+    1420
EC         8
K-A        3
RP         3
AO         1
Name: Rating, dtype: int64

After some searching, I found out that each of these codes denotes the minimum age from which a game is considered suitable. "RP" means that no rating has been assigned yet, so we can replace it with NaN.

In [12]:
# replace the 'RP' placeholder (no rating assigned yet) with a missing value.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
# and raises AttributeError there.
df['Rating'] = df['Rating'].replace('RP', np.nan)

In [13]:
# check if everything is correct: 'RP' should be gone and the NaN count should have grown
df.Rating.value_counts(dropna=False)

NaN     6772
E       3991
T       2961
M       1563
E10+    1420
EC         8
K-A        3
AO         1
Name: Rating, dtype: int64

In [14]:
# number of missing values in each column
df.isna().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6772
dtype: int64

In [15]:
# drop every row that lacks a value in 'Name', 'Genre' or 'Publisher',
# then renumber the index from 0
required_columns = ['Name', 'Genre', 'Publisher']
df = df.dropna(subset=required_columns).reset_index(drop=True)

In [16]:
# recount missing values after dropping rows without Name, Genre or Publisher
df.isna().sum()

Name                  0
Platform              0
Year_of_Release     247
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8530
Critic_Count       8530
User_Score         6656
User_Count         9079
Developer          6576
Rating             6723
dtype: int64

In [17]:
# rows whose release year is missing
df[df['Year_of_Release'].isna()]

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
183,Madden NFL 2004,PS2,,Sports,Electronic Arts,4.26,0.26,0.01,0.71,5.23,94.0,29.0,8.5,140.0,EA Tiburon,E
377,FIFA Soccer 2004,PS2,,Sports,Electronic Arts,0.59,2.36,0.04,0.51,3.49,84.0,20.0,6.4,76.0,EA Canada,E
456,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,1.80,0.97,0.00,0.29,3.06,74.0,17.0,7.9,22.0,Traveller's Tales,E10+
608,Space Invaders,2600,,Shooter,Atari,2.36,0.14,0.00,0.03,2.53,,,,,,
626,Rock Band,X360,,Misc,Electronic Arts,1.93,0.33,0.00,0.21,2.47,92.0,72.0,8.2,178.0,Harmonix Music Systems,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16321,PDC World Championship Darts 2008,PSP,,Sports,Oxygen Interactive,0.01,0.00,0.00,0.00,0.01,43.0,7.0,tbd,,Oxygen Interactive,E10+
16354,Freaky Flyers,GC,,Racing,Unknown,0.01,0.00,0.00,0.00,0.01,69.0,14.0,6.5,6.0,Midway,T
16397,Inversion,PC,,Shooter,Namco Bandai Games,0.01,0.00,0.00,0.00,0.01,59.0,6.0,6.7,107.0,Saber Interactive,M
16407,Hakuouki: Shinsengumi Kitan,PS3,,Adventure,Unknown,0.01,0.00,0.00,0.00,0.01,,,,,,


Sometimes the release year appears in the 'Name' column even though it is missing from the 'Year_of_Release' column. Let's extract it.

In [18]:
# Recover missing release years from titles that embed one (e.g. 'Madden NFL 2004').
# The pattern takes the first standalone four-digit number that looks like a
# plausible year (1970-2029). The lookarounds stop it from matching inside a
# longer digit run, and the range check rejects non-year numbers such as '2600'.
# Existing values in 'Year_of_Release' are never overwritten; only NaNs are filled.
# NOTE(review): the year in a title may differ from the actual release year -
# confirm the filled values are accurate enough for the analysis.
year_in_name = df['Name'].str.extract(
    r'(?<!\d)((?:19[7-9]|20[0-2])\d)(?!\d)', expand=False
).astype(float)
df['Year_of_Release'] = df['Year_of_Release'].fillna(year_in_name)

In [19]:
# recount missing values after filling years from the game names
df.isna().sum()

Name                  0
Platform              0
Year_of_Release     231
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8530
Critic_Count       8530
User_Score         6656
User_Count         9079
Developer          6576
Rating             6723
dtype: int64

Now there are fewer missing values in this column. Let's take a look to see if we can do something else.

In [20]:
# first 25 rows that still have no release year
df[df['Year_of_Release'].isna()].head(25)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
456,LEGO Batman: The Videogame,Wii,,Action,Warner Bros. Interactive Entertainment,1.8,0.97,0.0,0.29,3.06,74.0,17.0,7.9,22.0,Traveller's Tales,E10+
608,Space Invaders,2600,,Shooter,Atari,2.36,0.14,0.0,0.03,2.53,,,,,,
626,Rock Band,X360,,Misc,Electronic Arts,1.93,0.33,0.0,0.21,2.47,92.0,72.0,8.2,178.0,Harmonix Music Systems,T
656,Frogger's Adventures: Temple of the Frog,GBA,,Adventure,Konami Digital Entertainment,2.15,0.18,0.0,0.07,2.39,73.0,4.0,tbd,,Konami Computer Entertainment Hawaii,E
676,LEGO Indiana Jones: The Original Adventures,Wii,,Action,LucasArts,1.51,0.61,0.0,0.21,2.34,78.0,22.0,6.6,28.0,Traveller's Tales,E10+
717,Call of Duty 3,Wii,,Shooter,Activision,1.17,0.84,0.0,0.23,2.24,69.0,42.0,6.7,61.0,Exakt,T
803,Rock Band,Wii,,Misc,MTV Games,1.33,0.56,0.0,0.2,2.08,80.0,21.0,6.3,37.0,Harmonix Music Systems,T
1129,Call of Duty: Black Ops,PC,,Shooter,Activision,0.58,0.81,0.0,0.23,1.63,81.0,29.0,5.2,1651.0,Treyarch,M
1140,Rock Band,PS3,,Misc,Electronic Arts,0.99,0.41,0.0,0.22,1.62,92.0,35.0,8.4,107.0,Harmonix Music Systems,T
1503,Adventure,2600,,Adventure,Atari,1.21,0.08,0.0,0.01,1.3,,,,,,


In [21]:
# last 15 rows that still have no release year
df[df['Year_of_Release'].isna()].tail(15)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
15707,Virtua Quest,PS2,,Role-Playing,Unknown,0.01,0.01,0.0,0.0,0.02,53.0,17.0,7.6,5.0,Artificial Mind and Movement,T
15748,Shonen Jump's Yu-Gi-Oh! GX Card Almanac,DS,,Misc,Konami Digital Entertainment,0.0,0.0,0.02,0.0,0.02,,,,,,
15769,Without Warning,XB,,Shooter,Capcom,0.01,0.0,0.0,0.0,0.02,45.0,25.0,2,5.0,CiRCLE,M
15919,Dinotopia: The Sunstone Odyssey,GC,,Action,Unknown,0.01,0.0,0.0,0.0,0.02,50.0,4.0,tbd,,Vicious Cycle,T
15951,Jet Impulse,DS,,Simulation,Nintendo,0.0,0.0,0.02,0.0,0.02,,,,,,
16011,Dance! It's Your Stage,Wii,,Misc,DTP Entertainment,0.0,0.01,0.0,0.0,0.02,,,,,,
16032,Ferrari: The Race Experience,Wii,,Racing,System 3 Arcade Software,0.0,0.01,0.0,0.0,0.01,54.0,4.0,,,System 3,E
16109,Aquaman: Battle for Atlantis,XB,,Action,Unknown,0.01,0.0,0.0,0.0,0.01,26.0,13.0,2.7,15.0,Lucky Chicken,T
16132,WRC: FIA World Rally Championship,PC,,Racing,Black Bean Games,0.0,0.01,0.0,0.0,0.01,65.0,7.0,,,Black Bean Games,
16240,Shorts,DS,,Platform,Unknown,0.01,0.0,0.0,0.0,0.01,,,tbd,,Artificial Mind and Movement,E10+


Seems like we do not have any other possibilities to replace NaNs

In [22]:
# discard the rows that still have no release year, then renumber the index from 0
rows_with_year = df.dropna(subset=['Year_of_Release'])
df = rows_with_year.reset_index(drop=True)

In [23]:
# recount missing values after dropping rows without a release year
df.isna().sum()

Name                  0
Platform              0
Year_of_Release       0
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8438
Critic_Count       8438
User_Score         6583
User_Count         8961
Developer          6516
Rating             6655
dtype: int64

Column 'Critic_Count' tells us how many critics gave a score to the given game. Does it make sense to keep scores for games where only 1 or 2 critics have voted?

In [24]:
# mean of Critic_Count over the rows that have a value (NaN is skipped by default)
df.Critic_Count.mean()

26.427320490367777

On average we have 26 critics per game

In [25]:
# rows where fewer than 3 critics contributed to the score
df[df['Critic_Count']<3]

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating


Let's do the same for User_Count

In [26]:
# rows where fewer than 3 users contributed to the score
df[df['User_Count']<3]

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating


Ok, so we do not have situations when the score of the game is based on 1 or 2 people, now we can remove these columns

In [27]:
# the two vote-count columns are no longer needed; remove them
count_columns = ['Critic_Count', 'User_Count']
df = df.drop(count_columns, axis='columns')

In [28]:
# recount missing values now that the two count columns are gone
df.isna().sum()

Name                  0
Platform              0
Year_of_Release       0
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8438
User_Score         6583
Developer          6516
Rating             6655
dtype: int64

In [29]:
# re-inspect column dtypes and non-null counts after the cleaning steps so far
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16432 entries, 0 to 16431
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             16432 non-null  object  
 1   Platform         16432 non-null  category
 2   Year_of_Release  16432 non-null  float64 
 3   Genre            16432 non-null  category
 4   Publisher        16432 non-null  object  
 5   NA_Sales         16432 non-null  float64 
 6   EU_Sales         16432 non-null  float64 
 7   JP_Sales         16432 non-null  float64 
 8   Other_Sales      16432 non-null  float64 
 9   Global_Sales     16432 non-null  float64 
 10  Critic_Score     7994 non-null   float64 
 11  User_Score       9849 non-null   object  
 12  Developer        9916 non-null   object  
 13  Rating           9777 non-null   object  
dtypes: category(2), float64(7), object(5)
memory usage: 1.5+ MB


Hmm, User_Score has object data type, but it should be float

In [30]:
# frequency of every distinct User_Score value (NaN is excluded by default)
df.User_Score.value_counts()

tbd    2378
7.8     322
8       285
8.2     277
8.3     252
       ... 
1.1       2
1.9       2
9.6       2
0         1
9.7       1
Name: User_Score, Length: 96, dtype: int64

We need to replace 'tbd' with NaN, 'tbd' does not give us any information 

In [31]:
# 'tbd' is a placeholder rather than a score, so treat it as a missing value.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
# and raises AttributeError there.
df['User_Score'] = df['User_Score'].replace('tbd', np.nan)

In [32]:
# check that 'tbd' no longer appears among the User_Score values
df.User_Score.value_counts()

7.8    322
8      285
8.2    277
8.3    252
8.5    249
      ... 
1.5      2
0.3      2
1.1      2
0        1
9.7      1
Name: User_Score, Length: 95, dtype: int64

In [33]:
# with 'tbd' removed, the remaining score strings can be converted to floats
df['User_Score'] = df['User_Score'].astype('float64')

In [34]:
# confirm that User_Score is now float64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16432 entries, 0 to 16431
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             16432 non-null  object  
 1   Platform         16432 non-null  category
 2   Year_of_Release  16432 non-null  float64 
 3   Genre            16432 non-null  category
 4   Publisher        16432 non-null  object  
 5   NA_Sales         16432 non-null  float64 
 6   EU_Sales         16432 non-null  float64 
 7   JP_Sales         16432 non-null  float64 
 8   Other_Sales      16432 non-null  float64 
 9   Global_Sales     16432 non-null  float64 
 10  Critic_Score     7994 non-null   float64 
 11  User_Score       7471 non-null   float64 
 12  Developer        9916 non-null   object  
 13  Rating           9777 non-null   object  
dtypes: category(2), float64(8), object(4)
memory usage: 1.5+ MB


In [35]:
# recount missing values after converting User_Score to float
df.isna().sum()

Name                  0
Platform              0
Year_of_Release       0
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8438
User_Score         8961
Developer          6516
Rating             6655
dtype: int64

We have a lot of missing values for columns 'Critic_Score' and 'User_Score'. We can try to find correlation between scores and sales in different regions. 

In [36]:
# correlation between the critic score and the sales figures of each region
critic_score = df['Critic_Score']
NA_c, EU_c, JP_c, Other_c, Global_c = [
    df[sales_col].corr(critic_score)
    for sales_col in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales')
]
for coefficient, label in (
    (NA_c, 'corr in NA'),
    (EU_c, 'corr in EU'),
    (JP_c, 'corr in JP'),
    (Other_c, 'corr in Other'),
    (Global_c, 'corr in Global'),
):
    print(coefficient, label)

0.24066820993617297 corr in NA
0.2214014669796582 corr in EU
0.15317741223126294 corr in JP
0.19888851855769324 corr in Other
0.24575127364307878 corr in Global


All correlations are too small, let's do the same with User_Score

In [37]:
# correlation between the user score and the sales figures of each region
user_score = df['User_Score']
NA_c, EU_c, JP_c, Other_c, Global_c = [
    df[sales_col].corr(user_score)
    for sales_col in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales')
]
for coefficient, label in (
    (NA_c, 'corr in NA'),
    (EU_c, 'corr in EU'),
    (JP_c, 'corr in JP'),
    (Other_c, 'corr in Other'),
    (Global_c, 'corr in Global'),
):
    print(coefficient, label)

0.08570359558394258 corr in NA
0.05509865754209435 corr in EU
0.12607109252494403 corr in JP
0.05684477341126955 corr in Other
0.08784045253513086 corr in Global


Same thing here. I am thinking about another option 

In [38]:
# number of genres, obtained by counting the entries of the value_counts() result
df.Genre.value_counts().count()

12

We have 12 unique genres. Let's replace each of them with an integer and then try to find correlations.

In [39]:
# number of games in each genre, most frequent first
df.Genre.value_counts()

Action          3308
Sports          2318
Misc            1697
Role-Playing    1483
Shooter         1296
Adventure       1291
Racing          1228
Platform         878
Simulation       855
Fighting         837
Strategy         672
Puzzle           569
Name: Genre, dtype: int64

In [40]:
# integer code for every genre: 1 for the most frequent genre down to 12 for
# the least frequent, following the order of the value_counts() output
genre_names = [
    'Action', 'Sports', 'Misc', 'Role-Playing', 'Shooter', 'Adventure',
    'Racing', 'Platform', 'Simulation', 'Fighting', 'Strategy', 'Puzzle',
]
genre_dict = {name: code for code, name in enumerate(genre_names, start=1)}

In [41]:
# translate every genre name to its integer code and keep the result in a new column
genre_codes = df['Genre'].map(genre_dict)
df['Genre_numeric'] = genre_codes

In [42]:
# display the dataframe with the new Genre_numeric column
df

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,User_Score,Developer,Rating,Genre_numeric
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,8.0,Nintendo,E,2
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,8
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,8.3,Nintendo,E,7
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,8.0,Nintendo,E,2
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16427,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,1
16428,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,2
16429,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,6
16430,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,8


In [43]:
# correlation of the integer genre code with the user score and the critic score
genre_numeric = df['Genre_numeric']
G_u = genre_numeric.corr(df['User_Score'])
G_c = genre_numeric.corr(df['Critic_Score'])
print(G_u, 'corr User')
print(G_c, 'corr Critic')

0.04842453228410118 corr User
0.013849965429912894 corr Critic


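Unfortunately, this approach does not work either. (Note that the integer codes are arbitrary labels for a categorical variable, so a correlation computed on them would be hard to interpret in any case.)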

In [44]:
# the helper column is no longer needed; remove it
df = df.drop('Genre_numeric', axis='columns')

In [45]:
# display the dataframe after removing Genre_numeric
df

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,User_Score,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,8.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,8.3,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,8.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16427,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,
16428,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,
16429,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,
16430,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,


In [48]:
# check for duplicates: number of rows that are identical to an earlier row in every column
df.duplicated().sum()

0

In [46]:
# write the cleaned table to a CSV file for the visualization step
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm that this is wanted downstream.
df.to_csv(path_or_buf='games_visualization.csv')

This is the end of our work in Pandas. Now, let's move to Power BI