In [48]:
import pandas as pd
import numpy as np

In [49]:
# Load the five raw CSV exports. The paths are listed in the same order as the
# names on the left-hand side, so each frame is bound to its own file.
df_games, df_indies, df_studios, df_countries, df_cities = (
    pd.read_csv(raw_path, sep=",")
    for raw_path in (
        "Data/Raw_Data/games.csv",
        "Data/Raw_Data/indie-games-developers.csv",
        "Data/Raw_Data/video-games-developers.csv",
        "Data/Raw_Data/Country-data.csv",
        "Data/Raw_Data/worldcities.csv",
    )
)


In [50]:
# For games, we keep the following columns only.
# .copy() turns the selection into an independent frame, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
df_games = df_games[['AppID', 'Name', 'Release date', 'Estimated owners',
'Required age', 'Price', 'DLCcount', 'About the game',
'Supported languages','Windows','Mac', 'Linux', 'Metacritic score', 'User score',
'Positive', 'Negative', 'Achievements','Average playtime forever',
'Developers','Categories', 'Genres']].copy()

# Cleaning Supported languages column: strip the quote and bracket characters.
# regex=False makes every pattern a literal string; a lone "[" is not a valid
# regular expression, so the result must not depend on the pandas default.
df_games['Supported languages'] = (
    df_games['Supported languages']
    .str.replace("'", '', regex=False)
    .str.replace("[", '', regex=False)
    .str.replace("]", '', regex=False)
)

# Checking for the Primary Key: number of rows that share their AppID with
# another row (0 means AppID is unique).
print(df_games.duplicated(subset=['AppID'], keep=False).sum())

df_games.head(2)


0


Unnamed: 0,AppID,Name,Release date,Estimated owners,Required age,Price,DLCcount,About the game,Supported languages,Windows,...,Linux,Metacritic score,User score,Positive,Negative,Achievements,Average playtime forever,Developers,Categories,Genres
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,English,True,...,False,0,0,6,11,30,0,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports"
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"English, French, Italian, German, Spanish - Sp...",True,...,False,0,0,53,5,12,0,Rusty Moyher,"Single-player,Steam Achievements,Full controll...","Action,Indie"


In [51]:
# Primary-key check: count the rows whose 'country' value appears more than
# once (keep=False flags every member of a duplicate group).
country_dup_mask = df_countries.duplicated(subset=['country'], keep=False)
print(country_dup_mask.sum())

df_countries.head(2)


0


Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090


In [52]:
# Primary-key check: count the rows whose 'id' value appears more than once
# (keep=False flags every member of a duplicate group).
city_id_dup_mask = df_cities.duplicated(subset=['id'], keep=False)
print(city_id_dup_mask.sum())

df_cities.head(2)

0


Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077


In [53]:
# Checking for the Primary Key: rows sharing a 'Developer' value before cleaning
print(df_indies.duplicated(subset=['Developer'], keep=False).sum())

# Dropping the duplicated rows; the first occurrence of each developer is kept.
# Reassigning (instead of inplace=True) binds the name to a new frame rather
# than mutating the frame loaded earlier.
df_indies = df_indies.drop_duplicates(subset=['Developer'])

# Checking for the Primary Key again: this count is 0 once 'Developer' is unique
print(df_indies.duplicated(subset=['Developer'], keep=False).sum())

df_indies.head(2)

2
0


Unnamed: 0,Developer,City,Autonomous area,Country,Notable games,Notes
0,11 bit studios,Warsaw,,Poland,AnomalyThis War of MineFrostpunk,
1,ACE Team,Santiago,Santiago,Chile,Zeno ClashRock of Ages,


In [54]:
# Primary-key check: count the rows whose 'Developer' value appears more than
# once (keep=False flags every member of a duplicate group).
studio_dup_mask = df_studios.duplicated(subset=['Developer'], keep=False)
print(studio_dup_mask.sum())

df_studios.head(2)


0


Unnamed: 0,Developer,City,Administrative division,Country,Est.,"Notable games, series or franchises",Notes
0,0verflow,Tokyo,,Japan,1997,School DaysSummer DaysCross Days,Visual Novel brand (both developer and publisher)
1,11 bit studios,Warsaw,Masovian Voivodeship,Poland,2010,Frostpunk,Indie developer/publisher


In [55]:
# Checking Foreign Key between games and studios

# We drop the rows with missing values for Developers.
# Reassigning (instead of inplace=True) avoids mutating a frame that may have
# been derived from another one.
df_games = df_games.dropna(subset=['Developers'])

# Number of distinct developer names before the foreign-key filter
print(df_games['Developers'].unique().shape)

# We keep only the games whose developer appears in df_studios: this is the
# foreign key from games to studios. df_indies is not consulted here.
df_games = df_games[df_games['Developers'].isin(df_studios['Developer'])]



(64655,)


In [56]:
def _split_labels(cell):
    """Return the comma-separated labels of one cell; a missing cell has none."""
    # pd.notna recognises any NaN/None value, while an identity test against
    # np.nan only matches that one singleton object.
    return cell.split(',') if pd.notna(cell) else []


def _sorted_unique_labels(column):
    """Return every distinct label found in `column`, sorted."""
    # A set makes de-duplication linear instead of scanning a list per label.
    return sorted({label for cell in column for label in _split_labels(cell)})


def _link_rows(app_ids, column, label_ids):
    """Return one [AppID, label ID] pair for each label carried by each game."""
    return [
        [app_id, label_ids[label]]
        for app_id, cell in zip(app_ids, column)
        for label in _split_labels(cell)
    ]


# Lookup tables: each distinct category / genre gets its position in the
# sorted list as ID.
list_cat = _sorted_unique_labels(df_games['Categories'])
list_genres = _sorted_unique_labels(df_games['Genres'])

df_cat = pd.DataFrame({'Categories': list_cat, 'CategoryID': range(len(list_cat))})
df_genres = pd.DataFrame({'Genres': list_genres, 'GenreID': range(len(list_genres))})

df_cat.to_csv('Data/Clean_Data/categories.csv', index=False)
df_genres.to_csv('Data/Clean_Data/genres.csv', index=False)

# Dict lookups replace list.index(), which rescans the list for every label.
cat_ids = {label: idx for idx, label in enumerate(list_cat)}
genre_ids = {label: idx for idx, label in enumerate(list_genres)}

# Link tables (many-to-many): one row per (game, category) and (game, genre).
categories_to_games = _link_rows(df_games['AppID'], df_games['Categories'], cat_ids)
genres_to_games = _link_rows(df_games['AppID'], df_games['Genres'], genre_ids)

df_categories_to_games = pd.DataFrame(categories_to_games, columns=['AppID', 'CategoryID'])
# NOTE(review): this column holds GenreID values but is headed 'Genre'; the
# header is kept as-is because consumers of genres_to_games.csv may rely on it.
df_genres_to_games = pd.DataFrame(genres_to_games, columns=['AppID', 'Genre'])

df_categories_to_games.to_csv('Data/Clean_Data/categories_to_games.csv', index=False)
df_genres_to_games.to_csv('Data/Clean_Data/genres_to_games.csv', index=False)

In [57]:
# Creating multiple tables from the games dataset.
# The full frame is written first, then one CSV per topic; every topic file
# keeps the AppID column.
df_games.to_csv('Data/Clean_Data/games.csv', index=False)

games_exports = [
    # Name infos:
    ('Data/Clean_Data/game_names.csv',
     ['AppID', 'Name']),
    # Release infos:
    ('Data/Clean_Data/game_release.csv',
     ['AppID', 'Release date', 'Required age', 'Price', 'DLCcount',
      'Supported languages', 'Windows', 'Mac', 'Linux', 'Achievements']),
    # Description infos:
    ('Data/Clean_Data/game_descriptions.csv',
     ['AppID', 'About the game']),
    # Stats infos:
    ('Data/Clean_Data/game_stats.csv',
     ['AppID', 'Estimated owners', 'Metacritic score', 'User score',
      'Positive', 'Negative', 'Average playtime forever']),
    # Developers infos:
    ('Data/Clean_Data/game_developers.csv',
     ['AppID', 'Developers']),
]

for export_path, export_columns in games_exports:
    df_games[export_columns].to_csv(export_path, index=False)

In [58]:
# Flag each studio as indie when its Developer name is also listed in df_indies.
df_studios['Indie'] = df_studios['Developer'].isin(df_indies['Developer'])

In [59]:
df_studios.to_csv('Data/Clean_Data/studios.csv', index=False)

# Creating multiple tables from the studios dataset

# Notes infos:
df_studios[['Developer','Notable games, series or franchises','Notes']].to_csv('Data/Clean_Data/studio_notes.csv', index=False)

# Global infos:
df_studios[['Developer','City','Administrative division','Country','Est.','Indie']].to_csv('Data/Clean_Data/studio_global.csv', index=False)

# The preview goes last: a notebook cell only renders its final expression, so
# a head() call in the middle of the cell displays nothing.
df_studios.head(2)


In [60]:
# Number of rows in df_cities whose (city, country) pair is shared with at
# least one other row.
city_country_dup_mask = df_cities.duplicated(subset=['city', 'country'], keep=False)
print(city_country_dup_mask.sum())

3694


In [61]:
# Studios whose City value occurs on several rows while their (City, Country)
# pair occurs on this row only, i.e. a city name that also shows up under
# another country.
city_repeated = df_studios.duplicated(subset=['City'], keep=False)
city_country_repeated = df_studios.duplicated(subset=['City', 'Country'], keep=False)
df_studios[city_repeated & ~city_country_repeated].head(20)

# Only a handful of rows match (rows with a missing City can match as well,
# because duplicated() treats missing values as equal to each other), so a
# same-named city in another country is indeed a different city.

# We will assume that if the city and the country are the same, it is the same city


Unnamed: 0,Developer,City,Administrative division,Country,Est.,"Notable games, series or franchises",Notes,Indie
158,Digital Extremes,London,Ontario,Canada,1993,Warframe,"Subsidiary of Leyou; Founded by James Schmalz,...",False
169,The Dovetail Group,,,United States,c. 1984,,Early developer of music video games,False
654,Virtuos,,,Singapore,2004,Monster Jam: Path of DestructionGhost Recon Pr...,,False


In [62]:
# Studios whose City is exactly 'London', whatever the country (first 50 rows).
df_studios.loc[df_studios['City'].eq('London')].head(50)

Unnamed: 0,Developer,City,Administrative division,Country,Est.,"Notable games, series or franchises",Notes,Indie
74,The Bitmap Brothers,London,England,United Kingdom,1987,Xenon series,Dissolved.[4][5] Their portfolio was acquired ...,False
76,Bits Studios,London,England,United Kingdom,1991,Spider-Man: Return of the Sinister Six,Acquired by PlayWize in 2008,False
158,Digital Extremes,London,Ontario,Canada,1993,Warframe,"Subsidiary of Leyou; Founded by James Schmalz,...",False
203,Firefly Studios,London,England,United Kingdom,1999,Stronghold series,,True
258,Headstrong Games,London,England,United Kingdom,2000,Battalion Wars seriesArt Academy series,Subsidiary of Kuju Entertainment; defunct in 2017,False
289,Introversion Software,London,England,United Kingdom,2002,DarwiniaDEFCON,,True
330,Lift London,London,England,United Kingdom,2012,videogame developer for Microsoft Hololens,Subsidiary by Microsoft Studios in 2018,False
356,Mediatonic,London,England,United Kingdom,2005,Robot Unicorn AttackFable Fortune,,False
388,NaturalMotion,London,England,United Kingdom,2001,Backbreaker series,Acquired by Zynga,False
456,Playfish,London,England,United Kingdom,2007,The Sims Social,"Facebook, Myspace game developer Acquired by E...",False
