In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
from datetime import datetime

from pathlib import Path
# Location of the input data, one level above the working directory.
dataPath = Path("..") / "data"
# Relative path named "image"; it is not referenced by the cells shown here.
imagePath = Path("image")

## Data Processing for diagrams

# Retrieve the clean game data

In [2]:
# Read the cleaned games table from the data directory into a DataFrame.
games_clean = pd.read_csv(dataPath.joinpath("games_data_clean.csv"))

In [3]:
# Work on an explicit copy so the conversions below never touch the frame
# object that was bound to `games_clean` before this cell ran.
games_clean = games_clean.copy()


def _parse_list_cell(value):
    """Turn the string form of a Python list into a real list.

    A value that is already a list (e.g. produced by a previous run of this
    cell) is returned unchanged, which keeps the cell re-runnable. Anything
    else is handed to ``ast.literal_eval``, which raises on malformed input.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


# Convert the string representation to an array of strings.
for list_col in ('Genres', 'Platforms', 'Developers'):
    games_clean[list_col] = games_clean[list_col].apply(_parse_list_cell)

# Convert the counters to int. Plain column assignment is used on purpose:
# with pandas >= 2.0, `df.loc[:, col] = values` writes into the existing
# column and keeps its dtype, so the conversion could silently be a no-op.
for count_col in ('Plays', 'Playing', 'Backlogs', 'Wishlist', 'Reviews'):
    games_clean[count_col] = games_clean[count_col].astype(int)

# Convert the rating to float (NaN stays NaN).
games_clean['Rating'] = games_clean['Rating'].astype(float)

# convert the release date to date format
def convert_to_date(date_str):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Parameters
    ----------
    date_str : str, date or missing value
        The raw release date. The placeholder ``'TBD'`` and missing values
        (``None`` / NaN) mean "no date". A value that is already a date is
        accepted so that re-running the conversion does not fail.

    Returns
    -------
    datetime.date or None
        The parsed date, the input itself when it is already a date, or
        ``None`` when there is no date.

    Raises
    ------
    ValueError
        If a string other than ``'TBD'`` does not match ``%Y-%m-%d``.
    """
    # Local import: only the `datetime` class is imported at file level.
    from datetime import date

    if isinstance(date_str, date):
        return date_str  # already converted (e.g. the cell was run twice)
    if date_str == 'TBD' or pd.isna(date_str):
        return None  # Return None for "TBD" and missing dates
    return datetime.strptime(date_str, '%Y-%m-%d').date()
    
# Replace every raw release-date value with the result of convert_to_date.
games_clean.loc[:, 'Release_Date'] = games_clean['Release_Date'].apply(convert_to_date)

# Data Processing Genres

In [29]:
import math
def aggregate_Game_Elem(group):
    """Convert every row of ``group`` into a plain dict describing one game.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to convert; it must provide every column read below.

    Returns
    -------
    list of dict
        One dict per row, in row order. A missing ``Summary`` becomes ``""``,
        a NaN ``Rating`` becomes ``-1.0`` and ``Release_Date`` is rendered as
        ``YYYY-MM-DD`` text through its ``strftime`` method.
    """
    games = []
    for _, row in group.iterrows():
        summary = row['Summary']
        if pd.isna(summary):
            summary = ""

        rating = row['Rating']
        if math.isnan(rating):
            rating = -1.0

        games.append({
            'Title': row['Title'],
            'Release_Date': row['Release_Date'].strftime("%Y-%m-%d"),
            'Developers': row['Developers'],
            'Summary': summary,
            'Platforms': row['Platforms'],
            'Genres': row['Genres'],
            'Rating': rating,
            'Plays': row['Plays'],
            'Playing': row['Playing'],
            'Backlogs': row['Backlogs'],
            'Wishlist': row['Wishlist'],
            'Reviews': row['Reviews'],
        })
    return games

# Extract the release year on a copy: a plain `games_withYear = games_clean`
# only aliases the frame, so adding the column would modify games_clean too.
games_withYear = games_clean.copy()
games_withYear['Realease_Year'] = pd.to_datetime(games_withYear['Release_Date']).dt.year
# Remove all years greater than 2023. Rows without a release date have a NaN
# year, fail the comparison and are therefore dropped as well.
games_withYear = games_withYear[games_withYear['Realease_Year'] <= 2023]
# Explode the list of genres: one row per (game, genre) pair.
genres_exploded = games_withYear.explode('Genres')
# Group by year and genre and turn each group into a list of game dicts.
# Built in one expression so the name is never bound to the intermediate
# GroupBy object.
genresYear_groupBy = (
    genres_exploded
    .groupby(['Realease_Year', 'Genres'])
    .apply(aggregate_Game_Elem)
)

  genresYear_groupBy = genresYear_groupBy.apply(aggregate_Game_Elem)


In [30]:
from collections import defaultdict

# Nested mapping filled below: year -> genre -> list of game dicts.
genresDict = defaultdict(lambda: defaultdict(list))

# Every index entry of the grouped result is a (year, genre) pair.
for group_key, listElem in genresYear_groupBy.items():
    year, genre = group_key
    genresDict[year][genre] = listElem

In [31]:
import json
# Write the year -> genre -> games mapping as indented JSON in the working directory.
with open("games_by_year_and_genre.json", mode='w') as json_file:
    json.dump(genresDict, json_file, indent=4)


# Data Processing Platforms

In [4]:
import json
# Load the platform/type definitions; later cells read its 'consoles' and
# 'genres' entries.
with open("platforms_and_types.json", mode='r') as json_file:
    platformsTypeDict = json.load(json_file)
            


In [9]:
#Map the platforms to the type
def map_platform_to_genre(name):
    """Return the type name of the platform called ``name``.

    The platform is searched by its ``"name"`` in
    ``platformsTypeDict['consoles']``; its ``"genres"`` value is then matched
    against the ``"id"`` of the entries in ``platformsTypeDict['genres']`` and
    the ``"name"`` of the matching entry is returned.

    Returns ``None`` when the platform is unknown or its type id matches no
    entry.
    """
    consoles = platformsTypeDict['consoles']
    categories = platformsTypeDict['genres']
    for console in consoles:
        if console["name"] != name:
            continue
        for category in categories:
            if console["genres"] == category["id"]:
                return category["name"]
    return None

def map_platforms_to_genre(platforms):
    """Map every platform name in ``platforms`` to its type name.

    Returns a list with one entry per input name, in the same order; each
    entry is whatever ``map_platform_to_genre`` returns for that name.
    """
    return list(map(map_platform_to_genre, platforms))

# Add the release year on a new frame; `assign` returns a copy, so games_clean
# is left as it is.
games_platforms = games_clean.assign(
    Realease_Year=pd.to_datetime(games_clean['Release_Date']).dt.year
)
# Keep only the rows whose release year is at most 2023.
games_platforms = games_platforms.loc[games_platforms['Realease_Year'] <= 2023]
# One row per (game, platform) pair.
games_platforms_explode = games_platforms.explode('Platforms')
# Attach the type name of each single platform.
games_platforms_explode['Platform_Type'] = games_platforms_explode['Platforms'].apply(map_platform_to_genre)
# GroupBy object keyed by (year, platform type, platform).
games_platforms_groupBy = games_platforms_explode.groupby(
    ['Realease_Year', 'Platform_Type', 'Platforms']
)



In [10]:
import math

def aggregate_Game_ElemPlatforms(group):
    """Convert every row of ``group`` into a plain dict describing one game.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to convert; it must provide every column read below, including
        ``Platform_Type``.

    Returns
    -------
    list of dict
        One dict per row, in row order. A missing ``Summary`` becomes ``""``,
        a NaN ``Rating`` becomes ``-1.0`` and ``Release_Date`` is rendered as
        ``YYYY-MM-DD`` text through its ``strftime`` method.
    """
    games = []
    for _, row in group.iterrows():
        summary = row['Summary']
        if pd.isna(summary):
            summary = ""

        rating = row['Rating']
        if math.isnan(rating):
            rating = -1.0

        games.append({
            'Title': row['Title'],
            'Release_Date': row['Release_Date'].strftime("%Y-%m-%d"),
            'Developers': row['Developers'],
            'Summary': summary,
            'Platforms': row['Platforms'],
            'Genres': row['Genres'],
            'Rating': rating,
            'Plays': row['Plays'],
            'Playing': row['Playing'],
            'Backlogs': row['Backlogs'],
            'Wishlist': row['Wishlist'],
            'Reviews': row['Reviews'],
            'Platform_Type': row['Platform_Type'],
        })
    return games

# Build the aggregated result straight from the exploded frame rather than
# from `games_platforms_groupBy` itself: the self-referential form
# `x = x.apply(...)` only works the first time, because after one run the
# name holds the aggregated Series and no longer the GroupBy object.
games_platforms_groupBy = (
    games_platforms_explode
    .groupby(['Realease_Year', 'Platform_Type', 'Platforms'])
    .apply(aggregate_Game_ElemPlatforms)
)


In [13]:
from collections import defaultdict
# Turn the grouped result into nested dicts
# (year -> platform type -> platform -> list of game dicts) and save it as JSON.
platformsDict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Every index entry of the grouped result is a (year, platform type, platform) triple.
for group_key, listElem in games_platforms_groupBy.items():
    year, platform_type, platform = group_key
    platformsDict[year][platform_type][platform] = listElem

with open("games_by_year_and_platforms.json", mode='w') as json_file:
    json.dump(platformsDict, json_file, indent=4)

# Data Processing Developers

In [32]:
import math

def aggregate_Game_ElemDevelopers(group):
    """Convert every row of ``group`` into a plain dict describing one game.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to convert; it must provide every column read below.

    Returns
    -------
    list of dict
        One dict per row, in row order. A missing ``Summary`` becomes ``""``,
        a NaN ``Rating`` becomes ``-1.0`` and ``Release_Date`` is rendered as
        ``YYYY-MM-DD`` text through its ``strftime`` method.
    """
    games = []
    for _, row in group.iterrows():
        summary = row['Summary']
        if pd.isna(summary):
            summary = ""

        rating = row['Rating']
        if math.isnan(rating):
            rating = -1.0

        games.append({
            'Title': row['Title'],
            'Release_Date': row['Release_Date'].strftime("%Y-%m-%d"),
            'Developers': row['Developers'],
            'Summary': summary,
            'Platforms': row['Platforms'],
            'Genres': row['Genres'],
            'Rating': rating,
            'Plays': row['Plays'],
            'Playing': row['Playing'],
            'Backlogs': row['Backlogs'],
            'Wishlist': row['Wishlist'],
            'Reviews': row['Reviews'],
        })
    return games

# Extract the release year on a copy: a plain `games_withYear = games_clean`
# only aliases the frame, so adding the column would modify games_clean too.
games_withYear = games_clean.copy()
games_withYear['Realease_Year'] = pd.to_datetime(games_withYear['Release_Date']).dt.year
# Remove all years greater than 2023. Rows without a release date have a NaN
# year, fail the comparison and are therefore dropped as well.
games_withYear = games_withYear[games_withYear['Realease_Year'] <= 2023]
# Explode the list of developers: one row per (game, developer) pair.
developers_exploded = games_withYear.explode('Developers')
# Group by year and developer and turn each group into a list of game dicts.
# Built in one expression so the name is never bound to the intermediate
# GroupBy object.
devYear_groupBy = (
    developers_exploded
    .groupby(['Realease_Year', 'Developers'])
    .apply(aggregate_Game_ElemDevelopers)
)

  devYear_groupBy = devYear_groupBy.apply(aggregate_Game_ElemDevelopers)


In [33]:
from collections import defaultdict

# Nested mapping filled below: year -> developer -> list of game dicts.
devDict = defaultdict(lambda: defaultdict(list))

# Every index entry of the grouped result is a (year, developer) pair.
for group_key, listElem in devYear_groupBy.items():
    year, dev = group_key
    devDict[year][dev] = listElem


In [34]:
import json
# Write the year -> developer -> games mapping as indented JSON in the working directory.
with open("games_by_year_and_developers.json", mode='w') as json_file:
    json.dump(devDict, json_file, indent=4)