In [1]:
import numpy as np

import pandas as pd



In [2]:
# Read each raw CSV export into its own DataFrame (same order as the file list below)
all_games, genre_info, score_info, release_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw data/all_games.csv',
        '../data/raw data/genre_info.csv',
        '../data/raw data/score_info.csv',
        '../data/raw data/release_info.csv',
    )
)

### Cleaning up all_games

In [3]:
# Clean up the title column

# Titles that contain an apostrophe are wrapped as ["..."], all other titles as ['...'].
# Only those wrappers are removed, so apostrophes inside a title are kept.
# regex=False is passed explicitly: '["' and "['" would be invalid patterns if they were
# interpreted as regular expressions, so the code must not depend on the pandas default.
all_games['title'] = (
    all_games['title']
    .str.replace('["', '', regex=False)
    .str.replace('"]', '', regex=False)
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
)

In [4]:
# Clean up the link column

# Remove the [' ... '] wrapper around each link.
# regex=False is passed explicitly: "['" would be an invalid pattern if it were
# interpreted as a regular expression, so the code must not depend on the pandas default.
all_games['link'] = (
    all_games['link']
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
)

In [5]:
# Write the cleaned all_games table to disk, without the index column
all_games.to_csv(
    '../data/clean data/all_games_clean.csv',
    index=False,
)

### Cleaning up genre_info 

In [6]:
# From 'genre_info', only keep the 'Genre', 'Perspective', 'Gameplay', 'Misc', and 'title'
# columns since they have sufficient entries to analyze.
# .copy() makes the subset an independent frame, so the column assignments that follow
# do not trigger pandas' SettingWithCopyWarning.
genre_info = genre_info[['Genre', 'Perspective', 'Gameplay', 'Misc', 'title']].copy()

# Clean up the 'Genre' column: drop every single quote and square bracket.
# regex=False keeps the patterns literal regardless of the pandas default.
genre_info['Genre'] = (
    genre_info['Genre']
    .str.replace("'", '', regex=False)
    .str.replace("[", '', regex=False)
    .str.replace("]", '', regex=False)
)



In [7]:
# Clean up the 'Perspective' column
# Remove the [' and '] wrappers first, then any single quotes that remain.
# regex=False is passed explicitly: "['" would be an invalid pattern if it were
# interpreted as a regular expression, so the code must not depend on the pandas default.
genre_info['Perspective'] = (
    genre_info['Perspective']
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
    .str.replace("'", '', regex=False)
)


In [8]:
# Clean up the 'Gameplay' column
# Remove the [' and '] wrappers first, then any single quotes that remain.
# regex=False is passed explicitly: "['" would be an invalid pattern if it were
# interpreted as a regular expression, so the code must not depend on the pandas default.
genre_info['Gameplay'] = (
    genre_info['Gameplay']
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
    .str.replace("'", '', regex=False)
)

In [9]:
# Clean up the 'Misc' column
# Remove the [' and '] wrappers.
# regex=False is passed explicitly: "['" would be an invalid pattern if it were
# interpreted as a regular expression, so the code must not depend on the pandas default.
# NOTE(review): unlike 'Perspective' and 'Gameplay', single quotes that remain after the
# wrappers are removed are left in place here - confirm this is intended.
genre_info['Misc'] = (
    genre_info['Misc']
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
)

In [10]:
# Clean up the 'title' column
# Remove the ["..."] / ['...'] wrappers while keeping apostrophes inside the titles.
# regex=False is passed explicitly: '["' and "['" would be invalid patterns if they were
# interpreted as regular expressions, so the code must not depend on the pandas default.
genre_info['title'] = (
    genre_info['title']
    .str.replace('["', '', regex=False)
    .str.replace('"]', '', regex=False)
    .str.replace("['", '', regex=False)
    .str.replace("']", '', regex=False)
)


In [11]:
# Write the cleaned genre_info table to disk, without the index column
genre_info.to_csv(
    '../data/clean data/genre_info_clean.csv',
    index=False,
)

### Cleaning up score_info 

In [12]:
# Clean up the 'moby_score' column
# Trim surrounding whitespace, then parse the values as numbers.
# Some values in this column are not numbers; errors='coerce' replaces them with NaN.
score_info['moby_score'] = pd.to_numeric(
    score_info['moby_score'].str.strip(),
    errors='coerce',
)

In [13]:
# Clean up the 'critics_score' column
# Trim surrounding whitespace and drop the percent sign
critics_trimmed = score_info['critics_score'].str.strip()
score_info['critics_score'] = critics_trimmed.str.replace('%', '')


def _blank_player_counts(entry):
    """Return NaN for text entries containing 'players'; return anything else unchanged."""
    if isinstance(entry, str) and 'players' in entry:
        return np.nan
    return entry


# Some entries refer to the number of players who collected the game, which is not wanted.
# Turn every entry that contains the word 'players' into a NaN value.
score_info['critics_score'] = score_info['critics_score'].apply(_blank_player_counts)

In [14]:
# Clean up the 'player_score' column: drop the 'stars' unit text from each entry
player_score_text = score_info['player_score']
score_info['player_score'] = player_score_text.str.replace('stars', '')


In [15]:
# Clean up the 'xbox_score' column: drop the '#' character from each entry
xbox_score_text = score_info['xbox_score']
score_info['xbox_score'] = xbox_score_text.str.replace('#', '')

In [16]:
# Create an average score column to get a general idea of the performance of each game

# Turn each score column used in the average into a numeric type (unparseable entries become NaN)
for score_col in ('critics_score', 'player_score', 'moby_score'):
    score_info[score_col] = pd.to_numeric(score_info[score_col], errors='coerce')

# Player score is ranked out of 5, so multiply it by 20 to get a score out of 100
score_info['player_score'] = score_info['player_score'].mul(20)

# Moby score is out of 10, so multiply it by 10 to get a score out of 100
score_info['moby_score'] = score_info['moby_score'].mul(10)

# NOTE: both rescaling steps overwrite their own column, so running this cell a second time
# without reloading the data scales the values again.

# Create a new column that is the row-wise average of the three scores, rounded to 2 decimals
score_info['average_score'] = (
    score_info[['critics_score', 'player_score', 'moby_score']]
    .mean(axis=1)
    .round(2)
)


In [17]:
# Write the cleaned score_info table to disk, without the index column
score_info.to_csv(
    '../data/clean data/score_info_clean.csv',
    index=False,
)

### Cleaning up release_info 

In [18]:
# Remove the word 'people' from the 'credits' column
credits_without_unit = release_info['credits'].str.replace('people', '')

# Turn entries that are exactly 'Contribute' in the 'credits' column into NaN values
release_info['credits'] = credits_without_unit.replace('Contribute', np.nan)

In [19]:
# Convert the 'xbox_release_date' column to a nullable integer type.
# Casting to float first parses whole-number values (e.g. 2001.0, or the same value stored as
# text) and keeps missing entries as NaN; 'Int64' then stores integers with <NA> for missing rows.
# No text replacement of '.0' is involved, so a '.0' inside a value can never be stripped by
# mistake, and a value that is not a whole number raises instead of being silently altered.
release_info['xbox_release_date'] = (
    release_info['xbox_release_date']
    .astype(float)
    .astype('Int64')
)


In [20]:
# Clean the 'publishers' column: drop every opening and closing square bracket
publishers_text = release_info['publishers']
publishers_text = publishers_text.str.replace("[", '')
release_info['publishers'] = publishers_text.str.replace("]", '')

In [21]:
# Clean the 'developers' column: drop every opening and closing square bracket
developers_text = release_info['developers']
developers_text = developers_text.str.replace("[", '')
release_info['developers'] = developers_text.str.replace("]", '')

In [22]:
# Remove the word 'person' from the credits column.
# Only the word itself is stripped (the same treatment 'people' gets), so an entry of the
# form '<count> person' keeps its count instead of being turned into NaN.
release_info['credits'] = release_info['credits'].str.replace('person', '', regex=False)

# Remove any ','s that might be in the credits column
release_info['credits'] = release_info['credits'].str.replace(',', '', regex=False)

# Entries that still contain the letter 'k' are set to NaN because they cannot be parsed as
# a plain number (presumably abbreviated counts - TODO confirm against the raw data).
release_info['credits'] = release_info['credits'].apply(
    lambda x: np.nan if isinstance(x, str) and 'k' in x else x
)

In [23]:
# Turn the credits column into integers while keeping missing values.
# Assigning dropna().astype(int) back to the column re-aligns on the index, which re-inserts
# the dropped rows as NaN and leaves the column as float. The nullable 'Int64' dtype (also
# used for 'xbox_release_date') stores integers and <NA> side by side instead.
release_info['credits'] = release_info['credits'].astype(float).astype('Int64')

In [24]:
# Write the cleaned release_info table to disk, without the index column
release_info.to_csv(
    '../data/clean data/release_info_clean.csv',
    index=False,
)