In [257]:
import pandas as pd
import re

In [258]:
df = pd.read_csv('votes_only_raw_data.csv')

In [259]:
df.head()

Unnamed: 0,Rawg_id,Game,Rawg_rating,Rawg_vote_count,Rating_Bdown,Genres,Released
0,3498,Grand Theft Auto V,4.48,2750,"[{'id': 5, 'title': 'exceptional', 'count': 16...","['Action', 'Shooter']",2013-09-17
1,4200,Portal 2,4.61,2401,"[{'id': 5, 'title': 'exceptional', 'count': 16...","['Shooter', 'Puzzle']",2011-04-19
2,3328,The Witcher 3: Wild Hunt,4.68,2543,"[{'id': 5, 'title': 'exceptional', 'count': 19...",['RPG'],2015-05-18
3,5679,The Elder Scrolls V: Skyrim,4.39,2006,"[{'id': 5, 'title': 'exceptional', 'count': 10...","['Action', 'RPG']",2011-11-11
4,12020,Left 4 Dead 2,4.08,1424,"[{'id': 4, 'title': 'recommended', 'count': 76...","['Action', 'Shooter']",2009-11-17


In [260]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11661 entries, 0 to 11660
Data columns (total 7 columns):
Rawg_id            11661 non-null int64
Game               11661 non-null object
Rawg_rating        11661 non-null float64
Rawg_vote_count    11661 non-null int64
Rating_Bdown       11661 non-null object
Genres             11661 non-null object
Released           11433 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 637.8+ KB


In [261]:
df.describe()

Unnamed: 0,Rawg_id,Rawg_rating,Rawg_vote_count
count,11661.0,11661.0,11661.0
mean,34172.80259,2.333858,50.85087
std,56050.987265,1.673688,142.050783
min,2.0,0.0,3.0
25%,10784.0,0.0,5.0
50%,19522.0,3.0,10.0
75%,37897.0,3.76,32.0
max,394092.0,5.0,2750.0


In [262]:
# check for curropted data
df.isna().any()

Rawg_id            False
Game               False
Rawg_rating        False
Rawg_vote_count    False
Rating_Bdown       False
Genres             False
Released            True
dtype: bool

### cleaning the dataset before adjusting the columns

In [263]:
# drop games that do not contain genres
df = df[df['Genres'] != '[]']

# check which game is missing release date and manually fill those with over 30 votes
def add_release_dates(df):
    '''
    This function fills specific observations with specific release dates.
    Input:
        df: Original raw data dataframe
    '''
    # star wars
    df.at[15931, 'Released'] = '2001-12-19'
    # god eater resurrection
    df.at[792, 'Released'] = '2015-10-29' 
    # deep rock galactic
    df.at[1218, 'Released'] = '2018-02-28'
    # aliens: Colonial Marines Collection
    df.at[1990, 'Released'] = '2013-02-12'
    # terminator Salvation
    df.at[4017, 'Released'] = '2009-05-01'
    # kenshi
    df.at[3150, 'Released'] = '2018-12-06'
    # Sam & Max 303: They Stole Max's Brain!
    df.at[807, 'Released'] = '2010-06-22'
    # Sam & Max 303: Beyond the Alley of the Dolls
    df.at[812, 'Released'] = '2010-07-20'
    # Sam & Max 303: The Penal Zone
    df.at[814, 'Released'] = '2010-04-02'
    # Sam & Max 303: The City That Dares Not Sleep
    df.at[811, 'Released'] = '2010-08-30'
    # Sam & Max 303:  The Tomb of Sammun-Mak
    df.at[813, 'Released'] = '2010-05-18'
    # Sword of the Stars: The Pit
    df.at[2277, 'Released'] = '2013-02-21'
    # SpellForce 2 Anniversary Edition
    df.at[2294, 'Released'] = '2017-04-12'
    # raft
    df.at[4083, 'Released'] = '2018-05-23'
    # Hector: Episode 2
    df.at[1133, 'Released'] = '2010-06-3'
    
    # most of the games with higher vote count that are missing release date are closed/suspended
    # to save time I'll fill the values with 'Suspended'
    df.fillna('Suspended', inplace=True)
    return df

df = add_release_dates(df)

print(f'{df.isna().any()}\n')
df.info()

Rawg_id            False
Game               False
Rawg_rating        False
Rawg_vote_count    False
Rating_Bdown       False
Genres             False
Released           False
dtype: bool

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11020 entries, 0 to 15931
Data columns (total 7 columns):
Rawg_id            11020 non-null object
Game               11020 non-null object
Rawg_rating        11020 non-null object
Rawg_vote_count    11020 non-null object
Rating_Bdown       11020 non-null object
Genres             11020 non-null object
Released           11020 non-null object
dtypes: object(7)
memory usage: 1008.8+ KB


In [None]:
cleaned_df = df.copy(deep=True)
cleaned_df.to_csv('cleaner_raw_')

In [130]:
def add_genre_columns(df, column):
    # create one big string with all the genres
    big_string = ' '.join(df[column])

    # replace all the non letters to space
    big_string = re.sub('\W+', ' ', big_string)

    # split on the white spaces
    list_strings = list(set(big_string.split(' ')))[1:-1]    
    
    #create dataframe with the column names
    column_df = pd.DataFrame(columns=list_strings)
    
    # join the original df with the new df
    df = df.join(column_df, how='left')
    
    # add function that uses the list strings to fill the values
    return df

df = add_genre_columns(df, 'Genres')

Unnamed: 0,Rawg_id,Game,Rawg_rating,Rawg_vote_count,Rating_Bdown,Genres,Released,Arcade,Casual,Games,...,Massively,Shooter,Educational,Adventure,Board,Family,Strategy,Card,Puzzle,Sports
0,3498,Grand Theft Auto V,4.48,2750,"[{'id': 5, 'title': 'exceptional', 'count': 16...","['Action', 'Shooter']",2013-09-17,,,,...,,,,,,,,,,
1,4200,Portal 2,4.61,2401,"[{'id': 5, 'title': 'exceptional', 'count': 16...","['Shooter', 'Puzzle']",2011-04-19,,,,...,,,,,,,,,,
2,3328,The Witcher 3: Wild Hunt,4.68,2543,"[{'id': 5, 'title': 'exceptional', 'count': 19...",['RPG'],2015-05-18,,,,...,,,,,,,,,,
3,5679,The Elder Scrolls V: Skyrim,4.39,2006,"[{'id': 5, 'title': 'exceptional', 'count': 10...","['Action', 'RPG']",2011-11-11,,,,...,,,,,,,,,,
4,12020,Left 4 Dead 2,4.08,1424,"[{'id': 4, 'title': 'recommended', 'count': 76...","['Action', 'Shooter']",2009-11-17,,,,...,,,,,,,,,,


In [66]:
def create_genre_list(series):
    genre_list = []
    for genres in series:
        genre_list.append(genres)
    return genre_list

a = create_genre_list(genres_series)
type(a[-1])

str

In [70]:
b = 


'['

In [59]:
b

"['Strategy', 'Simulation', 'Sports']"