There are a number of columns in the Steam data set which will need to be clearned up before they can be fed to the algorithms.

In particular, number strings need to be converted to int or float format, yes/no data needs to be made numerical(0 or 1), catagories need to be made numerical, and unusual items need to be handled somehow.

Later, some games may be rejected for having insufficient data

In [14]:
import numpy as np
import pandas as pd
import copy

from dateutil.parser import *

# various options in pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

In [15]:
df = pd.read_csv('SteamStats.csv')
df

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,Game,Valve,Valve,True,True,True,True,2000-11-01 00:00:00,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],,88.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13%,361920,2.69%,6.9 hours,185.9 hours,65,0,24.0
1,1,1002,Rag Doll Kung Fu,Game,Mark Healey,Mark Healey,,,,False,2005-10-12 00:00:00,,,,,9.99,2.49,75.0,['Indie'],,69.0,,,,[],0,Yes,"['Indie', 'Fighting']",39347,5282,11878,30.19%,0,0%,23 minutes,1.1 hours,4,1,2.0
2,2,10090,Call of Duty: World at War,Game,Treyarch,Activision,,,,False,2008-11-18 00:00:00,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],,83.0,Yes,,,[],0,Yes,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07%,106163,6.34%,12.5 hours,42.1 hours,41,0,7.0
3,3,10130,TimeShift,Game,Saber Interactive,Activision,,,,False,2007-10-30 00:00:00,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],,71.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41%,1172,0.87%,1.1 hours,3.1 hours,6,0,1.0
4,4,10180,Call of Duty: Modern Warfare 2,Game,Infinity Ward,Activision,,,,False,2009-11-12 00:00:00,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],,86.0,Yes,,,"['english', 'french', 'german', 'italian', 'ja...",8,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33%,70157,1.34%,9.4 hours,23.5 hours,37,0,18.0
5,5,10220,Postal 3,Game,Trashmasters,Akella,,,,False,2011-12-21 00:00:00,,,,,11.99,4.79,60.0,['Action'],,24.0,Yes,,,"['english', 'russian']",2,Yes,"['Action', 'Dark Humor', 'Gore', 'Open World',...",78302,8137,69012,88.14%,2491,3.18%,2.1 hours,7.1 hours,3,0,4.0
6,6,102400,Vertex Dispenser,Game,Michael Brough,Michael Brough,True,True,False,True,2011-06-10 00:00:00,,,,,9.99,4.99,50.0,"['Action', 'Indie', 'Strategy']",,70.0,Yes,,,[],0,Yes,"['Strategy', 'Action', 'Indie']",33037,4840,12992,39.33%,0,0%,22 minutes,31 minutes,5,1,2.0
7,7,10250,PT Boats: Knights of the Sea,Game,Studio4,Akella,,,,False,2009-01-30 00:00:00,,,,,6.99,2.49,75.0,['Simulation'],,,,,,[],0,Yes,['Simulation'],11507,2856,9837,85.49%,0,0%,1.4 hours,2.7 hours,4,0,3.0
8,8,102500,Kingdoms of Amalur: Reckoning™,Game,Big Huge Games,38 Studios,,,,False,2012-02-07 00:00:00,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",partial,81.0,Yes,,,"['english', 'french', 'german', 'italian', 'sp...",5,Yes,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",891851,27437,746970,83.76%,19465,2.18%,9.2 hours,26.6 hours,6,4,13.0
9,9,10260,PT Boats: South Gambit,Game,studio4,Akella,,,,False,2010-12-10 00:00:00,,,,,6.99,2.49,75.0,['Simulation'],,,,,,[],0,Yes,['Simulation'],11322,2833,6867,60.65%,0,0%,34 minutes,2.0 hours,4,0,3.0


strip commas, % out of numerical columns, and ' hours' from playtime columns

In [16]:
for column in ['median_total_playtime','average_total_playtime']:
    df[column] = df[column].map(lambda x: str(x).strip(" hours"))
#note: playtime was in hours!
df.head()

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,Game,Valve,Valve,True,True,True,True,2000-11-01 00:00:00,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],,88.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13%,361920,2.69%,6.9,185.9,65,0,24.0
1,1,1002,Rag Doll Kung Fu,Game,Mark Healey,Mark Healey,,,,False,2005-10-12 00:00:00,,,,,9.99,2.49,75.0,['Indie'],,69.0,,,,[],0,Yes,"['Indie', 'Fighting']",39347,5282,11878,30.19%,0,0%,23 minute,1.1,4,1,2.0
2,2,10090,Call of Duty: World at War,Game,Treyarch,Activision,,,,False,2008-11-18 00:00:00,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],,83.0,Yes,,,[],0,Yes,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07%,106163,6.34%,12.5,42.1,41,0,7.0
3,3,10130,TimeShift,Game,Saber Interactive,Activision,,,,False,2007-10-30 00:00:00,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],,71.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41%,1172,0.87%,1.1,3.1,6,0,1.0
4,4,10180,Call of Duty: Modern Warfare 2,Game,Infinity Ward,Activision,,,,False,2009-11-12 00:00:00,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],,86.0,Yes,,,"['english', 'french', 'german', 'italian', 'ja...",8,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33%,70157,1.34%,9.4,23.5,37,0,18.0


In [17]:
for column in ['owners', 'owners_unc','players_total','owners_played_percent',\
               'players_2_weeks','players_2_weeks_percent','median_total_playtime',\
               'average_total_playtime','Packages','DLCs','Depots']:
    
    df[column] = df[column].map(lambda x: str(x).strip("%"))
    df[column] = df[column].map(lambda x: str(x).replace(',', ''))

df.head(15)

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,Game,Valve,Valve,True,True,True,True,2000-11-01 00:00:00,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],,88.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0
1,1,1002,Rag Doll Kung Fu,Game,Mark Healey,Mark Healey,,,,False,2005-10-12 00:00:00,,,,,9.99,2.49,75.0,['Indie'],,69.0,,,,[],0,Yes,"['Indie', 'Fighting']",39347,5282,11878,30.19,0,0.0,23 minute,1.1,4,1,2.0
2,2,10090,Call of Duty: World at War,Game,Treyarch,Activision,,,,False,2008-11-18 00:00:00,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],,83.0,Yes,,,[],0,Yes,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07,106163,6.34,12.5,42.1,41,0,7.0
3,3,10130,TimeShift,Game,Saber Interactive,Activision,,,,False,2007-10-30 00:00:00,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],,71.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41,1172,0.87,1.1,3.1,6,0,1.0
4,4,10180,Call of Duty: Modern Warfare 2,Game,Infinity Ward,Activision,,,,False,2009-11-12 00:00:00,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],,86.0,Yes,,,"['english', 'french', 'german', 'italian', 'ja...",8,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33,70157,1.34,9.4,23.5,37,0,18.0
5,5,10220,Postal 3,Game,Trashmasters,Akella,,,,False,2011-12-21 00:00:00,,,,,11.99,4.79,60.0,['Action'],,24.0,Yes,,,"['english', 'russian']",2,Yes,"['Action', 'Dark Humor', 'Gore', 'Open World',...",78302,8137,69012,88.14,2491,3.18,2.1,7.1,3,0,4.0
6,6,102400,Vertex Dispenser,Game,Michael Brough,Michael Brough,True,True,False,True,2011-06-10 00:00:00,,,,,9.99,4.99,50.0,"['Action', 'Indie', 'Strategy']",,70.0,Yes,,,[],0,Yes,"['Strategy', 'Action', 'Indie']",33037,4840,12992,39.33,0,0.0,22 minute,31 minute,5,1,2.0
7,7,10250,PT Boats: Knights of the Sea,Game,Studio4,Akella,,,,False,2009-01-30 00:00:00,,,,,6.99,2.49,75.0,['Simulation'],,,,,,[],0,Yes,['Simulation'],11507,2856,9837,85.49,0,0.0,1.4,2.7,4,0,3.0
8,8,102500,Kingdoms of Amalur: Reckoning™,Game,Big Huge Games,38 Studios,,,,False,2012-02-07 00:00:00,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",partial,81.0,Yes,,,"['english', 'french', 'german', 'italian', 'sp...",5,Yes,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",891851,27437,746970,83.76,19465,2.18,9.2,26.6,6,4,13.0
9,9,10260,PT Boats: South Gambit,Game,studio4,Akella,,,,False,2010-12-10 00:00:00,,,,,6.99,2.49,75.0,['Simulation'],,,,,,[],0,Yes,['Simulation'],11322,2833,6867,60.65,0,0.0,34 minute,2.0,4,0,3.0


### convert Release_Date column to a datetime object

In [18]:

def toDT(d):
    #print(d)
    if d == np.nan or d == 'nan': return np.nan
    try:
        date = parse(d)
        return date
    except (ValueError,TypeError):
        print("unparsable data:",d)
        return np.nan

df['Release_Date'] = df['Release_Date'].map(toDT)
df.head()
#note, can't re-run this without the code above, as it changes data in place

unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: nan
unparsable data: Coming Soon
unparsable data: TBA
unparsable data: Fall 2017
unparsable data: To Be Announced.
unparsable data: To Be Announced
unparsable data: To Be Announced
unparsable data: nan
unparsable data: Q2 2017
unparsable data: TBD
unparsable data: Coming soon
unparsable data: Q4 2017
unparsable data: Fall 2017
unparsable data: Coming soon
unparsable data: TBA
unparsable data: Coming Soon
unparsable data: ~2017
unparsable data: Coming Soon
unparsable data: nan
unparsable data: Late 2017
unparsable data: Q3 2017
unparsable data: Q1 2018
unparsable data: Fall 2017 - Early Access
unparsable data: COMING SOON
unparsable data: TBA
unparsable data: nan
unparsable data: Coming Soon!
unparsable data: Spring 2018
unparsable data: Coming Soon
unparsable data: TBA
unparsable dat

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,Game,Valve,Valve,True,True,True,True,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],,88.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0
1,1,1002,Rag Doll Kung Fu,Game,Mark Healey,Mark Healey,,,,False,2005-10-12,,,,,9.99,2.49,75.0,['Indie'],,69.0,,,,[],0,Yes,"['Indie', 'Fighting']",39347,5282,11878,30.19,0,0.0,23 minute,1.1,4,1,2.0
2,2,10090,Call of Duty: World at War,Game,Treyarch,Activision,,,,False,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],,83.0,Yes,,,[],0,Yes,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07,106163,6.34,12.5,42.1,41,0,7.0
3,3,10130,TimeShift,Game,Saber Interactive,Activision,,,,False,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],,71.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41,1172,0.87,1.1,3.1,6,0,1.0
4,4,10180,Call of Duty: Modern Warfare 2,Game,Infinity Ward,Activision,,,,False,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],,86.0,Yes,,,"['english', 'french', 'german', 'italian', 'ja...",8,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33,70157,1.34,9.4,23.5,37,0,18.0


In [19]:
# EXPERIMENT: DROPPING games with significantly less information
df_NaN_dropped = df.dropna(axis=0,thresh = 36)

In [20]:
df_NaN_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 6995
Data columns (total 39 columns):
Unnamed: 0                 1688 non-null int64
ID_num                     1688 non-null int64
Name                       1688 non-null object
applicationCategory        1688 non-null object
Developer                  1684 non-null object
Publisher                  1683 non-null object
OS_windows                 1687 non-null object
OS_mac                     1687 non-null object
OS_linux                   1687 non-null object
SteamPlay                  1688 non-null bool
Release_Date               1685 non-null datetime64[ns]
worstRating                1688 non-null float64
bestRating                 1688 non-null float64
ratingValue                1688 non-null float64
reviewCount                1688 non-null float64
Price                      1676 non-null object
Lowest_Price               1688 non-null float64
Max_Sale                   1657 non-null float64
genres             

In [21]:
df_NaN_dropped.head(20)

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,Game,Valve,Valve,True,True,True,True,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],,88.0,Yes,,,[],0,Yes,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0
14,14,102840,Shank 2,Game,Klei Entertainment,Klei Entertainment,True,True,False,True,2012-02-07,0.0,100.0,81.91,1553.0,9.99,1.49,85.0,"['Action', 'Adventure', 'Indie']",full,72.0,Yes,,,[],0,Yes,"['Action', ""Beat 'em up"", 'Indie', 'Adventure'...",602973,22567,284676,47.21,3981,0.66,1.5,3.7,8,1,3.0
18,18,104900,ORION: Prelude,Game,"Trek Industries, Inc","Trek Industries, Inc",True,False,False,False,2013-04-16,0.0,100.0,74.23,29186.0,0.99,0.49,51.0,"['Action', 'Adventure', 'Indie', 'RPG']",full,,Yes,Yes,,[],0,Yes,"['Dinosaurs', 'Action', 'FPS', 'Multiplayer', ...",2374567,40918,1787700,75.29,15405,0.65,1.6,3.9,16,2,7.0
19,19,10500,Empire: Total War,Game,The Creative Assembly,SEGA,True,True,True,True,2009-03-03,0.0,100.0,90.97,13447.0,19.99,3.74,90.0,['Strategy'],,90.0,Yes,,,"['czech', 'english', 'french', 'german', 'ital...",8,Yes,"['Strategy', 'Historical', 'Military', 'Grand ...",3614004,50404,3145178,87.03,107091,2.96,22.9,106.4,111,12,34.0
24,24,105600,Terraria,Game,Re-Logic,Re-Logic,True,True,True,True,2011-05-16,0.0,100.0,96.96,187414.0,9.99,1.99,80.0,"['Action', 'Adventure', 'Indie', 'RPG']",full,83.0,Yes,,,[],0,Yes,"['Sandbox', 'Adventure', 'Survival', '2D', 'Mu...",8895254,78567,8477097,95.3,776179,8.73,23.4,79.7,10,1,4.0
29,29,107410,Arma 3,Game,Bohemia Interactive,Bohemia Interactive,True,False,False,False,2013-09-12,0.0,100.0,89.97,86745.0,39.99,13.59,66.0,"['Action', 'Simulation', 'Strategy']",partial,74.0,Yes,Yes,released,"['czech', 'english', 'french', 'german', 'ital...",11,Yes,"['Simulation', 'Military', 'Multiplayer', 'Tac...",3343399,48496,3246331,97.1,515783,15.43,33.3,182.8,47,18,37.0
32,32,108600,Project Zomboid,Game,The Indie Stone,The Indie Stone,True,True,True,True,2013-11-08,0.0,100.0,87.23,15028.0,14.99,8.99,40.0,"['Indie', 'RPG', 'Simulation', 'Early Access']",partial,,Yes,Yes,released,[],0,Yes,"['Survival', 'Zombies', 'Open World', 'Sandbox...",802720,23836,767827,95.65,43059,5.36,6.2,21.0,6,1,8.0
42,42,111600,Serious Sam Double D XXL,Game,Mommy's Best Games,Devolver Digital,True,False,False,False,2011-08-30,0.0,100.0,82.02,527.0,9.99,0.99,90.0,"['Action', 'Indie']",partial,,Yes,,,['english'],1,Yes,"['Action', 'Indie', 'Platformer', 'Comedy', 'S...",540375,21365,115684,21.41,2313,0.43,24 minute,1.5,14,1,4.0
49,49,113020,Monaco,Game,Pocketwatch Games,Pocketwatch Games,True,True,True,True,2013-04-24,0.0,100.0,90.44,6483.0,5.99 at -60%,1.34,91.0,"['Action', 'Adventure', 'Casual', 'Indie', 'St...",full,83.0,Yes,Yes,released,['english'],1,Yes,"['Co-op', 'Stealth', 'Indie', 'Heist', 'Local ...",1337805,30751,976256,72.97,12992,0.97,1.7,4.0,11,3,6.0
50,50,113200,The Binding of Isaac,Game,Edmund McMillen and Florian Himsl,Edmund McMillen,True,True,False,True,2011-09-28,0.0,100.0,95.87,42185.0,4.99,0.49,90.0,"['Action', 'Adventure', 'Indie', 'RPG']",,84.0,Yes,,,[],0,Yes,"['Rogue-like', 'Indie', 'Replay Value', 'Diffi...",3003583,50222,2703203,90.0,55077,1.83,6.3,31.2,11,4,7.0



## Convert simply multi-entry catagoricals to numerical form (e.g NaN=0,True=1,False=2)

In [22]:
#n=copy.deepcopy(df)
df.OS_windows.unique()
df.SteamPlay.unique()

def col_to_codes(dataframe,column_list):
    for column in column_list:
        print(column)
        print(dataframe[column].unique())
        dataframe[column] = pd.Categorical(dataframe[column]).codes
        print(dataframe[column].unique())
    return

In [23]:
to_codes = ['applicationCategory','OS_windows','OS_mac','OS_linux','SteamPlay','community_visible_stats',
               'workshop_visible','community_hub_visible','controller_support',
           'releasestate']

col_to_codes(df,to_codes)

applicationCategory
['Game']
[0]
OS_windows
[True nan False]
[ 1 -1  0]
OS_mac
[True nan False]
[ 1 -1  0]
OS_linux
[True nan False]
[ 1 -1  0]
SteamPlay
[ True False]
[1 0]
community_visible_stats
['Yes' nan]
[ 0 -1]
workshop_visible
[nan 'Yes']
[-1  0]
community_hub_visible
['Yes' nan]
[ 0 -1]
controller_support
[nan 'partial' 'full' 'none']
[-1  2  0  1]
releasestate
[nan 'released' 'prerelease' 'preloadonly']
[-1  2  1  0]


In [24]:
df.head(12)

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots
0,0,10,Counter-Strike,0,Valve,Valve,1,1,1,1,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],-1,88.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0
1,1,1002,Rag Doll Kung Fu,0,Mark Healey,Mark Healey,-1,-1,-1,0,2005-10-12,,,,,9.99,2.49,75.0,['Indie'],-1,69.0,-1,-1,-1,[],0,0,"['Indie', 'Fighting']",39347,5282,11878,30.19,0,0.0,23 minute,1.1,4,1,2.0
2,2,10090,Call of Duty: World at War,0,Treyarch,Activision,-1,-1,-1,0,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],-1,83.0,0,-1,-1,[],0,0,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07,106163,6.34,12.5,42.1,41,0,7.0
3,3,10130,TimeShift,0,Saber Interactive,Activision,-1,-1,-1,0,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],-1,71.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41,1172,0.87,1.1,3.1,6,0,1.0
4,4,10180,Call of Duty: Modern Warfare 2,0,Infinity Ward,Activision,-1,-1,-1,0,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],-1,86.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'ja...",8,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33,70157,1.34,9.4,23.5,37,0,18.0
5,5,10220,Postal 3,0,Trashmasters,Akella,-1,-1,-1,0,2011-12-21,,,,,11.99,4.79,60.0,['Action'],-1,24.0,0,-1,-1,"['english', 'russian']",2,0,"['Action', 'Dark Humor', 'Gore', 'Open World',...",78302,8137,69012,88.14,2491,3.18,2.1,7.1,3,0,4.0
6,6,102400,Vertex Dispenser,0,Michael Brough,Michael Brough,1,1,0,1,2011-06-10,,,,,9.99,4.99,50.0,"['Action', 'Indie', 'Strategy']",-1,70.0,0,-1,-1,[],0,0,"['Strategy', 'Action', 'Indie']",33037,4840,12992,39.33,0,0.0,22 minute,31 minute,5,1,2.0
7,7,10250,PT Boats: Knights of the Sea,0,Studio4,Akella,-1,-1,-1,0,2009-01-30,,,,,6.99,2.49,75.0,['Simulation'],-1,,-1,-1,-1,[],0,0,['Simulation'],11507,2856,9837,85.49,0,0.0,1.4,2.7,4,0,3.0
8,8,102500,Kingdoms of Amalur: Reckoning™,0,Big Huge Games,38 Studios,-1,-1,-1,0,2012-02-07,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",2,81.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'sp...",5,0,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",891851,27437,746970,83.76,19465,2.18,9.2,26.6,6,4,13.0
9,9,10260,PT Boats: South Gambit,0,studio4,Akella,-1,-1,-1,0,2010-12-10,,,,,6.99,2.49,75.0,['Simulation'],-1,,-1,-1,-1,[],0,0,['Simulation'],11322,2833,6867,60.65,0,0.0,34 minute,2.0,4,0,3.0


## Split Store Tags into a series of columns, and convert to numerical

In [25]:
#n = copy.deepcopy(df)
df.genres
longlist = []
for l in df.genres:
    l = l.strip('[]')
    l = l.split(',')
    for item in l:
        item = item.strip('\' ')
        longlist.append(item)
longlist = pd.Series(longlist)
longlist.unique()

array(['Action', 'Indie', 'Strategy', 'Simulation', 'RPG', '', 'Adventure',
       'Casual', 'Early Access', 'Racing', 'Sports', 'Free to Play',
       'Massively Multiplayer'], dtype=object)

In [26]:
u = longlist.unique()
'Indie' in "'Action', 'RPG', 'Indie', 'Strategy', 'Adventure', 'Simulation'"

True

In [27]:
for genre in longlist.unique():
    df["genre_"+genre] = df["genres"].map(lambda x: genre in x)
df.drop('genre_',axis=1)
df.head(20)

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots,genre_Action,genre_Indie,genre_Strategy,genre_Simulation,genre_RPG,genre_Adventure,genre_Casual,genre_Early Access,genre_Racing,genre_Sports,genre_Free to Play,genre_Massively Multiplayer
0,0,10,Counter-Strike,0,Valve,Valve,1,1,1,1,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],-1,88.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0,True,False,False,False,False,False,False,False,False,False,False,False
1,1,1002,Rag Doll Kung Fu,0,Mark Healey,Mark Healey,-1,-1,-1,0,2005-10-12,,,,,9.99,2.49,75.0,['Indie'],-1,69.0,-1,-1,-1,[],0,0,"['Indie', 'Fighting']",39347,5282,11878,30.19,0,0.0,23 minute,1.1,4,1,2.0,False,True,False,False,False,False,False,False,False,False,False,False
2,2,10090,Call of Duty: World at War,0,Treyarch,Activision,-1,-1,-1,0,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],-1,83.0,0,-1,-1,[],0,0,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07,106163,6.34,12.5,42.1,41,0,7.0,True,False,False,False,False,False,False,False,False,False,False,False
3,3,10130,TimeShift,0,Saber Interactive,Activision,-1,-1,-1,0,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],-1,71.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41,1172,0.87,1.1,3.1,6,0,1.0,True,False,False,False,False,False,False,False,False,False,False,False
4,4,10180,Call of Duty: Modern Warfare 2,0,Infinity Ward,Activision,-1,-1,-1,0,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],-1,86.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'ja...",8,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33,70157,1.34,9.4,23.5,37,0,18.0,True,False,False,False,False,False,False,False,False,False,False,False
5,5,10220,Postal 3,0,Trashmasters,Akella,-1,-1,-1,0,2011-12-21,,,,,11.99,4.79,60.0,['Action'],-1,24.0,0,-1,-1,"['english', 'russian']",2,0,"['Action', 'Dark Humor', 'Gore', 'Open World',...",78302,8137,69012,88.14,2491,3.18,2.1,7.1,3,0,4.0,True,False,False,False,False,False,False,False,False,False,False,False
6,6,102400,Vertex Dispenser,0,Michael Brough,Michael Brough,1,1,0,1,2011-06-10,,,,,9.99,4.99,50.0,"['Action', 'Indie', 'Strategy']",-1,70.0,0,-1,-1,[],0,0,"['Strategy', 'Action', 'Indie']",33037,4840,12992,39.33,0,0.0,22 minute,31 minute,5,1,2.0,True,True,True,False,False,False,False,False,False,False,False,False
7,7,10250,PT Boats: Knights of the Sea,0,Studio4,Akella,-1,-1,-1,0,2009-01-30,,,,,6.99,2.49,75.0,['Simulation'],-1,,-1,-1,-1,[],0,0,['Simulation'],11507,2856,9837,85.49,0,0.0,1.4,2.7,4,0,3.0,False,False,False,True,False,False,False,False,False,False,False,False
8,8,102500,Kingdoms of Amalur: Reckoning™,0,Big Huge Games,38 Studios,-1,-1,-1,0,2012-02-07,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",2,81.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'sp...",5,0,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",891851,27437,746970,83.76,19465,2.18,9.2,26.6,6,4,13.0,True,False,False,False,True,False,False,False,False,False,False,False
9,9,10260,PT Boats: South Gambit,0,studio4,Akella,-1,-1,-1,0,2010-12-10,,,,,6.99,2.49,75.0,['Simulation'],-1,,-1,-1,-1,[],0,0,['Simulation'],11322,2833,6867,60.65,0,0.0,34 minute,2.0,4,0,3.0,False,False,False,True,False,False,False,False,False,False,False,False


In [28]:
df.columns

Index(['Unnamed: 0', 'ID_num', 'Name', 'applicationCategory', 'Developer',
       'Publisher', 'OS_windows', 'OS_mac', 'OS_linux', 'SteamPlay',
       'Release_Date', 'worstRating', 'bestRating', 'ratingValue',
       'reviewCount', 'Price', 'Lowest_Price', 'Max_Sale', 'genres',
       'controller_support', 'metacritic_score', 'community_visible_stats',
       'workshop_visible', 'releasestate', 'Achievement_Languages',
       'languages_num', 'community_hub_visible', 'store_tags', 'owners',
       'owners_unc', 'players_total', 'owners_played_percent',
       'players_2_weeks', 'players_2_weeks_percent', 'median_total_playtime',
       'average_total_playtime', 'Packages', 'DLCs', 'Depots', 'genre_Action',
       'genre_Indie', 'genre_Strategy', 'genre_Simulation', 'genre_RPG',
       'genre_Adventure', 'genre_Casual', 'genre_Early Access', 'genre_Racing',
       'genre_Sports', 'genre_Free to Play', 'genre_Massively Multiplayer'],
      dtype='object')

In [29]:
#convert these new columns to codes too:
to_codes = ['genre_Action',
       'genre_RPG', 'genre_Indie', 'genre_Strategy', 'genre_Adventure',
       'genre_Simulation', 'genre_Casual', 'genre_Early Access','controller_support',
       'genre_Racing', 'genre_Free to Play', 'genre_Sports',
       'genre_Massively Multiplayer']

col_to_codes(df,to_codes)

genre_Action
[ True False]
[1 0]
genre_RPG
[False  True]
[0 1]
genre_Indie
[False  True]
[0 1]
genre_Strategy
[False  True]
[0 1]
genre_Adventure
[False  True]
[0 1]
genre_Simulation
[False  True]
[0 1]
genre_Casual
[False  True]
[0 1]
genre_Early Access
[False  True]
[0 1]
controller_support
[-1  2  0  1]
[0 3 1 2]
genre_Racing
[False  True]
[0 1]
genre_Free to Play
[False  True]
[0 1]
genre_Sports
[False  True]
[0 1]
genre_Massively Multiplayer
[False  True]
[0 1]


#considering the same for store_tags, but the list is enormous!

In [30]:

n = copy.deepcopy(df)
n.store_tags
longlist = []
for l in n.store_tags:
    l = l.strip('[]')
    l = l.split(',')
    for item in l:
        item = item.strip('\' ')
        longlist.append(item)
longlist = pd.Series(longlist)
longlist.unique()

array(['Action', 'FPS', 'Multiplayer', 'Shooter', 'Classic', 'Team-Based',
       'Competitive', 'First-Person', 'Tactical', '"1990\'s"', 'e-sports',
       'PvP', 'Military', 'Strategy', 'Score Attack', 'Survival',
       'Assassin', '1980s', 'Ninja', 'Tower Defense', 'Indie', 'Fighting',
       'Zombies', 'World War II', 'Moddable', 'Co-op', 'Singleplayer',
       'War', 'Online Co-Op', 'Gore', 'Historical', 'Tanks',
       'Great Soundtrack', 'Adventure', 'Horror', 'Time Manipulation',
       'Sci-fi', 'Atmospheric', 'Bullet Time', 'Futuristic', 'Time Travel',
       'Story Rich', 'Linear', 'Dark Humor', 'Open World',
       'Third-Person Shooter', 'Funny', 'Violent', 'Third Person',
       'Comedy', 'Sandbox', 'Mature', 'Nudity', 'Simulation', 'RPG',
       'Fantasy', 'Action RPG', 'Loot', 'Magic', 'Character Customization',
       'Hack and Slash', 'Exploration', 'Controller', 'Crafting',
       'Female Protagonist', 'Cartoony', 'Arcade', 'Turn-Based Strategy',
       'Turn-Based'

In [32]:
len(df.columns)

51

## Create new DataFrame, and convert all numerical types to int or float

In [144]:
df_for_model1 = copy.deepcopy(df)

for column in ['Unnamed: 0', 'ID_num',\
       'OS_windows', 'OS_mac', 'OS_linux', 'SteamPlay',\
       'worstRating', 'bestRating', 'ratingValue',\
       'reviewCount', 'Lowest_Price', 'Max_Sale', \
       'metacritic_score', \
       'Price',\
       'languages_num', \
       'owners','owners_unc','players_total',  \
       'owners_played_percent', 'players_2_weeks', 'players_2_weeks_percent',\
       'median_total_playtime', 'average_total_playtime', 'Packages', 'DLCs',\
       'Depots',\
       'OS_windows','OS_mac','OS_linux','SteamPlay','community_visible_stats',\
       'workshop_visible','community_hub_visible']:
    #to put back in for NaN: 
    #for later: Price, workshop_visible,'releasestate', 
    #print(column)
    df_for_model1[column]=pd.to_numeric(df_for_model1[column],errors='coerce')

df_for_model1

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots,genre_Action,genre_Indie,genre_Strategy,genre_Simulation,genre_RPG,genre_Adventure,genre_Casual,genre_Early Access,genre_Racing,genre_Sports,genre_Free to Play,genre_Massively Multiplayer
0,0,10,Counter-Strike,0,Valve,Valve,1,1,1,1,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],0,88.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",1.344e+07,96036.0,9.427e+06,70.13,361920.0,2.69,6.9,185.9,65,0,24.0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1002,Rag Doll Kung Fu,0,Mark Healey,Mark Healey,-1,-1,-1,0,2005-10-12,,,,,9.99,2.49,75.0,['Indie'],0,69.0,-1,-1,-1,[],0,0,"['Indie', 'Fighting']",3.935e+04,5282.0,1.188e+04,30.19,0.0,0.00,,1.1,4,1,2.0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,10090,Call of Duty: World at War,0,Treyarch,Activision,-1,-1,-1,0,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],0,83.0,0,-1,-1,[],0,0,"['Zombies', 'World War II', 'FPS', 'Action', '...",1.674e+06,34382.0,1.424e+06,85.07,106163.0,6.34,12.5,42.1,41,0,7.0,1,0,0,0,0,0,0,0,0,0,0,0
3,3,10130,TimeShift,0,Saber Interactive,Activision,-1,-1,-1,0,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],0,71.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",1.340e+05,9746.0,5.549e+04,41.41,1172.0,0.87,1.1,3.1,6,0,1.0,1,0,0,0,0,0,0,0,0,0,0,0
4,4,10180,Call of Duty: Modern Warfare 2,0,Infinity Ward,Activision,-1,-1,-1,0,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],0,86.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'ja...",8,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5.246e+06,60607.0,4.687e+06,89.33,70157.0,1.34,9.4,23.5,37,0,18.0,1,0,0,0,0,0,0,0,0,0,0,0
5,5,10220,Postal 3,0,Trashmasters,Akella,-1,-1,-1,0,2011-12-21,,,,,11.99,4.79,60.0,['Action'],0,24.0,0,-1,-1,"['english', 'russian']",2,0,"['Action', 'Dark Humor', 'Gore', 'Open World',...",7.830e+04,8137.0,6.901e+04,88.14,2491.0,3.18,2.1,7.1,3,0,4.0,1,0,0,0,0,0,0,0,0,0,0,0
6,6,102400,Vertex Dispenser,0,Michael Brough,Michael Brough,1,1,0,1,2011-06-10,,,,,9.99,4.99,50.0,"['Action', 'Indie', 'Strategy']",0,70.0,0,-1,-1,[],0,0,"['Strategy', 'Action', 'Indie']",3.304e+04,4840.0,1.299e+04,39.33,0.0,0.00,,,5,1,2.0,1,1,1,0,0,0,0,0,0,0,0,0
7,7,10250,PT Boats: Knights of the Sea,0,Studio4,Akella,-1,-1,-1,0,2009-01-30,,,,,6.99,2.49,75.0,['Simulation'],0,,-1,-1,-1,[],0,0,['Simulation'],1.151e+04,2856.0,9.837e+03,85.49,0.0,0.00,1.4,2.7,4,0,3.0,0,0,0,1,0,0,0,0,0,0,0,0
8,8,102500,Kingdoms of Amalur: Reckoning™,0,Big Huge Games,38 Studios,-1,-1,-1,0,2012-02-07,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",3,81.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'sp...",5,0,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",8.919e+05,27437.0,7.470e+05,83.76,19465.0,2.18,9.2,26.6,6,4,13.0,1,0,0,0,1,0,0,0,0,0,0,0
9,9,10260,PT Boats: South Gambit,0,studio4,Akella,-1,-1,-1,0,2010-12-10,,,,,6.99,2.49,75.0,['Simulation'],0,,-1,-1,-1,[],0,0,['Simulation'],1.132e+04,2833.0,6.867e+03,60.65,0.0,0.00,,2.0,4,0,3.0,0,0,0,1,0,0,0,0,0,0,0,0


## Remove non-numerical columns, and irrelevant columns

In [145]:
df_for_model1.columns

Index(['Unnamed: 0', 'ID_num', 'Name', 'applicationCategory', 'Developer',
       'Publisher', 'OS_windows', 'OS_mac', 'OS_linux', 'SteamPlay',
       'Release_Date', 'worstRating', 'bestRating', 'ratingValue',
       'reviewCount', 'Price', 'Lowest_Price', 'Max_Sale', 'genres',
       'controller_support', 'metacritic_score', 'community_visible_stats',
       'workshop_visible', 'releasestate', 'Achievement_Languages',
       'languages_num', 'community_hub_visible', 'store_tags', 'owners',
       'owners_unc', 'players_total', 'owners_played_percent',
       'players_2_weeks', 'players_2_weeks_percent', 'median_total_playtime',
       'average_total_playtime', 'Packages', 'DLCs', 'Depots', 'genre_Action',
       'genre_Indie', 'genre_Strategy', 'genre_Simulation', 'genre_RPG',
       'genre_Adventure', 'genre_Casual', 'genre_Early Access', 'genre_Racing',
       'genre_Sports', 'genre_Free to Play', 'genre_Massively Multiplayer'],
      dtype='object')

In [146]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots,genre_Action,genre_Indie,genre_Strategy,genre_Simulation,genre_RPG,genre_Adventure,genre_Casual,genre_Early Access,genre_Racing,genre_Sports,genre_Free to Play,genre_Massively Multiplayer
0,0,10,Counter-Strike,0,Valve,Valve,1,1,1,1,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,['Action'],0,88.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",13440970,96036,9426812,70.13,361920,2.69,6.9,185.9,65,0,24.0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1002,Rag Doll Kung Fu,0,Mark Healey,Mark Healey,-1,-1,-1,0,2005-10-12,,,,,9.99,2.49,75.0,['Indie'],0,69.0,-1,-1,-1,[],0,0,"['Indie', 'Fighting']",39347,5282,11878,30.19,0,0.0,23 minute,1.1,4,1,2.0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,10090,Call of Duty: World at War,0,Treyarch,Activision,-1,-1,-1,0,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,['Action'],0,83.0,0,-1,-1,[],0,0,"['Zombies', 'World War II', 'FPS', 'Action', '...",1673741,34382,1423924,85.07,106163,6.34,12.5,42.1,41,0,7.0,1,0,0,0,0,0,0,0,0,0,0,0
3,3,10130,TimeShift,0,Saber Interactive,Activision,-1,-1,-1,0,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,['Action'],0,71.0,0,-1,-1,[],0,0,"['Action', 'FPS', 'Time Manipulation', 'Sci-fi...",134003,9746,55494,41.41,1172,0.87,1.1,3.1,6,0,1.0,1,0,0,0,0,0,0,0,0,0,0,0
4,4,10180,Call of Duty: Modern Warfare 2,0,Infinity Ward,Activision,-1,-1,-1,0,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,['Action'],0,86.0,0,-1,-1,"['english', 'french', 'german', 'italian', 'ja...",8,0,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'F...",5246171,60607,4686587,89.33,70157,1.34,9.4,23.5,37,0,18.0,1,0,0,0,0,0,0,0,0,0,0,0


In [147]:
bad = ['Unnamed: 0',  'Developer',\
       'Publisher', \
       'genres',\
       'Achievement_Languages',\
       'store_tags']
#'Name',

df_for_model1.drop(bad, inplace=True,axis=1)
df_for_model1.head()

Unnamed: 0,ID_num,Name,applicationCategory,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,languages_num,community_hub_visible,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots,genre_Action,genre_Indie,genre_Strategy,genre_Simulation,genre_RPG,genre_Adventure,genre_Casual,genre_Early Access,genre_Racing,genre_Sports,genre_Free to Play,genre_Massively Multiplayer
0,10,Counter-Strike,0,1,1,1,1,2000-11-01,0.0,100.0,97.74,93913.0,9.99,2.49,75.0,0,88.0,0,-1,-1,0,0,13440000.0,96036.0,9427000.0,70.13,361920.0,2.69,6.9,185.9,65,0,24.0,1,0,0,0,0,0,0,0,0,0,0,0
1,1002,Rag Doll Kung Fu,0,-1,-1,-1,0,2005-10-12,,,,,9.99,2.49,75.0,0,69.0,-1,-1,-1,0,0,39350.0,5282.0,11880.0,30.19,0.0,0.0,,1.1,4,1,2.0,0,1,0,0,0,0,0,0,0,0,0,0
2,10090,Call of Duty: World at War,0,-1,-1,-1,0,2008-11-18,0.0,100.0,92.69,16971.0,19.99,9.79,51.0,0,83.0,0,-1,-1,0,0,1674000.0,34382.0,1424000.0,85.07,106163.0,6.34,12.5,42.1,41,0,7.0,1,0,0,0,0,0,0,0,0,0,0,0
3,10130,TimeShift,0,-1,-1,-1,0,2007-10-30,0.0,100.0,76.69,653.0,19.99,4.99,75.0,0,71.0,0,-1,-1,0,0,134000.0,9746.0,55490.0,41.41,1172.0,0.87,1.1,3.1,6,0,1.0,1,0,0,0,0,0,0,0,0,0,0,0
4,10180,Call of Duty: Modern Warfare 2,0,-1,-1,-1,0,2009-11-12,0.0,100.0,90.93,26992.0,19.99,9.79,51.0,0,86.0,0,-1,-1,8,0,5246000.0,60607.0,4687000.0,89.33,70157.0,1.34,9.4,23.5,37,0,18.0,1,0,0,0,0,0,0,0,0,0,0,0


## Create a log10(owners) column

In [148]:

df_for_model1["owners_log"]=df_for_model1['owners'].map(lambda x: np.log10(x))

## Write cleaned data to file

In [149]:
df_for_model1.to_csv('ModelData1.csv')

In [93]:
print(df.releasestate.unique())

def catagorize(item,subdict):
    values_dict = {}
    index = 0
    for entry in df.releasestate.unique():
        values_dict[entry] = index
    print(values_dict)
    
    
df['releasestate'] = df['releasestate'].map(catagorize)

[nan 'released' 'prerelease' 'preloadonly']


TypeError: catagorize() missing 1 required positional argument: 'subdict'

In [13]:
u = [1,3,5,7,9]
u = u[::-1]
u.pop(),u.pop(),u.pop()

(1, 3, 5)

In [63]:
dftest = copy.deepcopy(df)

In [285]:
dftest.releasestate = pd.Categorical(dftest.releasestate)
dftest['releasestate_code'] = dftest.releasestate.cat.codes
dftest.head(100)

Unnamed: 0.1,Unnamed: 0,ID_num,Name,applicationCategory,Developer,Publisher,OS_windows,OS_mac,OS_linux,SteamPlay,Release_Date,worstRating,bestRating,ratingValue,reviewCount,Price,Lowest_Price,Max_Sale,genres,controller_support,metacritic_score,community_visible_stats,workshop_visible,releasestate,Achievement_Languages,languages_num,community_hub_visible,store_tags,releasestate.1,owners,owners_unc,players_total,owners_played_percent,players_2_weeks,players_2_weeks_percent,median_total_playtime,average_total_playtime,Packages,DLCs,Depots,ReleaseDT,releasestate_code
0,0,10220,Postal 3,Game,Trashmasters,Akella,,,,False,2011-12-21 00:00:00,,,,,11.99,4.79,60.0,['Action'],,24.0,Yes,,,"['english', 'russian']",2,Yes,"['Action', 'Dark Humor', 'Gore', 'Open World',...",,78302,8137,69012,88.14%,2491,3.18%,2.1 hours,7.1 hours,3,0,4.0,2011-12-21,-1
1,1,102500,Kingdoms of Amalur: Reckoning™,Game,Big Huge Games,38 Studios,,,,False,2012-02-07 00:00:00,0.0,100.0,87.28,7748.0,19.99,4.39,78.0,"['Action', 'RPG']",partial,81.0,Yes,,,"['english', 'french', 'german', 'italian', 'sp...",5,Yes,"['RPG', 'Fantasy', 'Open World', 'Singleplayer...",,891851,27437,746970,83.76%,19465,2.18%,9.2 hours,26.6 hours,6,4,13.0,2012-02-07,-1
2,2,102600,Orcs Must Die!,Game,Robot Entertainment,Robot Entertainment,,,,False,2011-10-11 00:00:00,0.0,100.0,96.53,5010.0,9.99,0.99,90.0,"['Action', 'Indie', 'Strategy']",partial,83.0,Yes,,,"['brazilian', 'english', 'french', 'german', '...",9,Yes,"['Tower Defense', 'Action', 'Strategy', 'Third...",,1528003,35885,941620,61.62%,7963,0.52%,3.1 hours,7.5 hours,14,7,18.0,2011-10-11,-1
3,3,102840,Shank 2,Game,Klei Entertainment,Klei Entertainment,True,True,False,True,2012-02-07 00:00:00,0.0,100.0,81.91,1553.0,9.99,1.49,85.0,"['Action', 'Adventure', 'Indie']",full,72.0,Yes,,,[],0,Yes,"['Action', ""Beat 'em up"", 'Indie', 'Adventure'...",,602973,22567,284676,47.21%,3981,0.66%,1.5 hours,3.7 hours,8,1,3.0,2012-02-07,-1
4,4,104200,BEEP,Game,Big Fat Alien,Big Fat Alien,True,False,False,False,2011-05-06 00:00:00,0.0,100.0,84.15,4637.0,1.99,0.19,96.0,"['Adventure', 'Indie']",,,,,,[],0,Yes,"['Indie', 'Platformer', 'Adventure', 'Puzzle',...",,689017,24121,456100,66.2%,2876,0.42%,1.5 hours,2.3 hours,4,1,3.0,2011-05-06,-1
5,5,105000,A New Beginning - Final Cut,Game,Daedalic Entertainment,Daedalic Entertainment,True,True,False,True,2012-12-11 00:00:00,,,,,9.99,0.99,90.0,"['Adventure', 'Indie']",,72.0,Yes,,,"['english', 'french', 'german', 'italian', 'po...",7,Yes,"['Adventure', 'Point & Click', 'Indie', 'Sci-f...",,295072,15793,82947,28.11%,669,0.23%,2.1 hours,5.5 hours,15,1,22.0,2012-12-11,-1
6,6,105450,Age of Empires® III: Complete Collection,Game,Ensemble Studios,Microsoft Studios,,,,False,2009-09-15 00:00:00,0.0,100.0,89.24,10672.0,39.99,4.39,89.0,"['Simulation', 'Strategy']",,81.0,Yes,,,[],0,Yes,"['Strategy', 'RTS', 'Base Building', 'Historic...",,2121022,42249,1769546,83.43%,112366,5.3%,5.4 hours,25.2 hours,10,0,6.0,2009-09-15,-1
7,7,105600,Terraria,Game,Re-Logic,Re-Logic,True,True,True,True,2011-05-16 00:00:00,0.0,100.0,96.96,186859.0,9.99,1.99,80.0,"['Action', 'Adventure', 'Indie', 'RPG']",full,83.0,Yes,,,[],0,Yes,"['Sandbox', 'Adventure', 'Survival', '2D', 'Mu...",,8834680,85522,8392736,95%,778821,8.82%,23.4 hours,79.9 hours,10,1,4.0,2011-05-16,-1
8,8,105800,PixelJunk Eden,Game,Q-Games Ltd.,Q-Games Ltd.,,,,False,2012-02-02 00:00:00,0.0,100.0,77.47,657.0,9.99,0.99,90.0,"['Action', 'Casual', 'Indie']",full,81.0,Yes,,,"['english', 'french', 'italian', 'japanese', '...",5,Yes,"['Casual', 'Indie', 'Platformer', 'Great Sound...",,318518,16408,148421,46.6%,1106,0.35%,1.1 hours,2.1 hours,6,1,2.0,2012-02-02,-1
9,9,106000,The Cursed Crusade,Game,Kylotonn Entertainment,ATLUS USA,,,,False,2011-10-25 00:00:00,,,,,19.99,2.99,85.0,"['Action', 'Adventure']",,55.0,Yes,,,"['english', 'french', 'german', 'italian', 'po...",7,Yes,"['Action', 'Adventure', 'Medieval', 'Hack and ...",,71445,7773,62376,87.31%,623,0.87%,3.7 hours,7.7 hours,10,0,8.0,2011-10-25,-1


In [92]:
dftest.releasestate.sample(50)

NameError: name 'dftest' is not defined

# Initial Peaks at the data  

on next part
