# Import libraries

In [1]:
# main libraries
import pandas as pd # used for handling the dataset
import numpy as np # used for handling numbers
import time

In [14]:

from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,matthews_corrcoef,classification_report,roc_curve
from sklearn.externals import joblib




导入模型

In [2]:
# sklearn libraries
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import PCA
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.preprocessing import StandardScaler # used for feature scaling
from sklearn.preprocessing import normalize

# Read data

In [3]:
# title = ['id', 'playtime_forever', 'price', 'genres', 'categories','tags','purchase_date','release_date','total_positive_reviews','total_negative_reviews']
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,id,playtime_forever,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews
0,0,0.0,False,3700.0,"Adventure,Casual,Indie","Single-player,Steam Trading Cards,Steam Cloud","Indie,Adventure,Story Rich,Casual,Atmospheric,...","Jul 2, 2018","10 Dec, 2013",372.0,96.0
1,1,0.016667,True,0.0,RPG,"Single-player,Partial Controller Support","Mod,Utilities,RPG,Game Development,Singleplaye...","Nov 26, 2016","12 Aug, 2015",23.0,0.0
2,2,0.0,False,5000.0,"Adventure,Casual,Indie","Single-player,Full controller support,Steam Tr...","Point & Click,Adventure,Story Rich,Comedy,Indi...","Jul 2, 2018","28 Jan, 2014",3018.0,663.0
3,3,1.533333,False,9900.0,"Action,RPG","Single-player,Multi-player,Steam Achievements,...","Medieval,RPG,Open World,Strategy,Sandbox,Actio...","Nov 28, 2016","31 Mar, 2010",63078.0,1746.0
4,4,22.333333,False,4800.0,"Action,Indie,Strategy","Single-player,Co-op,Steam Achievements,Full co...","Tower Defense,Co-op,Action,Strategy,Online Co-...","Mar 4, 2018","30 Jul, 2012",8841.0,523.0


In [4]:
df.shape

(357, 11)

# Data Processing
· id不变。

· genres字段：是分类字段，要转成数字。首先将Genres中的类别转成字符串到数字的字典，然后再将每个游戏的Genres字段转成数字列表，因为有些游戏是多个Genres的组合。

· categories字段：处理方式跟genres字段一样,首先创建文本到数字的字典，然后将categories字段中的描述转成数字的列表。另外categories字段中的年份也需要去掉。

· tags字段：处理方式跟genres字段一样,首先创建文本到数字的字典，然后将tags字段中的描述转成数字的列表。另外tags字段中的年份也需要去掉。

·genres和categories和tags字段需要将长度统一，这样在神经网络中方便处理。空白部分用‘< PAD >’对应的数字填充。




In [5]:
# Checking for missing values
df.isnull().any().sum()

3

## handling the missing data

In [119]:
# 游戏genres转数字字典
genres_set = set()
for val in df['genres'].str.split(','):
    genres_set.update(val)

genres_set.add('<PAD>')
genres2int = {val:ii for ii, val in enumerate(genres_set)}
# print(genres2int)

#将游戏genres转成等长数字列表，长度是18
genres_map = {val:[genres2int[row] for row in val.split(',')] for ii,val in enumerate(set(df['genres']))}
for key in genres_map:
    for cnt in range(max(genres2int.values()) - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
df['genres'] = df['genres'].map(genres_map)
df.head()

Unnamed: 0,id,playtime_forever,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews
0,0,0.0,False,3700.0,"[3, 18, 17, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","Single-player,Steam Trading Cards,Steam Cloud","Indie,Adventure,Story Rich,Casual,Atmospheric,...","Jul 2, 2018","10 Dec, 2013",372.0,96.0
1,1,0.016667,True,0.0,"[11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...","Single-player,Partial Controller Support","Mod,Utilities,RPG,Game Development,Singleplaye...","Nov 26, 2016","12 Aug, 2015",23.0,0.0
2,2,0.0,False,5000.0,"[3, 18, 17, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","Single-player,Full controller support,Steam Tr...","Point & Click,Adventure,Story Rich,Comedy,Indi...","Jul 2, 2018","28 Jan, 2014",3018.0,663.0
3,3,1.533333,False,9900.0,"[14, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","Single-player,Multi-player,Steam Achievements,...","Medieval,RPG,Open World,Strategy,Sandbox,Actio...","Nov 28, 2016","31 Mar, 2010",63078.0,1746.0
4,4,22.333333,False,4800.0,"[14, 17, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","Single-player,Co-op,Steam Achievements,Full co...","Tower Defense,Co-op,Action,Strategy,Online Co-...","Mar 4, 2018","30 Jul, 2012",8841.0,523.0


In [121]:
#游戏categories转数字字典
categories_set = set()
for val in df['categories'].str.split(','):
    categories_set.update(val)
    
categories_set.add('<PAD>')
categories2int = {val:ii for ii, val in enumerate(categories_set)}
print(categorie2int)

#将游戏categories转成等长数字列表，长度是18
categories_map = {val:[categories2int[row] for row in val.split(',')] for ii,val in enumerate(set(df['categories']))}
for key in categories_map:
    for cnt in range(max(categories2int.values()) - len(categories_map[key])):
        categories_map[key].insert(len(categories_map[key]) + cnt,categories2int['<PAD>'])
    
df['categories'] = df['categories'].map(categories_map)
df.head()

{'Remote Play on Phone': 0, 'VR Support': 1, 'Steam Achievements': 2, 'Commentary available': 3, 'Remote Play on Tablet': 4, 'Co-op': 5, 'Multi-player': 6, 'Cross-Platform Multiplayer': 7, 'Single-player': 8, 'Valve Anti-Cheat enabled': 9, 'Includes level editor': 10, 'Local Co-op': 11, '<PAD>': 12, 'Captions available': 13, 'Online Multi-Player': 14, 'Remote Play on TV': 15, 'SteamVR Collectibles': 16, 'Shared/Split Screen': 17, 'Local Multi-Player': 18, 'Includes Source SDK': 19, 'In-App Purchases': 20, 'Full controller support': 21, 'Steam Workshop': 22, 'Steam Leaderboards': 23, 'Stats': 24, 'Steam Trading Cards': 25, 'Steam Cloud': 26, 'Online Co-op': 27, 'Partial Controller Support': 28, 'MMO': 29}


Unnamed: 0,id,playtime_forever,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews
0,0,0.0,False,3700.0,"[3, 18, 17, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[8, 25, 26, 12, 12, 12, 12, 12, 12, 12, 12, 12...","Indie,Adventure,Story Rich,Casual,Atmospheric,...","Jul 2, 2018","10 Dec, 2013",372.0,96.0
1,1,0.016667,True,0.0,"[11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...","[8, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...","Mod,Utilities,RPG,Game Development,Singleplaye...","Nov 26, 2016","12 Aug, 2015",23.0,0.0
2,2,0.0,False,5000.0,"[3, 18, 17, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[8, 21, 25, 26, 12, 12, 12, 12, 12, 12, 12, 12...","Point & Click,Adventure,Story Rich,Comedy,Indi...","Jul 2, 2018","28 Jan, 2014",3018.0,663.0
3,3,1.533333,False,9900.0,"[14, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[8, 6, 2, 25, 22, 12, 12, 12, 12, 12, 12, 12, ...","Medieval,RPG,Open World,Strategy,Sandbox,Actio...","Nov 28, 2016","31 Mar, 2010",63078.0,1746.0
4,4,22.333333,False,4800.0,"[14, 17, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[8, 5, 2, 21, 25, 26, 23, 12, 12, 12, 12, 12, ...","Tower Defense,Co-op,Action,Strategy,Online Co-...","Mar 4, 2018","30 Jul, 2012",8841.0,523.0


In [125]:
#游戏tags转数字字典
tags_set = set()
for val in df['tags'].str.split(','):
    tags_set.update(val)
    
tags_set.add('<PAD>')
tags2int = {val:ii for ii, val in enumerate(tags_set)}
print(tags2int)



{'Isometric': 0, 'Programming': 1, 'Romance': 2, '4 Player Local': 3, 'Software': 4, 'Text-Based': 5, 'Female Protagonist': 6, 'Minigames': 7, 'Building': 8, 'Family Friendly': 9, 'Intentionally Awkward Controls': 10, 'Memes': 11, 'Parkour': 12, 'Dragons': 13, 'Economy': 14, 'Turn-Based Combat': 15, 'Experience': 16, 'Pixel Graphics': 17, 'Dungeon Crawler': 18, 'God Game': 19, 'Snowboarding': 20, 'Chess': 21, '1980s': 22, 'Cartoon': 23, 'Bullet Hell': 24, 'Replay Value': 25, 'Batman': 26, 'Blood': 27, 'Addictive': 28, 'World War I': 29, 'Crime': 30, 'Emotional': 31, 'Episodic': 32, 'Logic': 33, 'Science': 34, 'Politics': 35, 'Minimalist': 36, 'Investigation': 37, 'Atmospheric': 38, 'Shooter': 39, 'Diplomacy': 40, 'Submarine': 41, 'Mouse only': 42, 'Music-Based Procedural Generation': 43, 'Rogue-lite': 44, 'Futuristic': 45, 'Base Building': 46, 'Singleplayer': 47, 'Sandbox': 48, 'Platformer': 49, 'Lovecraftian': 50, 'Supernatural': 51, 'Spectacle fighter': 52, 'Survival Horror': 53, 'Mu

20

In [126]:
# Splitting the attributes into independent and dependent attributes
X = df.iloc[:,[0,2,3,4,5,6,7,8,9,10]].values # independent variable set
#X = df.iloc[:,:-1].values #Takes all rows of all columns except the last column
X

array([[0, False, 3700.0, ..., '10 Dec, 2013', 372.0, 96.0],
       [1, True, 0.0, ..., '12 Aug, 2015', 23.0, 0.0],
       [2, False, 5000.0, ..., '28 Jan, 2014', 3018.0, 663.0],
       ...,
       [354, False, 8300.0, ..., '5 Nov, 2015', 5099.0, 1719.0],
       [355, False, 6800.0, ..., '3 Jun, 2016', 718.0, 159.0],
       [356, False, 10000.0, ..., '5 Dec, 2013', 915.0, 102.0]],
      dtype=object)

In [127]:
Y = df.iloc[:,1].values # dependent variable set
Y

array([0.00000000e+00, 1.66666667e-02, 0.00000000e+00, 1.53333333e+00,
       2.23333333e+01, 2.93333333e+00, 2.61666667e+00, 1.50000000e-01,
       1.66666667e-02, 1.66666667e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.05666667e+01, 6.33333333e-01, 1.66666667e-02,
       3.30000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.66666667e-01, 1.61666667e+00, 0.00000000e+00, 5.00000000e-02,
       0.00000000e+00, 0.00000000e+00, 2.16666667e-01, 0.00000000e+00,
       0.00000000e+00, 4.21666667e+00, 4.26666667e+00, 4.33333333e+00,
       2.10000000e+00, 9.16666667e-01, 0.00000000e+00, 2.85000000e+00,
       5.66666667e+01, 0.00000000e+00, 4.51666667e+00, 3.08333333e+00,
       0.00000000e+00, 1.70000000e+00, 1.15166667e+01, 6.50000000e+00,
       8.83333333e-01, 4.66666667e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 4.00000000e-01, 3.16666667e-01, 0.00000000e+00,
       0.00000000e+00, 1.98333333e+00, 1.45500000e+01, 2.00000000e-01,
      

In [54]:
# handling the missing data and replace missing values with nan from numpy and replace with mean of all the other values
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
X = imp.fit_transform(X)
Y = Y.reshape(-1,1)
Y = imp.fit_transform(Y)
Y = Y.reshape(-1)

ValueError: Cannot use mean strategy with non-numeric data. Received datatype :O.