In [10]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Load dataset

In [11]:
# Relative path to the dataset CSV (resolved against the kernel's working directory).
FINAL_DATASET_PATH = os.path.join("final_dataset", "final_dataset.csv")

# Load the whole CSV into a DataFrame using pandas' default parsing options.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column, which looks like
# an index written out when the CSV was saved — consider index_col=0; confirm.
whole_steam_data = pd.read_csv(FINAL_DATASET_PATH)

In [12]:
# Show the column labels of the loaded dataset.
whole_steam_data.columns

Index(['Unnamed: 0', 'appid', 'user_reviews', 'desc', 'full_desc', 'popu_tags',
       'name', 'release_date', 'english', 'developer', 'publisher', 'windows',
       'mac', 'linux', 'required_age', 'categories', 'genres', 'steamspy_tags',
       'achievements', 'average_playtime', 'median_playtime', 'owners',
       'price'],
      dtype='object')

In [13]:
# Preview the first rows (head() defaults to 5) to sanity-check the parsed values.
whole_steam_data.head()

Unnamed: 0.1,Unnamed: 0,appid,user_reviews,desc,full_desc,popu_tags,name,release_date,english,developer,...,linux,required_age,categories,genres,steamspy_tags,achievements,average_playtime,median_playtime,owners,price
0,0,10,97.0,Play the world's number 1 online action game. ...,About This GamePlay the world's number 1 onlin...,ActionFPSMultiplayerShooterClassicTeam-BasedFi...,Counter-Strike,2000-11-01,1,Valve,...,1,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,17612,317,15000000.0,7.19
1,1,20,90.0,One of the most popular online action games of...,About This GameOne of the most popular online ...,ActionFPSMultiplayerClassicHero ShooterShooter...,Team Fortress Classic,1999-04-01,1,Valve,...,1,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,277,62,7500000.0,3.99
2,2,30,85.0,Enlist in an intense brand of Axis vs. Allied ...,About This GameEnlist in an intense brand of A...,FPSWorld War IIMultiplayerShooterActionWarTeam...,Day of Defeat,2003-05-01,1,Valve,...,1,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,187,34,7500000.0,3.99
3,3,40,70.0,Enjoy fast-paced multiplayer gaming with Death...,About This GameEnjoy fast-paced multiplayer ga...,ActionFPSClassicMultiplayerShooterFirst-Person...,Deathmatch Classic,2001-06-01,1,Valve,...,1,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,258,184,7500000.0,3.99
4,4,50,94.0,Return to the Black Mesa Research Facility as ...,About This GameReturn to the Black Mesa Resear...,FPSActionClassicSci-fiSingleplayerShooterFirst...,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,...,1,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,624,415,7500000.0,3.99


In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a fixed selection from its input.

    Parameters
    ----------
    attribute_names
        Key used to index the input in ``transform`` (``X[attribute_names]``);
        the pipelines in this notebook pass a list of column names.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record the top ``value_counts()`` entry of every column of ``X``.

        The result is stored in ``most_frequent_`` as a Series indexed by
        ``X.columns``.  ``value_counts()`` skips missing values by default, so
        a column with no non-missing entries has an empty count index and
        ``index[0]`` raises ``IndexError``.

        Returns
        -------
        self
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            # Counts are sorted in descending order, so the first label is the mode.
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the learned per-column fills."""
        filled = X.fillna(self.most_frequent_)
        return filled

In [None]:
# Numeric branch: pick the numeric columns, then replace missing values with
# each column's median (computed when the pipeline is fitted).
num_pipeline = Pipeline(steps=[
    (
        "select_numeric",
        DataFrameSelector([
            "english",
            "windows",
            "mac",
            "linux",
            "required_age",
            "achievements",
            "average_playtime",
            "median_playtime",
            "owners",
            "price",
        ]),
    ),
    ("imputer", SimpleImputer(strategy="median")),
])

In [None]:
# Fit the numeric branch on `train_data` and display the transformed output.
# NOTE(review): `train_data` is not assigned in any cell shown in this notebook;
# define it before this cell (or confirm where it comes from) so that
# Restart & Run All works.
num_pipeline.fit_transform(train_data)

In [None]:
# Categorical branch: pick the text columns, fill gaps with each column's most
# frequent value, then one-hot encode into a dense array. Categories not seen
# during fit are ignored at transform time (handle_unknown='ignore').
# NOTE(review): "desc", "full_desc" and "name" look like near-unique free text
# in the data preview, so one-hot encoding them may produce roughly one column
# per row — confirm this is intended.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# confirm the installed version before upgrading.
cat_pipeline = Pipeline(steps=[
    (
        "select_cat",
        DataFrameSelector([
            "desc",
            "full_desc",
            "popu_tags",
            "name",
            "developer",
            "publisher",
        ]),
    ),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

In [None]:
# Fit the categorical branch on `train_data` and display the encoded output.
# NOTE(review): `train_data` is not assigned in any cell shown in this notebook;
# define it before this cell (or confirm where it comes from) so that
# Restart & Run All works.
cat_pipeline.fit_transform(train_data)

In [None]:
# Full model: preprocessing followed by a linear-kernel SVM classifier.
# NOTE(review): `preprocess_pipeline`, `kfold`, `X_train` and `y_train` are not
# assigned in any cell shown in this notebook — confirm they are defined before
# this cell runs (FeatureUnion, StratifiedKFold and train_test_split are
# imported above but never used in the visible cells).
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', SVC(kernel='linear')),
])

# Candidate values for the SVM's C parameter; the `classifier__` prefix routes
# the setting to the pipeline step named 'classifier'.
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 2, 3, 4],
}

# Exhaustive search over param_grid, cross-validated with the `kfold` splitter.
grid_1 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_1.fit(X_train, y_train)
# Fitted attributes end with an underscore: `best_params_` holds the winning
# setting, whereas `best_params` does not exist and raises AttributeError.
grid_1.best_params_

In [None]:
# Predict on `test_data` with the refitted best estimator; GridSearchCV.predict
# delegates to best_estimator_.predict. Despite its name, `best_grid` holds the
# predictions, not a grid or an estimator.
# NOTE(review): `test_data` is not assigned in any cell shown in this notebook —
# confirm it is defined before this cell runs.
best_grid = grid_1.predict(test_data)