In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import SteamVariables as sv


dt = pd.read_csv(sv.CSV_PATH, nrows=20000) # nrows=2000000
pd.set_option('display.max_columns', None)

# Eliminar colunas desnecessarias
dt.drop(["Unnamed: 0"], axis=1, inplace=True)
dt.drop([sv.STEAM_PURCHASE], axis=1, inplace=True)
dt.drop([sv.WRITTEN_DURING_EARLY_ACCESS], axis=1, inplace=True)
dt.drop([sv.APP_NAME], axis=1, inplace=True)
dt.drop([sv.REVIEW], axis=1, inplace=True)
dt.drop([sv.TIMESTAMP_UPDATED], axis=1, inplace=True)
dt.drop([sv.TIMESTAMP_CREATED], axis=1, inplace=True)
dt.drop([sv.LANGUAGE], axis=1, inplace=True)

# Resetar index
dt.reset_index(drop=True, inplace=True)

# Eliminar linhas com valores nulos
dt[sv.AUTHOR_NUM_GAMES_OWNED].fillna(dt[sv.AUTHOR_NUM_GAMES_OWNED].mean(), inplace=True)
dt[sv.AUTHOR_NUM_REVIEWS].fillna(dt[sv.AUTHOR_NUM_REVIEWS].mean(), inplace=True)
dt[sv.AUTHOR_PLAYTIME_FOREVER].fillna(dt[sv.AUTHOR_PLAYTIME_FOREVER].mean(), inplace=True)
dt[sv.AUTHOR_PLAYTIME_LAST_TWO_WEEKS].fillna(dt[sv.AUTHOR_PLAYTIME_LAST_TWO_WEEKS].mean(), inplace=True)
dt[sv.AUTHOR_LAST_PLAYED].fillna(dt[sv.AUTHOR_LAST_PLAYED].mean(), inplace=True)



In [2]:
# Transformar valores booleanos em inteiros
dt[sv.RECOMMENDED] = dt[sv.RECOMMENDED].map({True: 1, False: 0})


dtTraining = dt.drop(sv.RECOMMENDED, axis=1)
dtTest = dt[sv.RECOMMENDED]
X_train, X_test, Y_train, Y_test = train_test_split(dtTraining, dtTest, test_size=0.3, random_state=5)



In [3]:

##Proprotions
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(14000, 14)
(6000, 14)
(14000,)
(6000,)


In [4]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])
pipe.get_params()
pd.DataFrame(pipe.get_params())

Unnamed: 0,memory,steps,verbose,scaler,knn,scaler__copy,scaler__with_mean,scaler__with_std,knn__algorithm,knn__leaf_size,knn__metric,knn__metric_params,knn__n_jobs,knn__n_neighbors,knn__p,knn__weights
0,,"(scaler, StandardScaler())",False,StandardScaler(),KNeighborsClassifier(),True,True,True,auto,30,minkowski,,,5,2,uniform
1,,"(knn, KNeighborsClassifier())",False,StandardScaler(),KNeighborsClassifier(),True,True,True,auto,30,minkowski,,,5,2,uniform


In [5]:
#!Escolhe qual é a melhor opção com diferentes parametros parametros
mod = GridSearchCV(estimator=pipe,
                   param_grid={'knn__n_neighbors':[1,2,3,4,5,6,7,8,9,10],
                               'knn__weights':['uniform','distance'],
                               'knn__leaf_size':[1,2,3,4,5,6,7,8,9,10],
                               'knn__p':[1,2]},
                   cv=3)

In [6]:
mod.fit(X_train,Y_train)
print(mod.best_params_)
pd.DataFrame(mod.cv_results_)

{'knn__leaf_size': 1, 'knn__n_neighbors': 6, 'knn__p': 1, 'knn__weights': 'uniform'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__leaf_size,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.027038,0.001784,0.504754,0.035194,1,1,1,uniform,"{'knn__leaf_size': 1, 'knn__n_neighbors': 1, '...",0.969359,0.969788,0.970424,0.969857,0.000438,321
1,0.024394,0.000630,0.476414,0.061055,1,1,1,distance,"{'knn__leaf_size': 1, 'knn__n_neighbors': 1, '...",0.969359,0.969788,0.970424,0.969857,0.000438,321
2,0.026704,0.000965,1.004234,0.073592,1,1,2,uniform,"{'knn__leaf_size': 1, 'knn__n_neighbors': 1, '...",0.965717,0.968502,0.966138,0.966786,0.001226,351
3,0.026435,0.000722,0.951061,0.072500,1,1,2,distance,"{'knn__leaf_size': 1, 'knn__n_neighbors': 1, '...",0.965717,0.968502,0.966138,0.966786,0.001226,351
4,0.024032,0.000408,0.592513,0.036653,1,2,1,uniform,"{'knn__leaf_size': 1, 'knn__n_neighbors': 2, '...",0.960146,0.965288,0.967424,0.964286,0.003055,381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.020204,0.000857,0.619313,0.024471,10,9,2,distance,"{'knn__leaf_size': 10, 'knn__n_neighbors': 9, ...",0.974288,0.974716,0.973210,0.974071,0.000633,211
396,0.020276,0.000605,0.587414,0.022405,10,10,1,uniform,"{'knn__leaf_size': 10, 'knn__n_neighbors': 10,...",0.975359,0.976002,0.972353,0.974571,0.001590,151
397,0.022038,0.003575,0.520330,0.024463,10,10,1,distance,"{'knn__leaf_size': 10, 'knn__n_neighbors': 10,...",0.975145,0.976002,0.972353,0.974500,0.001558,161
398,0.020863,0.000628,0.685840,0.018974,10,10,2,uniform,"{'knn__leaf_size': 10, 'knn__n_neighbors': 10,...",0.974930,0.973859,0.971925,0.973571,0.001244,271


In [7]:

#Create the Classifier
clf = KNeighborsClassifier(n_neighbors=5)
#Train the model using the training sets
clf.fit(X_train, Y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
print("----------Classification report-----------")
print(classification_report(Y_test, y_pred))


----------Classification report-----------
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       168
           1       0.97      1.00      0.98      5832

    accuracy                           0.97      6000
   macro avg       0.49      0.50      0.49      6000
weighted avg       0.94      0.97      0.96      6000

