In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score,make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
import numpy as np

# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None


In [2]:
# Load the prepared dataset; `accepted` is the binary target column.
final_data_df = pd.read_csv('final_data.csv')

# 'Unnamed: 0' is the row-number column left behind when the frame was written
# to CSV with its index. It is not a real feature and can leak row ordering
# into the models, so it is excluded from X. errors='ignore' keeps this cell
# working if the CSV is later regenerated without that column; a missing
# `accepted` column still fails loudly on the `y = ...` line below.
X = final_data_df.drop(columns=['accepted', 'Unnamed: 0'], errors='ignore').copy()
y = final_data_df['accepted'].copy()

# 70/30 hold-out split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

final_data_df.head()

Unnamed: 0,discussions_count,comments_count,movie_count,screenshot_count,awards_count,link_count,file_size_mb,change_notes,certified_compatible,creator_count,featured_collection_count,visitors_count,favorites_count,accepted,class_count,item_slot_count,class_scout,class_soldier,class_pyro,class_demoman,class_heavy,class_engineer,class_medic,class_sniper,class_spy,item_slot_headgear,item_slot_misc,item_slot_taunt,item_slot_weapon,is_medal,passed_time,posted_year,updated_year,posted_month,updated_month,media_count,submitted_by_submitter,v1,v2,v3,v4,v5,v6,v7,v8
0,-0.299559,0.387851,0,6,0,0,0.026881,3.388062,1,0.044635,0.619064,-0.003128,0.170641,1,5,1,1,0,0,1,1,1,0,1,0,0,1,0,0,0,3.19428,2012,2016,9,3,6,2.741052,0.268692,154.0,5492.0,35.432258,0.857143,1,0,1
1,-0.299559,-0.119405,0,4,0,0,-0.363812,1.444371,1,0.044635,0.002978,-0.261479,-0.206975,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,2.811226,2012,2015,9,10,4,2.741052,0.265896,40.0,2218.0,54.097561,0.8,1,1,1
2,-0.299559,0.761618,0,2,0,0,-0.107814,-0.499321,0,1.583537,0.485132,0.502919,0.808887,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,-0.308865,2012,2012,9,9,2,0.607169,0.116414,238.0,11905.0,49.811715,0.666667,0,0,1
3,-0.299559,0.757168,0,8,0,0,-0.051529,0.796474,0,0.044635,0.404773,0.359304,0.499389,1,9,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,-0.281847,2012,2012,9,10,8,0.240555,0.14,237.0,10085.0,42.37395,0.888889,1,1,1
4,0.320877,3.14216,0,0,0,0,-0.8269,-0.499321,0,-0.724816,-0.255957,2.1833,-0.312116,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-0.308865,2012,2012,2,2,0,-0.821686,0.166667,193.25,8300.0,42.894057,0.0,1,1,1


In [3]:
# Logistic regression with class 1 weighted 4x relative to class 0.
lg1 = LogisticRegression(
    class_weight={0: 1, 1: 4},
    max_iter=10000000,
    random_state=13,
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = lg1.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

Confusion Matrix: 
[[5818  279]
 [ 207  326]]
Accuracy Score: 0.9266968325791856
Recall score: 0.6116322701688556
F1 score: 0.5729349736379614


In [4]:
# Cross-validated logistic regression, same 1:4 class weighting as lg1.
lg2 = LogisticRegressionCV(
    class_weight={0: 1, 1: 4},
    max_iter=10000000,
    random_state=13,
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = lg2.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

Confusion Matrix: 
[[5805  292]
 [ 201  332]]
Accuracy Score: 0.9256410256410257
Recall score: 0.6228893058161351
F1 score: 0.5738980121002593


In [5]:
# Single decision tree (class 1 weighted 4x), tuned over depth and
# minimum split size with F1 as the selection metric.
tree = DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 4})
tree_grid = GridSearchCV(
    tree,
    {
        'max_depth': range(2, 20, 2),
        'min_samples_split': range(2, 30, 2),
    },
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
tree_grid.fit(X_train, y_train)

# Winning hyper-parameters and their mean cross-validated F1.
print(tree_grid.best_params_)
print(tree_grid.best_score_)

# Evaluate the refit best model on the held-out split.
y_pred = tree_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'max_depth': 10, 'min_samples_split': 4}
0.6416001119666405
Confusion Matrix: 
[[5812  285]
 [ 183  350]]
Accuracy Score: 0.9294117647058824
Recall score: 0.6566604127579737
F1 score: 0.5993150684931506


In [6]:
# Extra-trees ensemble (class 1 weighted 4x), tuned over the same depth /
# minimum split size grid as the single tree, selected by F1.
tree2 = ExtraTreesClassifier(random_state=42, class_weight={0: 1, 1: 4})
tree2_grid = GridSearchCV(
    tree2,
    {
        'max_depth': range(2, 20, 2),
        'min_samples_split': range(2, 30, 2),
    },
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
tree2_grid.fit(X_train, y_train)

# Winning hyper-parameters and their mean cross-validated F1.
print(tree2_grid.best_params_)
print(tree2_grid.best_score_)

# Evaluate the refit best model on the held-out split.
y_pred = tree2_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'max_depth': 18, 'min_samples_split': 12}
0.6289737151907864
Confusion Matrix: 
[[6025   72]
 [ 255  278]]
Accuracy Score: 0.9506787330316742
Recall score: 0.5215759849906192
F1 score: 0.6296715741789355


In [7]:
# Random forest (class 1 weighted 4x), tuned over the number of trees
# (3, 6, ..., 69) with F1 as the selection metric.
rand_forest = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 4})
rand_forest_grid = GridSearchCV(
    rand_forest,
    {'n_estimators': range(3, 70, 3)},
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
rand_forest_grid.fit(X_train, y_train)

# Winning hyper-parameters and their mean cross-validated F1.
print(rand_forest_grid.best_params_)
print(rand_forest_grid.best_score_)

# Evaluate the refit best model on the held-out split.
y_pred = rand_forest_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'n_estimators': 39}
0.6357083415222518
Confusion Matrix: 
[[6083   14]
 [ 279  254]]
Accuracy Score: 0.9558069381598794
Recall score: 0.47654784240150094
F1 score: 0.6342072409488139


In [8]:
# Soft-voting ensemble over the logistic regression, the tuned decision tree
# and the tuned random forest.
#
# VotingClassifier clones and refits every estimator it is given. Passing the
# GridSearchCV wrappers themselves would therefore re-run both complete grid
# searches inside voting.fit (nested under n_jobs=-1). The searches were
# already run in earlier cells, so the selected best_estimator_ models are
# passed instead: with the fixed random_state values they are expected to
# refit to the same models, without repeating the search.
# Estimator names are kept unchanged so lookups by name keep working.
voting = VotingClassifier(
    estimators=[
        ('lg1', lg1),
        ('tree', tree_grid.best_estimator_),
        ('rand_forest_grid', rand_forest_grid.best_estimator_),
    ],
    voting='soft',
    n_jobs=-1,
)
voting.fit(X_train, y_train)

# Evaluate the ensemble on the held-out split.
y_pred = voting.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')
print(f'F1 score: {f1_score(y_test,y_pred)}')

Confusion Matrix: 
[[5999   98]
 [ 205  328]]
Accuracy Score: 0.9542986425339367
Recall score: 0.6153846153846154
F1 score: 0.6840458811261731
