In [9]:
import pandas as pd
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score,make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
import numpy as np

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None


In [10]:
# Load the prepared dataset and split it into predictors / target.
# NOTE(review): if the CSV was written with its index, a column such as
# 'Unnamed: 0' would end up in X as a feature — confirm against the file.
final_data_df = pd.read_csv('final_data.csv')

target_column = 'accepted'
y = final_data_df[target_column].copy()
X = final_data_df.drop(columns=[target_column]).copy()

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

# Last expression of the cell: preview the first rows of the full frame.
final_data_df.head()

Unnamed: 0,discussions_count,comments_count,movie_count,screenshot_count,awards_count,link_count,file_size_mb,change_notes,certified_compatible,creator_count,featured_collection_count,visitors_count,favorites_count,accepted,class_count,item_slot_count,class_scout,class_soldier,class_pyro,class_demoman,class_heavy,class_engineer,class_medic,class_sniper,class_spy,item_slot_headgear,item_slot_misc,item_slot_taunt,item_slot_weapon,is_medal,passed_time,posted_year,updated_year,posted_month,updated_month,media_count,submitted_by_submitter,v1,v2,v3,v4,v5,v6,v7,v8
0,-0.294358,0.405502,0,6,0,0,0.024026,3.444413,1,0.030408,0.644689,0.011062,0.189144,1,5,1,1,0,0,1,1,1,0,1,0,0,1,0,0,0,3.255625,2012,2016,9,3,6,2.806383,0.268692,154.0,5492.0,35.432258,0.857143,1,0,1
1,-0.294358,-0.111478,0,4,0,0,-0.367531,1.475342,1,0.030408,0.017785,-0.251927,-0.195062,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,2.86651,2012,2015,9,10,4,2.806383,0.265896,40.0,2218.0,54.097561,0.8,1,1,1
2,-0.294358,0.786433,0,2,0,0,-0.110967,-0.493729,0,1.56526,0.508406,0.526197,0.838528,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,-0.302944,2012,2012,9,9,2,0.565009,0.116414,238.0,11905.0,49.811715,0.666667,0,0,1
3,-0.294358,0.781899,0,8,0,0,-0.054558,0.818985,0,0.030408,0.426636,0.380003,0.52363,1,9,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,-0.275498,2012,2012,9,10,8,0.419228,0.14,237.0,10085.0,42.37395,0.888889,1,1,1
4,0.337778,3.212608,0,0,0,0,-0.831642,-0.493729,0,-0.737019,-0.245697,2.236754,-0.302037,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-0.302944,2012,2012,2,2,0,-0.819906,0.166667,193.25,8300.0,42.894057,0.0,1,1,1


In [11]:
# Logistic regression with class 1 weighted 4x relative to class 0.
# fit() returns the estimator itself, so construction and training are chained.
lg1 = LogisticRegression(
    class_weight={0: 1, 1: 4},
    max_iter=10000000,
    random_state=13,
).fit(X_train, y_train)

y_pred = lg1.predict(X_test)

# Report the confusion matrix followed by the scalar test-set metrics.
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

Confusion Matrix: 
[[6109  285]
 [ 193  319]]
Accuracy Score: 0.9307848247900377
Recall score: 0.623046875
F1 score: 0.57168458781362


In [12]:
# Cross-validated logistic regression with the same 1:4 class weighting.
# NOTE(review): no `scoring` is passed here, so the estimator's default
# selection metric is used, unlike the F1-scored grid searches — confirm
# this is intended.
lg2 = LogisticRegressionCV(
    class_weight={0: 1, 1: 4},
    max_iter=10000000,
    random_state=13,
).fit(X_train, y_train)

y_pred = lg2.predict(X_test)

# Report the confusion matrix followed by the scalar test-set metrics.
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

Confusion Matrix: 
[[6133  261]
 [ 200  312]]
Accuracy Score: 0.9332464523602665
Recall score: 0.609375
F1 score: 0.5751152073732718


In [13]:
# Decision tree tuned by grid search over depth and split size, scored by F1.
tree = DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 4})
tree_param_grid = {
    'max_depth': range(2, 20, 2),
    'min_samples_split': range(2, 30, 2),
}
tree_grid = GridSearchCV(
    tree,
    tree_param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
tree_grid.fit(X_train, y_train)

# Selected hyper-parameters and their mean cross-validated F1.
print(tree_grid.best_params_)
print(tree_grid.best_score_)

# predict() on the search object delegates to the refit best estimator.
y_pred = tree_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'max_depth': 10, 'min_samples_split': 4}
0.6348376585222792
Confusion Matrix: 
[[6087  307]
 [ 165  347]]
Accuracy Score: 0.9316536345207066
Recall score: 0.677734375
F1 score: 0.5951972555746141


In [14]:
# Extra-trees ensemble tuned over the same depth / split-size grid, scored by F1.
tree2 = ExtraTreesClassifier(random_state=42, class_weight={0: 1, 1: 4})
tree2_param_grid = {
    'max_depth': range(2, 20, 2),
    'min_samples_split': range(2, 30, 2),
}
tree2_grid = GridSearchCV(
    tree2,
    tree2_param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
tree2_grid.fit(X_train, y_train)

# Selected hyper-parameters and their mean cross-validated F1.
print(tree2_grid.best_params_)
print(tree2_grid.best_score_)

y_pred = tree2_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'max_depth': 18, 'min_samples_split': 12}
0.6328772675503516
Confusion Matrix: 
[[6323   71]
 [ 241  271]]
Accuracy Score: 0.9548218940052129
Recall score: 0.529296875
F1 score: 0.6346604215456675


In [15]:
# Random forest with the number of trees tuned by grid search, scored by F1.
rand_forest = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 4})
rand_forest_param_grid = {'n_estimators': range(3, 70, 3)}
rand_forest_grid = GridSearchCV(
    rand_forest,
    rand_forest_param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
)
rand_forest_grid.fit(X_train, y_train)

# Selected forest size and its mean cross-validated F1.
print(rand_forest_grid.best_params_)
print(rand_forest_grid.best_score_)

y_pred = rand_forest_grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
for metric_label, metric_fn in (
    ('Accuracy Score', accuracy_score),
    ('Recall score', recall_score),
    ('F1 score', f1_score),
):
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

{'n_estimators': 57}
0.6394015758165789
Confusion Matrix: 
[[6377   17]
 [ 267  245]]
Accuracy Score: 0.9588763394150015
Recall score: 0.478515625
F1 score: 0.6330749354005168


In [16]:
# Soft-voting ensemble of the logistic regression, the tuned decision tree and
# the tuned random forest (class probabilities are averaged across members).
#
# VotingClassifier clones and refits every estimator it receives. Handing it the
# GridSearchCV wrappers would therefore repeat both full hyper-parameter
# searches inside voting.fit(), each spawning its own n_jobs=-1 pool under the
# ensemble's pool. Passing the already-selected best_estimator_ objects refits
# only the winning configurations on X_train. With the fixed seeds and data
# used here, this is expected to yield the same ensemble at a fraction of the cost.
voting = VotingClassifier(
    estimators=[
        ('lg1', lg1),
        ('tree', tree_grid.best_estimator_),
        ('rand_forest_grid', rand_forest_grid.best_estimator_),
    ],
    voting='soft',
    n_jobs=-1,
)
voting.fit(X_train, y_train)

y_pred = voting.predict(X_test)

# Report the confusion matrix followed by the scalar test-set metrics.
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')

Confusion Matrix: 
[[6291  103]
 [ 194  318]]
Accuracy Score: 0.9569939183318853
Recall score: 0.62109375
F1 score: 0.6816720257234727
