In [39]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score,make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import datetime
import time

pd.set_option('display.max_columns', None)


In [51]:
final_data_df = pd.read_csv('final_data.csv')

final_data_df['v1'] = (final_data_df['featured_collection_count'] / final_data_df['favorites_count']).fillna(0)
final_data_df['v2'] = (final_data_df['comments_count'] / (final_data_df['discussions_count']+1)).fillna(0)
final_data_df['v3'] = (final_data_df['visitors_count'] / (final_data_df['discussions_count']+1)).fillna(0)
final_data_df['v4'] = (final_data_df['visitors_count'] / (final_data_df['comments_count']+1)).fillna(0)
final_data_df['v5'] = (final_data_df['screenshot_count'] / (final_data_df['media_count']+1)).fillna(0)

final_data_df['posted_time'] = final_data_df['posted_time'].map(lambda ms:datetime.datetime.fromtimestamp(ms).year)
final_data_df['updated_time'] = final_data_df['updated_time'].map(lambda ms:datetime.datetime.fromtimestamp(ms).year)
final_data_df['passed_time'] = pd.cut(final_data_df['passed_time'],bins=10,labels=range(0,10))

#final_data_df['v6'] = final_data_df['class_spy'] & final_data_df['class_medic'] 
#final_data_df['v7'] = final_data_df['class_engineer'] & final_data_df['class_spy']
#final_data_df['v8'] = final_data_df['item_slot_weapon'] & final_data_df['item_slot_headgear']

X = final_data_df.drop(columns=['accepted']).copy()
y = final_data_df['accepted'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

final_data_df.head()

Unnamed: 0,discussions_count,comments_count,movie_count,screenshot_count,awards_count,link_count,file_size_mb,change_notes,certified_compatible,creator_count,featured_collection_count,visitors_count,favorites_count,accepted,posted_time,updated_time,passed_time,media_count,class_count,item_slot_count,submitted_by_submitter,class_scout,class_soldier,class_pyro,class_demoman,class_heavy,class_engineer,class_medic,class_sniper,class_spy,item_slot_headgear,item_slot_misc,item_slot_taunt,item_slot_weapon,v1,v2,v3,v4,v5
0,0,154,0,6,0,0,4.971,7,1,2,115,5492,428,1,2012,2016,3,6,5,1,380,1,0,0,1,1,1,0,1,0,0,1,0,0,0.268692,154.0,5492.0,35.432258,0.857143
1,0,40,0,4,0,0,2.958,4,1,2,46,2218,173,1,2012,2015,2,4,1,1,380,1,0,0,0,0,0,0,0,0,0,1,0,0,0.265896,40.0,2218.0,54.097561,0.8
2,0,238,0,2,0,0,4.277,1,0,4,100,11905,859,1,2012,2012,0,2,1,1,153,0,0,0,0,0,0,0,0,1,0,1,0,0,0.116414,238.0,11905.0,49.811715,0.666667
3,0,237,0,8,0,0,4.567,3,0,2,91,10085,650,1,2012,2012,0,8,9,1,114,1,1,1,1,1,1,1,1,1,0,1,0,0,0.14,237.0,10085.0,42.37395,0.888889
4,3,773,0,0,0,0,0.572,1,0,1,17,33200,102,1,2012,2012,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.166667,193.25,8300.0,42.894057,0.0


In [52]:
lg1 = LogisticRegression(max_iter=1000000, random_state=13,class_weight={0: 1, 1: 4})
lg1.fit(X_train, y_train)

y_pred = lg1.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')
print(f'F1 score: {f1_score(y_test,y_pred)}')

Confusion Matrix: 
[[5803  294]
 [ 210  323]]
Accuracy Score: 0.9239819004524887
Recall score: 0.6060037523452158
F1 score: 0.5617391304347826


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
tree = DecisionTreeClassifier()
grid = GridSearchCV(tree, dict(max_depth=range(2,100,2), min_samples_split=range(5,100,5),), scoring=make_scorer(f1_score))
grid.fit(X_train, y_train)

print(grid.best_score_)

y_pred = grid.predict(X_test)

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')
print(f'F1 score: {f1_score(y_test,y_pred)}')


0.6314209488868289
Confusion Matrix: 
[[5926  171]
 [ 224  309]]
Accuracy Score: 0.9404223227752639
Recall score: 0.5797373358348968
F1 score: 0.6100691016781837


In [38]:
grid.best_params_

{'max_depth': 14, 'min_samples_split': 5}