In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import joblib

In [2]:
# Load the prepared dataset and separate the target column from the predictors.
df = pd.read_csv('analyzed_data.csv')
y = df['depression']
x = df.drop(columns = 'depression')

# Standardise every predictor column, keeping the original column names.
# NOTE(review): the scaler is fit on all rows before any cross-validation or
# train/test split performed later in this notebook, so held-out rows
# contribute to the scaling statistics -- consider scaling inside a Pipeline.
sc_x = pd.DataFrame(
    StandardScaler().fit_transform(x),
    columns = x.columns,
)

In [3]:
def alg_evaluation(x, y, type = 'binary', cv = 5, random_state = None):
    """Rank a fixed set of classifiers by their mean cross-validated F1 score.

    Parameters
    ----------
    x : feature table, passed unchanged to ``cross_val_score``.
    y : target labels.
    type : str, default 'binary'
        'binary' scores with ``'f1'``; any other value scores with
        ``'f1_macro'``. (The name shadows the builtin ``type`` but is kept so
        existing keyword callers keep working.)
    cv : default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
    random_state : default None
        Seed handed to the estimators whose default fitting is randomised
        (decision tree, random forest, gradient boosting). ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``algorithm`` and ``mean_f1_score``, sorted from best to worst
        with a fresh 0..n-1 index.
    """
    score_type = 'f1' if type == 'binary' else 'f1_macro'

    # Insertion order of this dict defines the evaluation order, so there is
    # no separate key list to keep in sync.
    alg_dict = {'lr': LogisticRegression(),
                'svm': SVC(),
                'tree': DecisionTreeClassifier(random_state = random_state),
                'knn': KNeighborsClassifier(),
                'nb': GaussianNB(),
                'rf': RandomForestClassifier(random_state = random_state),
                'gb': GradientBoostingClassifier(random_state = random_state)}

    mean_scores = {
        name: cross_val_score(estimator, x, y, cv = cv, scoring = score_type).mean()
        for name, estimator in alg_dict.items()
    }

    eval_df = pd.DataFrame({'algorithm': list(mean_scores.keys()),
                            'mean_f1_score': list(mean_scores.values())})
    eval_df = eval_df.sort_values(by = 'mean_f1_score', ascending = False).reset_index(drop = True)
    return eval_df

In [4]:
# Compare all candidate algorithms on the standardised features (binary F1).
alg_evaluation(x = sc_x, y = y, type = 'binary')

Unnamed: 0,algorithm,mean_f1_score
0,lr,0.902586
1,svm,0.833003
2,gb,0.793686
3,rf,0.710445
4,nb,0.682967
5,knn,0.675496
6,tree,0.657444


In [5]:
def _macro_f1(conf_matrix):
    """Return the unweighted mean of per-class F1 from a square confusion matrix.

    Rows are true classes and columns are predicted classes. A class whose F1
    is undefined (it never occurs and is never predicted) contributes 0
    instead of NaN.
    """
    conf_matrix = np.asarray(conf_matrix, dtype = float)
    true_pos = np.diag(conf_matrix)
    predicted = conf_matrix.sum(axis = 0)   # column totals: times each class was predicted
    actual = conf_matrix.sum(axis = 1)      # row totals: times each class truly occurred

    # With P = TP/predicted and R = TP/actual, 2PR/(P+R) reduces to
    # 2*TP/(predicted + actual); this form only divides by zero when the class
    # is absent from both the labels and the predictions.
    denom = predicted + actual
    f1 = np.divide(2 * true_pos, denom, out = np.zeros_like(true_pos), where = denom > 0)
    return f1.mean()


def train_test_evaluation(x, y, model, n_rounds, test_size = 0.25, random_state = None):
    """Summarise macro-averaged F1 over repeated random train/test splits.

    Parameters
    ----------
    x : feature table.
    y : pandas Series of class labels (``y.unique()`` is used to list classes).
    model : estimator exposing ``fit`` and ``predict``; it is refit in place on
        every round, so it is left fitted on the last training split.
    n_rounds : int
        Number of independent random splits to evaluate.
    test_size : default 0.25
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` summary of the per-round scores in column ``avg_f1``.
    """
    class_list = np.sort(y.unique())

    # One shared RandomState makes the whole run reproducible while still
    # giving every round a different split.
    rng = None if random_state is None else np.random.RandomState(random_state)

    f1_list = []

    for _ in range(n_rounds):
        tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size = test_size, random_state = rng)
        model.fit(tr_x, tr_y)
        yhat = model.predict(te_x)

        # Passing labels pins the matrix layout to class_list order, so the
        # matrix is addressed by position rather than by label value and works
        # for labels that are not 0..k-1.
        conf_matrix = confusion_matrix(te_y, yhat, labels = class_list)
        f1_list.append(_macro_f1(conf_matrix))

    eval_df = pd.DataFrame({'avg_f1': f1_list})
    return eval_df.describe()

In [6]:
# Repeated hold-out check of logistic regression over 10 random splits.
cm = LogisticRegression()
train_test_evaluation(sc_x, y, model = cm, n_rounds = 10)

Unnamed: 0,avg_f1
count,10.0
mean,0.947965
std,0.009873
min,0.934978
25%,0.94031
50%,0.947741
75%,0.952188
max,0.966767


In [7]:
# Out-of-fold predictions from 5-fold cross-validation, summarised per class.
yhat = cross_val_predict(estimator = cm, X = sc_x, y = y, cv = 5)
print(classification_report(y_true = y, y_pred = yhat))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1851
           1       0.94      0.87      0.90       203

    accuracy                           0.98      2054
   macro avg       0.96      0.93      0.95      2054
weighted avg       0.98      0.98      0.98      2054



In [8]:
# Refit logistic regression on the full standardised dataset and persist it.
# NOTE(review): only the classifier is written to disk; the StandardScaler used
# to build sc_x is not saved in the cells visible here -- confirm how consumers
# of final_model.pkl are expected to scale new inputs.
final_model = LogisticRegression()
final_model.fit(sc_x, y)
joblib.dump(final_model, 'final_model.pkl')

['final_model.pkl']