In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

In [7]:
df = pd.read_csv('temprament.csv')
df = df.dropna()
descriptions = df.iloc[0]
df = df.iloc[1:]

X = df.iloc[:,11:73]
y = df['adhd']
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Logistic Regression

In [10]:
model = LogisticRegression(class_weight={0:1258,1:9073})
model.fit(X_train,y_train)

importance = model.coef_[0]

coefficients = []
for i in range(len(importance)):
    coefficients.append((X.columns[i],importance[i]))
    
to_show = 5  

sorted_coef = sorted(coefficients,key=lambda tup:abs(tup[1]))
bottom_5 = sorted_coef[:to_show]
top_5 = sorted_coef[-to_show:]

print(f'*** The {to_show} least predictive measures were (odds ratio): ***')
for index,(label,value) in enumerate(bottom_5):
    print(f"{len(coefficients)-index}. {descriptions[label].split(' / ')[0]}: {round(np.exp(value),3)}")
print(f'\n*** The {to_show} most predictive measures were (odds ratio): ***')
for index,(label,value) in enumerate(top_5):
    print(f"{to_show-index}. {descriptions[label].split('. / ')[0]}: {round(np.exp(value),3)}")

LogisticRegression(class_weight={0: 1258, 1: 9073})

In [12]:
score = model.score(X_test,y_test)
print(score)

0.7276245766811804


In [13]:
scores = []

for j in range(1,len(sorted_coef)+1):
    
    result = []
    
    model = LogisticRegression(class_weight={0:1140,1:9275})
    labels = [entry[0] for entry in sorted_coef]
    subset = labels[-j:]

    X = df[subset]
    y = df['adhd']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)

    result.append(j)
    result.append(accuracy_score(y_test,y_pred))
    result.append(roc_auc_score(y_test, y_pred))
    result.append(recall_score(y_test,y_pred))
    for i in range(2):
        for k in range(2):
            result.append(matrix[i][k])
    
    #print(f"Top {j} features:")
    
    scores.append(result)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [14]:
scores_df = pd.DataFrame(scores)
scores_df.columns = ['features','accuracy','auc','recall','tn','fp','fn','tp']



plt.plot(scores_df['auc'],color='black')
plt.xlabel('Number of Features')
plt.ylabel('Area Under Curve')
plt.show()

In [17]:
scores_weight = []

for i in np.arange(0,2,0.02):
    
    result = []
    
    model = LogisticRegression(class_weight={0:i,1:1})
    labels = [entry[0] for entry in sorted_coef]

    X = df.iloc[:,11:73]
    y = df['adhd']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)

    result.append(i)
    result.append(accuracy_score(y_test,y_pred))
    result.append(roc_auc_score(y_test, y_pred))
    result.append(recall_score(y_test,y_pred))
    for i in range(2):
        for k in range(2):
            result.append(matrix[i][k])
    
    #print(f"Top {j} features:")
    
    scores_weight.append(result)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [18]:
scores_weight_df = pd.DataFrame(scores_weight)
scores_weight_df.columns = ['i','accuracy','auc','recall','tn','fp','fn','tp']

plt.figure(figsize=(12,9))
plt.subplot(2,1,1)
plt.plot(np.arange(0,2,0.02),scores_weight_df['auc'],color='black')
plt.xlabel('Weighting Ratio (:1)')
plt.ylabel('Area Under Curve')

       i  accuracy       auc    recall    tn    fp   fn   tp
0   0.00  0.119981  0.500000  1.000000     0  1819    0  248
1   0.02  0.314949  0.599599  0.980916   394  1411    5  257
2   0.04  0.441703  0.663885  0.955102   679  1143   11  234
3   0.06  0.557329  0.708984  0.911877   914   892   23  238
4   0.08  0.613449  0.696850  0.808429  1057   749   50  211
..   ...       ...       ...       ...   ...   ...  ...  ...
95  1.90  0.880987  0.528844  0.063745  1805    11  235   16
96  1.92  0.877117  0.515440  0.035294  1804     8  246    9
97  1.94  0.891630  0.524297  0.052402  1831     7  217   12
98  1.96  0.879052  0.515726  0.035857  1808     8  242    9
99  1.98  0.876633  0.513907  0.031128  1804     6  249    8

[100 rows x 8 columns]


scores_weight_zoom = []

for i in np.arange(0,0.5,0.001):
    
    result = []
    
    model = LogisticRegression(class_weight={0:i,1:1})
    labels = [entry[0] for entry in sorted_coef]

    X = df.iloc[:,11:73]
    y = df['adhd']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)

    result.append(i)
    result.append(accuracy_score(y_test,y_pred))
    result.append(roc_auc_score(y_test, y_pred))
    result.append(recall_score(y_test,y_pred))
    for i in range(2):
        for k in range(2):
            result.append(matrix[i][k])
    
    #print(f"Top {j} features:")
    
    scores_weight_zoom.append(result)

scores_weight_zoom_df = pd.DataFrame(scores_weight_zoom)
scores_weight_zoom_df.columns = ['i','accuracy','auc','recall','tn','fp','fn','tp']
print(scores_weight_zoom_df)

plt.subplot(2,1,2)
plt.plot(np.arange(0,0.5,0.001),scores_weight_zoom_df['auc'],color='black')
plt.xlabel('Weighting Ratio (:1)')
plt.ylabel('Area Under Curve')
plt.savefig('Figures/log_reg_training')
plt.show()

In [23]:
print(scores_weight_zoom_df.iloc[scores_weight_zoom_df.idxmax()[2]])

i              0.131000
accuracy       0.748428
auc            0.766835
recall         0.792388
tn          1318.000000
fp           460.000000
fn            60.000000
tp           229.000000
Name: 131, dtype: float64


# Support Vector Machine

In [24]:
svm_model = svm.SVC(class_weight={0:0.113,1:1})
svm_model.fit(X_train,y_train)

SVC(class_weight={0: 0.113, 1: 1})

In [25]:
svm_pred = svm_model.predict(X_test)
svm_matrix = confusion_matrix(y_test, svm_pred)

In [26]:
print(svm_matrix)

[[1257  532]
 [  58  220]]


In [27]:
svm_model.score(X_test,y_test)

0.7145621673923561

# Decision Forest

In [28]:
forest = RandomForestClassifier()
forest.fit(X_train,y_train)

RandomForestClassifier()

In [29]:
forest_pred = forest.predict(X_test)
forest_matrix = confusion_matrix(y_test, forest_pred)

In [30]:
print(forest_matrix)

[[1771   18]
 [ 266   12]]


In [31]:
forest.score(X_test,y_test)

0.8626028059990324