## 1. Set up

### 1 - Drive

1 - Mount Drive

In [None]:
# Mount Google Drive at /content/gdrive so the project files stored there
# can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


2 - Move to the data folder

In [None]:
cd "gdrive/MyDrive/Projects/1 - Numericals/Autism Prediction Challenge/2 - Production/data"

/content/gdrive/MyDrive/Projects/1 - Numericals/Autism Prediction Challenge/2 - Production/data


### 2 - Libraries

In [None]:
# Load data
import pandas as pd
import numpy as np
import io
import os
import glob

# Meta
import time

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.lines import Line2D

# Split Data
from sklearn.model_selection import train_test_split

# Modeling
import statsmodels.api as sn
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score


## 3. Data

1 - List file names

In [None]:
ls

[0m[01;34mAutism-prediction[0m/     l1_clean_encoded.csv   sample_submission.csv  train.csv
autism-prediction.zip  l1_clean_unecoded.csv  test.csv


In [None]:
# Read the two cleaned data files: df_1 from the "encoded" CSV and df_2 from
# the "unencoded" CSV. The 'unecoded' spelling is the actual file name on disk.
df_1, df_2 = (
    pd.read_csv(csv_name)
    for csv_name in ('l1_clean_encoded.csv', 'l1_clean_unecoded.csv')
)


## 4. Logistic Regression

### 1 - Split data

1 - Split into features (X) and target (y)

In [None]:
# Separate the target from the predictors.
# 'Unnamed: 0' (presumably the index written out with the CSV) and 'ID' are row
# identifiers rather than measurements, so they are removed together with the
# target to keep them from being used as model features.
NON_FEATURE_COLUMNS = ['Class/ASD', 'Unnamed: 0', 'ID']
X = df_1.drop(columns=NON_FEATURE_COLUMNS)
y = df_1['Class/ASD']

2 - Split data into training and validation

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same on every run.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### 2 - Look at coefficients of the features

In [None]:
# Work on an independent (deep) copy so that changes made for the statsmodels
# fit can never write through to df_1, which X and y are built from.
# NOTE: despite its name, x_train_1 holds every row and column of df_1
# (target included), not just the training split.
x_train_1 = df_1.copy()

In [None]:
# Append a constant column of ones; it is selected by name as the intercept
# term when the Logit regressors are listed.
x_train_1 = x_train_1.assign(intercept=1)

In [None]:
# Show the available column names (the Logit regressor list is picked from these).
x_train_1.columns

Index(['Unnamed: 0', 'ID', 'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score',
       'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
       'age', 'gender', 'ethnicity', 'jaundice', 'fam_history_autism',
       'country', 'used_app_before', 'final_score', 'survey_done_by',
       'Class/ASD', 'intercept'],
      dtype='object')

In [None]:
# Regressors for the statsmodels fit: the constant term first, then every
# remaining column except the target and 'Unnamed: 0'.
# NOTE(review): 'ID' looks like a row identifier rather than a measurement --
# confirm whether it should stay in the model.
logit_columns = [
    'intercept', 'ID',
    'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
    'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
    'age', 'gender', 'ethnicity', 'jaundice', 'fam_history_autism',
    'country', 'used_app_before', 'final_score', 'survey_done_by',
]
logit_target = x_train_1['Class/ASD']
logit_model = sn.Logit(logit_target, x_train_1[logit_columns])

In [None]:
# Fit the Logit model and keep the results object for the summary and
# coefficient inspection that follow.
results = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.276997
         Iterations 8


In [None]:
# Display the fit summary: per-regressor coefficient, standard error, z,
# p-value and 95% confidence interval, plus overall fit statistics.
results.summary()

0,1,2,3
Dep. Variable:,Class/ASD,No. Observations:,800.0
Model:,Logit,Df Residuals:,779.0
Method:,MLE,Df Model:,20.0
Date:,"Sat, 19 Mar 2022",Pseudo R-squ.:,0.4878
Time:,06:31:46,Log-Likelihood:,-221.6
converged:,True,LL-Null:,-432.63
Covariance Type:,nonrobust,LLR p-value:,5.369e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-4.6761,0.877,-5.332,0.000,-6.395,-2.957
ID,0.0006,0.001,1.130,0.259,-0.000,0.002
A1_Score,0.2769,0.304,0.911,0.363,-0.319,0.873
A2_Score,0.3077,0.266,1.158,0.247,-0.213,0.828
A3_Score,0.8937,0.283,3.156,0.002,0.339,1.449
A4_Score,0.9784,0.319,3.067,0.002,0.353,1.604
A5_Score,0.5098,0.315,1.617,0.106,-0.108,1.128
A6_Score,0.8841,0.282,3.135,0.002,0.331,1.437
A7_Score,0.3354,0.272,1.233,0.218,-0.198,0.869


In [None]:
# Get coefficients of features.
# They are read from the fitted model instead of being re-typed from the
# rounded summary table, so they stay in sync with the fit and keep full
# precision. The intercept is dropped so that only the per-feature
# coefficients remain, in the same order as the regressors passed to Logit.
coefficient_list = results.params.drop('intercept').tolist()

In [None]:
# Feature names in column order, leaving out the target, the constant term
# and the 'Unnamed: 0' column.
feature_list = [
    column
    for column in x_train_1.columns
    if column not in ('Class/ASD', 'intercept', 'Unnamed: 0')
]

In [None]:
# exp(coefficient) for every feature, rounded to 4 decimal places
expof_list = [round(np.exp(coef), 4) for coef in coefficient_list]


In [None]:
# Pair each feature name with exp(coefficient), rounded to 4 decimal places.
# The pairs are rebuilt from coefficient_list rather than from the previous
# value of expof_list, so re-running this cell gives the same result instead
# of zipping the names against already-zipped tuples.
# zip() silently truncates to the shorter input, so check the lengths first.
assert len(feature_list) == len(coefficient_list), (
    'feature_list and coefficient_list must have the same length'
)
expof_list = [
    (feature, round(np.exp(coef), 4))
    for feature, coef in zip(feature_list, coefficient_list)
]

In [None]:
# Take the reciprocal of the expofs of age, country, used_app_before and
# survey_done_by, since they are less than 1 (their coefficients are negative)
expof_list

[('ID', 1.0006),
 ('A1_Score', 1.319),
 ('A2_Score', 1.3603),
 ('A3_Score', 2.4442),
 ('A4_Score', 2.6602),
 ('A5_Score', 1.665),
 ('A6_Score', 2.4208),
 ('A7_Score', 1.3985),
 ('A8_Score', 1.0519),
 ('A9_Score', 1.7614),
 ('A10_Score', 2.1255),
 ('age', 0.9953),
 ('gender', 1.1874),
 ('ethnicity', 1.0505),
 ('jaundice', 1.6371),
 ('fam_history_autism', 1.9246),
 ('country', 0.9977),
 ('used_app_before', 0.5249),
 ('final_score', 1.1014),
 ('survey_done_by', 0.8098)]

In [None]:
# Reciprocal of exp(coefficient) for 'age'. The coefficient is read from the
# fitted model rather than hand-copied from the rounded summary table.
expof_age = 1 / np.exp(results.params['age'])
print(expof_age)

1.0047110623241844


In [None]:
# Reciprocal of exp(coefficient) for 'country'. The coefficient is read from
# the fitted model rather than hand-copied from the rounded summary table.
expof_country = 1 / np.exp(results.params['country'])
print(expof_country)

1.002302647029


In [None]:
# Reciprocal of exp(coefficient) for 'used_app_before'. The coefficient is read
# from the fitted model rather than hand-copied from the rounded summary table.
expof_used_app_before = 1 / np.exp(results.params['used_app_before'])
print(expof_used_app_before)

1.9052247869188474


In [None]:
# Reciprocal of exp(coefficient) for 'survey_done_by'. The coefficient is read
# from the fitted model rather than hand-copied from the rounded summary table.
expof_survey_done_by = 1 / np.exp(results.params['survey_done_by'])
print(expof_survey_done_by)

1.2349123550613943


### 3 - Model data

In [None]:
# Fit scikit-learn's logistic regression on the training split and predict
# class labels for the validation split.
# With the default max_iter the solver stopped at its iteration limit before
# converging, so allow more iterations.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
predictions = model.predict(x_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Report precision, recall and accuracy on the validation split, each rounded
# to 2 decimal places.
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('Accuracy score:', accuracy_score),
):
    print(metric_label, round(metric_fn(y_valid, predictions), 2))

Precision: 0.73
Recall: 0.62
Accuracy score: 0.85


In [None]:
# Confusion matrix for the validation split (rows: true class, columns: predicted class).
confusion_matrix(y_valid, predictions)

array([[112,   9],
       [ 15,  24]])

In [None]:
# ROC AUC on the validation split, scored with the predicted probability of
# the positive class (second column of predict_proba).
roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])

0.9139648230557321

In [None]:
# Same ROC AUC computed from the raw decision-function scores; it yields the
# same value as the probability-based score.
roc_auc_score(y_valid, model.decision_function(x_valid))

0.9139648230557321