In [92]:
import pandas as pd
import os
import numpy as numpy
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# File to Load (Remember to Change These)
train_data = "unweighted_ASQ_cleaned_train.csv"

In [93]:
df_train_clean = pd.read_csv(train_data)

In [94]:
df_train_clean[0:5]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,Autism_Diagnosis
0,1,0,1,1,1,1,0,1,1,1,0
1,0,0,0,0,0,0,0,0,0,1,0
2,1,1,1,1,1,1,0,0,1,1,1
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,1,1,0


In [95]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_train_clean['Autism_Diagnosis']

# Separate the X variable, the features
X = df_train_clean.drop(columns='Autism_Diagnosis')

In [96]:
### Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [97]:
y.value_counts()

0    615
1    185
Name: Autism_Diagnosis, dtype: int64

In [98]:
string_columns = df_train_clean.select_dtypes(include='object')

columns_with_strings = string_columns.columns

print(columns_with_strings)

Index([], dtype='object')


In [99]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [100]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [101]:
# Fit the model using training data
logistic_regression_model = LogisticRegression(random_state=1)

logistic_regression_model.fit(X_train, y_train)

In [102]:
# Make a prediction using the Training data
training_predictions = logistic_regression_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [103]:
import pickle

with open("unweighted_ASQ_model.pkl", "wb") as f:
    pickle.dump(logistic_regression_model, f)

In [104]:
with open("unweighted_ASQ_model.pkl", "rb") as f:
    model2 = pickle.load(f)

In [105]:
# Print the balanced_accuracy score of the model
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, testing_predictions)

0.85

In [106]:
# Generate a Training confusion matrix for the model
training_matrix = confusion_matrix(y_train, training_predictions)
print(training_matrix)

[[440  27]
 [ 41  92]]


In [107]:
# Generate a Testing confusion matrix for the model
testing_matrix = confusion_matrix(y_test, testing_predictions)
print(testing_matrix)

[[134  14]
 [ 16  36]]


In [108]:
# Print the Training classification report for the model
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       0.91      0.94      0.93       467
           1       0.77      0.69      0.73       133

    accuracy                           0.89       600
   macro avg       0.84      0.82      0.83       600
weighted avg       0.88      0.89      0.88       600



In [109]:
# Print the Testing classification report for the model
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       148
           1       0.72      0.69      0.71        52

    accuracy                           0.85       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.85      0.85      0.85       200



In [110]:
## Predict a Logistic Regression Model with Resampled Training Data

In [111]:
### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [112]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ROS = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled= ROS.fit_resample(X_train, y_train)

In [113]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

1    467
0    467
Name: Autism_Diagnosis, dtype: int64

In [114]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_resampled, y_resampled, random_state=1)

In [115]:
### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [116]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifierR = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifierR
# Fit the model using the resampled training data
logistic_regression_model = LogisticRegression(random_state=1)

logistic_regression_model.fit(X_train1, y_train1)

# Make a prediction using the Training data
training_predictions1 = logistic_regression_model.predict(X_train1)

# Make a prediction using the testing data
testing_predictions1 = logistic_regression_model.predict(X_test1)


In [117]:
##################

In [118]:
import pickle

with open("resampled_unweighted_ASQ_model.pkl", "wb") as f:
    pickle.dump(logistic_regression_model, f)

In [119]:
with open("resampled_unweighted_ASQ_model.pkl", "rb") as f:
    model2 = pickle.load(f)

In [120]:
### Step 3: Evaluate the model’s performance by doing the following:

#* Calculate the accuracy score of the model.

#* Generate a confusion matrix.

#* Print the classification report.

In [121]:
# Print the balanced_accuracy score of the model 
accuracy_score(y_test1, testing_predictions1)

0.8803418803418803

In [122]:
# Generate a Training confusion matrix for the model
training_matrix = confusion_matrix(y_train1, training_predictions1)
print(training_matrix)

[[294  61]
 [ 45 300]]


In [123]:
# Generate a Testing confusion matrix for the model
testing_matrix = confusion_matrix(y_test1, testing_predictions1)
print(testing_matrix)

[[100  12]
 [ 16 106]]


In [124]:
# Print the Training classification report for the model
training_report = classification_report(y_train1, training_predictions1)
print(training_report)

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       355
           1       0.83      0.87      0.85       345

    accuracy                           0.85       700
   macro avg       0.85      0.85      0.85       700
weighted avg       0.85      0.85      0.85       700



In [125]:
# Print the Testing classification report for the model
testing_report = classification_report(y_test1, testing_predictions1)
print(testing_report)

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       112
           1       0.90      0.87      0.88       122

    accuracy                           0.88       234
   macro avg       0.88      0.88      0.88       234
weighted avg       0.88      0.88      0.88       234

