In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import numpy as numpy
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# File to Load (Remember to Change These)
train_data = "cleaned_train.csv"

In [2]:
df_train_clean = pd.read_csv(train_data)

In [3]:
df_train_clean[0:5]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,country_of_residence_United States,country_of_residence_Viet Nam,used_app_before_no,used_app_before_yes,test_taker_?,test_taker_Health care professional,test_taker_Others,test_taker_Parent,test_taker_Relative,test_taker_Self
0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_train_clean['Autism_Diagnosis']

# Separate the X variable, the features
X = df_train_clean.drop(columns='Autism_Diagnosis')

In [5]:
### Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
y.value_counts()

0.0    615
1.0    185
Name: Autism_Diagnosis, dtype: int64

In [7]:
string_columns = df_train_clean.select_dtypes(include='object')

columns_with_strings = string_columns.columns

print(columns_with_strings)

Index([], dtype='object')


In [8]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [10]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [11]:
# Fit the model using training data
logistic_regression_model = LogisticRegression(random_state=1)

logistic_regression_model.fit(X_train, y_train)

In [12]:
# Make a prediction using the Training data
training_predictions = logistic_regression_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [13]:
import pickle

with open("Full_model.pkl", "wb") as f:
    pickle.dump(logistic_regression_model, f)

In [14]:
with open("Full_model.pkl", "rb") as f:
    model2 = pickle.load(f)

In [15]:
# Print the balanced_accuracy score of the model
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, testing_predictions)

0.845

In [16]:
# Generate a Training confusion matrix for the model
training_matrix = confusion_matrix(y_train, training_predictions)
print(training_matrix)

[[447  20]
 [ 29 104]]


In [17]:
# Generate a Testing confusion matrix for the model
testing_matrix = confusion_matrix(y_test, testing_predictions)
print(testing_matrix)

[[134  14]
 [ 17  35]]


In [18]:
# Print the Training classification report for the model
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       467
         1.0       0.84      0.78      0.81       133

    accuracy                           0.92       600
   macro avg       0.89      0.87      0.88       600
weighted avg       0.92      0.92      0.92       600



In [19]:
# Print the Testing classification report for the model
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.89      0.91      0.90       148
         1.0       0.71      0.67      0.69        52

    accuracy                           0.84       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.84      0.84      0.84       200



In [20]:
## Predict a Logistic Regression Model with Resampled Training Data

In [21]:
### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.

In [22]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ROS = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled= ROS.fit_resample(X_train, y_train)

In [23]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

1.0    467
0.0    467
Name: Autism_Diagnosis, dtype: int64

In [24]:
# X_train1, X_test1, y_train1, y_test1 = train_test_split(X_resampled, y_resampled, random_state=1)

In [25]:
### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [27]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifierR = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifierR
# Fit the model using the resampled training data
logistic_regression_model = LogisticRegression(random_state=1)

logistic_regression_model.fit(X_resampled, y_resampled)

# Make a prediction using the Training data
training_predictions1 = logistic_regression_model.predict(X_resampled)

# Make a prediction using the testing data
testing_predictions1 = logistic_regression_model.predict(X_test)

In [28]:
import pickle

with open("resampled_Full_model.pkl", "wb") as f:
    pickle.dump(logistic_regression_model, f)

In [29]:
with open("resampled_Full_model.pkl", "rb") as f:
    model2 = pickle.load(f)

In [30]:
### Step 3: Evaluate the model’s performance by doing the following:

#* Calculate the accuracy score of the model.

#* Generate a confusion matrix.

#* Print the classification report.

In [32]:
# Print the balanced_accuracy score of the model
accuracy_score(y_test, testing_predictions1)

0.82

In [34]:
# Generate a Training confusion matrix for the model
training_matrix = confusion_matrix(y_resampled, training_predictions1)
print(training_matrix)

[[403  64]
 [ 63 404]]


In [36]:
# Generate a Testing confusion matrix for the model
testing_matrix = confusion_matrix(y_test, testing_predictions1)
print(testing_matrix)

[[126  22]
 [ 14  38]]


In [37]:
# Print the Training classification report for the model
training_report = classification_report(y_resampled, training_predictions1)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.86      0.86      0.86       467
         1.0       0.86      0.87      0.86       467

    accuracy                           0.86       934
   macro avg       0.86      0.86      0.86       934
weighted avg       0.86      0.86      0.86       934



In [38]:
# Print the Testing classification report for the model
testing_report = classification_report(y_test, testing_predictions1)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.88       148
         1.0       0.63      0.73      0.68        52

    accuracy                           0.82       200
   macro avg       0.77      0.79      0.78       200
weighted avg       0.83      0.82      0.82       200

