# Install packages

First, install pandas, numpy, scikit-learn, matplotlib, and seaborn. The cell below then imports these packages into the notebook.

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet, Lasso
from sklearn.metrics import classification_report, mean_squared_error, confusion_matrix, plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Allow multiple outputs per cell: with ast_node_interactivity set to "all",
# every bare expression statement in a cell is displayed, not only the last one.
# Cells that list several expressions in a row rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Import the data and start exploring it

In [3]:

# Read the two survey CSV exports: the first becomes `data`, the second `alabamaData`.
# NOTE(review): judging by the file name, the first file is presumably the UAB-only
# extract — confirm against the data source.
data, alabamaData = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Project Retain Alabama/Survey Data and Info/UABformatChange.csv',
        '../Project Retain Alabama/Survey Data and Info/alabamaData.csv',
    )
)
# Summary statistics of the numeric columns of alabamaData.
alabamaData.describe()

Unnamed: 0,University,GradTime,Distance,Full,STEM,Alcareerlearn,StayAL,AL Resident,StateHScode,ALHS,...,I_c_sporting,I_c_outdoor,AL Resident.1,StateHScode.1,ALHS.1,Gender.1,Race Code.1,Age.1,Disability.1,FirstGEn.1
count,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,7855.0,...,8208.0,8208.0,5274.0,8208.0,7855.0,8208.0,8208.0,8208.0,8171.0,8185.0
mean,7.992446,2021.986111,0.316155,0.875975,0.306043,0.725268,1.630361,0.642544,7.299464,36.898027,...,3.467105,4.528996,1.0,7.299464,36.898027,1.33711,3.498294,2.486964,0.051524,0.383262
std,3.226881,0.831072,0.465003,0.329631,0.460876,0.446406,1.250832,0.47928,2.649684,43.085221,...,1.629699,0.997566,0.0,2.649684,43.085221,0.487471,1.025683,0.934468,0.221077,0.486211
min,1.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,6.0,2021.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,5.0,...,3.0,5.0,1.0,4.0,5.0,1.0,3.0,2.0,0.0,0.0
50%,8.0,2022.0,0.0,1.0,0.0,1.0,1.0,1.0,9.0,10.0,...,3.0,5.0,1.0,9.0,10.0,1.0,4.0,2.0,0.0,0.0
75%,10.0,2023.0,1.0,1.0,1.0,1.0,3.0,1.0,9.0,99.0,...,5.0,5.0,1.0,9.0,99.0,2.0,4.0,3.0,0.0,1.0
max,14.0,2023.0,1.0,1.0,1.0,1.0,3.0,1.0,10.0,99.0,...,5.0,5.0,1.0,10.0,99.0,3.0,6.0,7.0,1.0,1.0


Some commands you could start with:
- data.describe()
- data.shape
- data.isnull().sum()
- data['column name'].value_counts()

# Transform the Data for Regression

In [4]:
# Replace every NaN in both frames with 0.
# np.nan is used because the upper-case alias np.NAN was removed in NumPy 2.0
# and raises AttributeError there.
# NOTE(review): after this step a missing answer is indistinguishable from a real 0
# in the affected columns — confirm that is acceptable for the model.
data = data.replace(np.nan, 0)
alabamaData = alabamaData.replace(np.nan, 0)
# Keep only the first occurrence of each column label.
# NOTE(review): columns that pandas already renamed on load (e.g. 'AL Resident.1')
# have distinct labels, so this mask does not remove them.
data = data.loc[:, ~data.columns.duplicated()]
alabamaData = alabamaData.loc[:, ~alabamaData.columns.duplicated()]

In [5]:
# One-hot encode the categorical columns of both frames with pd.get_dummies.
# Disabled single-column example:
# pd.get_dummies(data.University, prefix='University Code')
categorical_columns = ['Race', 'StateHS', 'Major', 'Major Code', 'Major Code Revised']
readData = pd.get_dummies(data, columns=categorical_columns)
readAlabamaData = pd.get_dummies(alabamaData, columns=categorical_columns)

# Display the encoded `data` frame, then summary statistics of the encoded alabamaData frame.
readData
readAlabamaData.describe()
# Disabled scratch lines kept from the original cell:
# data = pd.concat([data, rd], axis=1)
# data
# print(data)

Unnamed: 0,University,GradTime,FTPT,Distance,Full,STEM,Alcareerlearn,StayAL,AL Resident,StateHScode,...,"Major Code_Other, Social Sciences","Major Code_Science, Technology, Engineering and Mathematics","Major Code_Transportation, Distribution and Logistics","Major Code Revised_Arts, A/V Technology and Communications",Major Code Revised_Business Management and Administration,Major Code Revised_Education and Training,Major Code Revised_Health Sciences,Major Code Revised_Human Services,"Major Code Revised_Law, Public Safety, Public Administration","Major Code Revised_Science, Technology, Engineering and Mathematics"
0,9,2021,Full-time Distance Education,1,1,0,1,0,1,9,...,0,0,0,1,0,0,0,0,0,0
1,9,2021,Full-time Distance Education,1,1,0,0,3,1,9,...,0,0,0,1,0,0,0,0,0,0
2,9,2022,Full-time Distance Education,1,1,0,1,0,1,9,...,0,0,0,1,0,0,0,0,0,0
3,9,2022,Full-time Distance Education,1,1,0,1,1,1,9,...,0,0,0,1,0,0,0,0,0,0
4,9,2022,Full-time Distance Education,1,1,0,1,3,1,9,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201,9,2022,Full-time On-Campus,0,1,0,1,0,0,3,...,0,0,0,0,0,0,1,0,0,0
1202,9,2023,Full-time On-Campus,0,1,0,0,1,0,10,...,0,0,0,0,0,0,1,0,0,0
1203,9,2023,Full-time On-Campus,0,1,0,1,3,0,10,...,0,0,0,0,0,0,1,0,0,0
1204,9,2021,Part-time Distance Education,1,0,1,0,3,0,10,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,University,GradTime,Distance,Full,STEM,Alcareerlearn,StayAL,AL Resident,StateHScode,ALHS,...,"Major Code_Other, Social Sciences","Major Code_Science, Technology, Engineering and Mathematics","Major Code_Transportation, Distribution and Logistics","Major Code Revised_Arts, A/V Technology and Communications",Major Code Revised_Business Management and Administration,Major Code Revised_Education and Training,Major Code Revised_Health Sciences,Major Code Revised_Human Services,"Major Code Revised_Law, Public Safety, Public Administration","Major Code Revised_Science, Technology, Engineering and Mathematics"
count,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,...,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0,8208.0
mean,7.992446,2021.986111,0.316155,0.875975,0.306043,0.725268,1.630361,0.642544,7.299464,35.31116,...,0.009259,0.224415,0.00999,0.076633,0.187135,0.144737,0.145833,0.126096,0.048002,0.271564
std,3.226881,0.831072,0.465003,0.329631,0.460876,0.446406,1.250832,0.47928,2.649684,42.808088,...,0.095784,0.417222,0.099457,0.266024,0.390043,0.351857,0.352961,0.331978,0.213783,0.444793
min,1.0,2021.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,2021.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,2022.0,0.0,1.0,0.0,1.0,1.0,1.0,9.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,2023.0,1.0,1.0,1.0,1.0,3.0,1.0,9.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,14.0,2023.0,1.0,1.0,1.0,1.0,3.0,1.0,10.0,99.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Tally how often each StayAL code occurs, first in readData and then in alabamaData.
# Both tallies are shown because the notebook displays every expression in a cell.
# NOTE(review): the recorded output lists only the codes 0, 1 and 3 — their meaning
# is not documented in this notebook; confirm against the survey codebook.
readData['StayAL'].value_counts()
alabamaData['StayAL'].value_counts()

3    539
1    383
0    284
Name: StayAL, dtype: int64

3    3546
1    2744
0    1918
Name: StayAL, dtype: int64

In [None]:
# Disabled string clean-up snippet, kept for reference: it casts every column of
# `data` to str and replaces '|' with ','.
# for i in list(data.columns):
#     data[i] = data[i].apply(lambda x: str(x)).apply(lambda x: x.replace('|' , ','))
# NOTE(review): if re-enabled, the line below would apply the per-column mask to rows;
# the column form is data.loc[:, ~data.columns.duplicated()].
# data = data[~data.columns.duplicated()]
# Summary statistics of alabamaData.
alabamaData.describe()

In [None]:
# Set the pandas display limits (up to 300 columns, at most 10 rows),
# then show the one-hot-encoded Alabama frame.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 10)
# readData
readAlabamaData
# Disabled column-rename example:
# data = data.rename(columns = {'_Virginia': 'Virginia'})
# data
# Regression

In [None]:
# Single-feature linear regression: first column of alabamaData -> second column.
# Feature and target must have the same number of rows. The original took x from
# alabamaData (8208 rows) and y from data (1206 rows), so LinearRegression.fit and
# plt.scatter both failed on the mismatched lengths. Both now come from alabamaData.
# NOTE(review): the positional columns 0 and 1 are kept from the original cell —
# confirm they are the intended feature and target.
x = alabamaData.iloc[:, 0].values.reshape(-1, 1)
y = alabamaData.iloc[:, 1].values.reshape(-1, 1)

lin_reg = LinearRegression()
lin_reg.fit(x, y)
# Show the fitted intercept and slope.
lin_reg.intercept_, lin_reg.coef_
# Predictions on the training inputs, used to draw the fitted line.
prayingToGod = lin_reg.predict(x)

# Scatter of the raw points with the fitted line on top.
plt.scatter(x, y)
plt.plot(x, prayingToGod, color='blue')
plt.xlabel(str(alabamaData.columns[0]))
plt.ylabel(str(alabamaData.columns[1]))
plt.show()

In [None]:
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



# Model Validation

# 