# Install packages

First thing is to install pandas, numpy, scikit-learn, matplotlib, and seaborn. Then below you see where we import the packages in our notebook.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet, Lasso
from sklearn.metrics import classification_report, mean_squared_error, confusion_matrix, plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler



# Import the data and start exploring it

In [42]:

data = pd.read_csv('../Project Retain Alabama/Survey Data and Info/UABformatChange.csv')
data.head()

Unnamed: 0,University,GradTime,FTPT,Distance,Full,STEM,Alcareerlearn,StayAL,AL Resident,StateHS,...,AL Resident.1,StateHS.1,StateHScode.1,ALHS.1,Gender.1,Race.1,Race Code.1,Age.1,Disability.1,FirstGEn.1
0,9,2021,Full-time Distance Education,1,1,0,1,0,1,Alabama,...,1.0,Alabama,9,3.0,1,Two or More Races,5,2,0.0,0.0
1,9,2021,Full-time Distance Education,1,1,0,0,3,1,Alabama,...,1.0,Alabama,9,3.0,1,Asian,1,3,0.0,0.0
2,9,2022,Full-time Distance Education,1,1,0,1,0,1,Alabama,...,1.0,Alabama,9,3.0,1,Black or African American,2,2,0.0,1.0
3,9,2022,Full-time Distance Education,1,1,0,1,1,1,Alabama,...,1.0,Alabama,9,10.0,1,White,4,2,0.0,0.0
4,9,2022,Full-time Distance Education,1,1,0,1,3,1,Alabama,...,1.0,Alabama,9,3.0,1,Other or Unknown,6,2,0.0,1.0


Some commands you could start with : 
- data.describe()
- data.shape
- data.isnull().sum()
- data['column name'].value_counts()

# Transform the Data for Regression

In [51]:
# Use get dummies for one hot encoding 
# pd.get_dummies(data.University, prefix='University Code')
rd = pd.get_dummies(data, dummy_na=True)
rd

# print(data)

Unnamed: 0,University,GradTime,Distance,Full,STEM,Alcareerlearn,StayAL,AL Resident,StateHScode,ALHS,...,StateHS.1_nan,Race.1_American Indian or Alaska Native,Race.1_Asian,Race.1_Black or African American,Race.1_Hispanic or Latino,Race.1_Native Hawaiian or Other Pacific Islander,Race.1_Other or Unknown,Race.1_Two or More Races,Race.1_White,Race.1_nan
0,9,2021,1,1,0,1,0,1,9,3.0,...,0,0,0,0,0,0,0,1,0,0
1,9,2021,1,1,0,0,3,1,9,3.0,...,0,0,1,0,0,0,0,0,0,0
2,9,2022,1,1,0,1,0,1,9,3.0,...,0,0,0,1,0,0,0,0,0,0
3,9,2022,1,1,0,1,1,1,9,10.0,...,0,0,0,0,0,0,0,0,1,0
4,9,2022,1,1,0,1,3,1,9,3.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201,9,2022,0,1,0,1,0,0,3,99.0,...,0,0,1,0,0,0,0,0,0,0
1202,9,2023,0,1,0,0,1,0,10,99.0,...,0,0,0,0,0,0,0,0,1,0
1203,9,2023,0,1,0,1,3,0,10,99.0,...,0,0,0,1,0,0,0,0,0,0
1204,9,2021,1,0,1,0,3,0,10,,...,0,0,0,0,0,0,0,0,1,0


In [18]:
# A useful function for changing strings 
for i in list(data.columns):
    data[i] = data[i].apply(lambda x: str(x)).apply(lambda x: x.replace('|' , ','))

In [15]:
# renaming columns

data = data.rename(columns = {'University_9_1': 'UAB'})
print(data)
# Regression

      University  GradTime                          FTPT  Distance  Full  \
0              9      2022  Full-time Distance Education       NaN     1   
1              9      2021  Full-time Distance Education       NaN     1   
2              9      2023  Full-time Distance Education       NaN     1   
3              9      2021  Full-time Distance Education       NaN     1   
4              9      2023  Part-time Distance Education       NaN     0   
...          ...       ...                           ...       ...   ...   
1202           9      2023  Part-time Distance Education       NaN     0   
1203           9      2022           Full-time On-Campus       NaN     0   
1204           9      2021           Full-time On-Campus       NaN     0   
1205           9      2022           Part-time On-Campus       NaN     0   
1206           9      2022           Full-time On-Campus       NaN     0   

      STEM  Alcareerlearn  StayAL  AL Resident  StateHS  ...  AL Resident.1  \
0      1

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X, y)
lin_reg.intercept_, lin_reg.coef_

# Model Validation

# 