## Prepare Libraries and Class Term Data

In [None]:
import pandas as pd
import numpy as np

### Merge three years of program data

In [None]:
# Load the 2022-2023 CASPA export and keep only matriculated applicants.
df1 = pd.read_csv('Applied Stats Final Project/CASPA-2022-2023.csv')
# .copy() makes the filtered frame independent of the raw export, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df1 = df1.loc[df1['decision_code'] == 'Matriculated'].copy()
# Tag every row with its admissions cycle and graduating cohort.
df1['Cycle'] = '2022-2023'
df1['Cohort'] = 'Class of 2025'

In [None]:
# Load the FA23 grade export, keeping the two name columns and the term GPA.
merge1 = (
    pd.read_csv('Applied Stats Final Project/Student Grades-FA23.csv')
    .loc[:, ['Textbox11', 'Textbox12', 'TermGPA']]
    .rename(columns={'Textbox11': 'last_name', 'Textbox12': 'first_name'})
)

# Lower-case the join key on both sides so the match is case-insensitive.
df1['last_name'] = df1['last_name'].str.lower()
merge1['last_name'] = merge1['last_name'].str.lower()

# NOTE(review): joining on last_name alone pairs every CASPA row with every
# grade row that shares the surname -- confirm surnames are unique here.
merged1 = df1.merge(merge1, on='last_name', how='inner').drop_duplicates()

In [None]:
# Count missing values per column in the merged 2022-2023 data.
merged1.isna().sum()

In [None]:
# Load the 2023-2024 CASPA export and keep only matriculated applicants.
df2 = pd.read_csv('Applied Stats Final Project/CASPA-2023-2024.csv')
# .copy() makes the filtered frame independent of the raw export, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df2 = df2.loc[df2['decision_code'] == 'Matriculated'].copy()
# Tag every row with its admissions cycle and graduating cohort.
df2['Cycle'] = '2023-2024'
df2['Cohort'] = 'Class of 2026'

In [None]:
# Load the FA24 grade export, keeping the two name columns and the term GPA.
merge2 = (
    pd.read_csv('Applied Stats Final Project/Student Grades-FA24.csv')
    .loc[:, ['Textbox11', 'Textbox12', 'TermGPA']]
    .rename(columns={'Textbox11': 'last_name', 'Textbox12': 'first_name'})
)

# NOTE(review): this join is case-sensitive and uses last_name only, so rows
# sharing a surname are cross-matched -- confirm that suits this cohort.
merged2 = df2.merge(merge2, on='last_name', how='inner').drop_duplicates()

In [None]:
# Count missing values per column in the merged 2023-2024 data.
merged2.isna().sum()

In [None]:
# Load the 2021-2022 CASPA export and keep only matriculated applicants.
df3 = pd.read_csv('Applied Stats Final Project/CASPA-2021-2022.csv')
# .copy() makes the filtered frame independent of the raw export, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df3 = df3.loc[df3['decision_code'] == 'Matriculated'].copy()
# Tag every row with its admissions cycle and graduating cohort.
df3['Cycle'] = '2021-2022'
df3['Cohort'] = 'Class of 2024'

In [None]:
# List the columns of the 2021-2022 export.
df3.columns

In [None]:
# Remove the GPA columns listed below from the 2021-2022 frame.
df3 = df3.drop(
    [
        'cumulative_undergraduate_total_gpa',
        'post_baccalaureate_total_gpa',
        'cumulative_undergraduate_science_total_gpa',
        'prerequisite_recommended_coursework_gpa',
        'bcp_totals_gpa',
        'physics_gpa',
    ],
    axis=1,
)

In [None]:
# Load the FA22 grade export, keeping the two name columns and the term GPA.
merge3 = (
    pd.read_csv('Applied Stats Final Project/Student Grades-FA22.csv')
    .loc[:, ['Textbox11', 'Textbox12', 'TermGPA']]
    .rename(columns={'Textbox11': 'last_name', 'Textbox12': 'first_name'})
)

# Join on both name columns, then rename first_name to first_name_x so it
# matches the suffixed column produced by the other two merges.
merged3 = (
    df3.merge(merge3, on=['last_name', 'first_name'], how='inner')
    .drop_duplicates()
    .rename(columns={'first_name': 'first_name_x'})
)

In [None]:
# Stack the three cohort frames row-wise under a fresh 0..n-1 index.
df = pd.concat((merged1, merged2, merged3), ignore_index=True)

#### Remove name identifiers for confidentiality

In [None]:
# Drop the first- and last-name columns left over from the merges.
df = df.drop(['first_name_x', 'last_name', 'first_name_y'], axis=1)

In [None]:
# List the remaining columns after the name columns were dropped.
df.columns

In [None]:
# Confirm the shape (rows, columns) of the combined dataframe is as expected
df.shape

### Additional data cleaning, check for NA values, drop unneeded columns, drop duplicates, rename columns

In [None]:
# Count missing values per column in the combined frame.
df.isna().sum()

In [None]:
# Mean overall-science and overall-total GPA per cohort, with Cohort kept
# as an ordinary column.
df.groupby('Cohort', as_index=False)[['overall_science_total_gpa', 'overall_total_gpa']].mean()

In [None]:
# Drop three GPA columns that are not carried into the analysis.
df = df.drop(
    ['post_baccalaureate_total_gpa', 'physics_gpa', 'baccalaureate_science_total_gpa'],
    axis=1,
)

In [None]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [None]:
# Re-count missing values per column after de-duplication.
df.isna().sum()

In [None]:
# Keep only rows that have a term GPA.
df = df.dropna(subset=['TermGPA'])

In [None]:
# Re-count missing values per column after dropping rows without a term GPA.
df.isna().sum()

In [None]:
# Rename the term GPA column to FirstTermGPA.
df = df.rename({'TermGPA': 'FirstTermGPA'}, axis='columns')

In [None]:
# Re-count missing values per column after the rename.
df.isna().sum()

### Check data types

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Store the first-term GPA as a floating-point column.
df['FirstTermGPA'] = df['FirstTermGPA'].astype(float)

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# List the current columns (used to pick the hour columns filled below).
df.columns

### Fill final NAs

In [None]:
# Experience-hour totals: a missing total is treated as zero hours.
hours_columns = [
    'employment_experience_hours_total',
    'extracurricular_activities_experience_hours_total',
    'health_related_experience_experience_experience_hours_total',
    'leadership_experience_hours_total',
    'patient_care_experience_experience_hours_total',
    'research_experience_hours_total',
    'shadowing_experience_experience_hours_total',
    'teaching_experience_experience_hours_total',
    'volunteer_community_enrichment_experience_hours_total',
]
# Fill with the number 0, not the string '0': a string fill would turn these
# numeric columns into object dtype and mix text with numbers.
df[hours_columns] = df[hours_columns].fillna(0)

### Final check of NA values

In [None]:
# Final count of missing values per column after the fill.
df.isna().sum()

In [None]:
# Cast each experience-hour total to an integer column.
df = df.astype({
    column: 'int'
    for column in (
        'employment_experience_hours_total',
        'extracurricular_activities_experience_hours_total',
        'health_related_experience_experience_experience_hours_total',
        'leadership_experience_hours_total',
        'patient_care_experience_experience_hours_total',
        'research_experience_hours_total',
        'shadowing_experience_experience_hours_total',
        'teaching_experience_experience_hours_total',
        'volunteer_community_enrichment_experience_hours_total',
    )
})

### Final check of data types

In [None]:
# Confirm the dtypes after the integer cast.
df.dtypes

### Descriptive Statistics

In [None]:
# Descriptive statistics for the numeric columns, rounded to two decimals.
df.describe().round(2)

## Regression Analysis 1: Predictability of admissions criteria on first-term GPA

In [None]:
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

### Define feature variables (un-comment to include/remove different GPA variables from model)

In [None]:
#baccalaureate GPA 
#X = df[['baccalaureate_total_gpa','health_related_experience_experience_experience_hours_total','patient_care_experience_experience_hours_total']]
#y = df['FirstTermGPA']

In [None]:
# Prerequisite GPA feature set (active): prerequisite GPA plus the
# health-related and patient-care hour totals, predicting first-term GPA.
X = df.loc[:, ['prerequisite_required_coursework_gpa','health_related_experience_experience_experience_hours_total','patient_care_experience_experience_hours_total']]
y = df.loc[:, 'FirstTermGPA']

In [None]:
#baccalaureate GPA 
#var= df[['baccalaureate_total_gpa','health_related_experience_experience_experience_hours_total','patient_care_experience_experience_hours_total','FirstTermGPA']]

In [None]:
# Prerequisite GPA variant: the three predictors plus the FirstTermGPA target,
# gathered into one frame for the correlation heatmap.
var= df[['prerequisite_required_coursework_gpa','health_related_experience_experience_experience_hours_total','patient_care_experience_experience_hours_total','FirstTermGPA']]

In [None]:
# Pairwise correlations between the predictors and the target, drawn as an
# annotated heatmap.
corr = var.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')

### Fit model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of FirstTermGPA on the selected features.
term_model=LinearRegression()
term_model.fit(X, y)

## Checking Regression Assumptions

### R-Squared Value, Summary for first model

In [None]:
# R-squared of the fitted model on the data it was trained on.
term_model_r2 = term_model.score(X, y)
print(f'R2: {term_model_r2}')

In [None]:
import statsmodels.api as sm

# Refit with statsmodels to get the full summary table; statsmodels does not
# add an intercept on its own, so a constant column is added first.
X_const = sm.add_constant(X)
model_sm = sm.OLS(endog=y, exog=X_const).fit()
print(model_sm.summary())

### Define function to calculate residuals

In [None]:
def calculate_residuals(model, features, label):
    """
    Creates predictions on the features with the model and calculates residuals.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    features : input passed unchanged to ``model.predict``.
    label : observed target values, one per prediction.

    Returns
    -------
    pandas.DataFrame with columns 'Actual', 'Predicted' and 'Residuals',
    where Residuals = Actual - Predicted.
    """
    predictions = model.predict(features)
    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
    # A residual is the signed difference observed - fitted. Taking abs() of
    # each term first only agrees with that when both values are non-negative.
    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']

    return df_results

In [None]:
# Display the actual / predicted / residual table for the first-term model.
calculate_residuals(term_model, X, y)

In [None]:
# Calculate residuals for the plot
df_results = calculate_residuals(term_model, X, y)

### Assumption 1: Linearity

In [None]:
plt.figure(figsize=(8, 6))  # Set figure size
sns.scatterplot(x=df_results['Actual'], y=df_results['Predicted'], color='dodgerblue', alpha=0.7, edgecolor='black')

# Ideal prediction line (Predicted == Actual). It is built from the Actual and
# Predicted columns only -- Residuals are on a different scale -- and drawn
# between two endpoints so it spans the whole visible range; a unit-step
# np.arange stops short of the largest value on GPA-scale data.
axis_max = max(df_results['Actual'].max(), df_results['Predicted'].max()) + 0.5
plt.plot([0, axis_max], [0, axis_max], color='darkorange', linestyle='--', label='Ideal Fit')

# Set axis limits to start at 0
plt.xlim(0, df_results['Actual'].max() + 0.5)
plt.ylim(0, df_results['Predicted'].max() + 0.5)

# Titles & labels
plt.title('Actual vs. Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend()
plt.show()


### Assumption 2: Normality of Residuals

In [None]:
from statsmodels.stats.diagnostic import normal_ad
    
# Calculating residuals for the Anderson-Darling test
df_results = calculate_residuals(term_model, X, y)

# Performing the test on the residuals (normal_ad returns (statistic, p-value))
p_value = normal_ad(df_results['Residuals'])[1]
print('p-value:', p_value)

# Plotting the residuals distribution
plt.subplots(figsize=(12, 6))
plt.title('Distribution of Residuals')
# sns.distplot is deprecated; histplot with a density-scaled KDE overlay is
# its documented replacement.
sns.histplot(df_results['Residuals'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()


### Assumption 3: Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Plotting the heatmap
plt.figure(figsize = (10,8))
sns.heatmap(pd.DataFrame(X).corr(), annot=True, cmap='coolwarm')
plt.title('Correlation of Variables')
plt.show()

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so the design matrix must include a
# constant; without one the VIFs are inflated. VIF is reported for the
# features only, not for the constant.
X_vif = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(col))
    for col in X.columns
]

print(vif_data)


### Assumption 4: Homoscedasticity

In [None]:
df_results = calculate_residuals(term_model, X, y)

# Plotting the residuals against the row index
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(x=df_results.index, y=df_results.Residuals, alpha=0.5)
# axhline draws the zero reference across the full axes width; a line built
# with np.repeat(0, index.max()) stops one position short of the last point.
ax.axhline(0, color='darkorange', linestyle='--')
ax.spines['right'].set_visible(False)  # Removing the right spine
ax.spines['top'].set_visible(False)  # Removing the top spine
plt.title('Residuals')
plt.show()

## Preparation for Regression Analysis 2: Predictability of Admissions Characteristics on PANCE Score

### Load Data

In [None]:
# Load the PANCE score workbook.
pance = pd.read_excel('Applied Stats Final Project/PANCE Scores CO22_23_24.xlsx')

In [None]:
# Load the CASPA admissions workbook that will be joined to the PANCE scores.
caspa_hist = pd.read_excel('Applied Stats Final Project/CASPA_CO22_23_24_Data.xlsx')

### Merge Data

In [None]:
# Left-join PANCE scores onto the CASPA rows by last name; CASPA rows with no
# matching score keep NaN in the PANCE columns.
# NOTE(review): matching on 'Last Name' alone pairs every CASPA row with every
# PANCE row sharing that surname -- confirm surnames are unique in this data.
pmerge = pd.merge(caspa_hist, pance, on='Last Name', how='left')

### Drop NA values, drop name columns to maintain confidentiality

In [None]:
# Drop every row that has a missing value in any column.
pmerge = pmerge.dropna()

In [None]:
# Confirm no missing values remain.
pmerge.isna().sum()

In [None]:
# Remove the name columns now that the join is done.
pmerge = pmerge.drop(['Last Name', 'First Name'], axis=1)

In [None]:
# Display the merged frame.
pmerge

### Check of dtypes and renaming

In [None]:
# Show the dtype of every column.
pmerge.dtypes

In [None]:
# Give the score column an explicit name.
pmerge = pmerge.rename({'Score': 'PANCE Score'}, axis='columns')

In [None]:
# List the columns after the rename.
pmerge.columns

In [None]:
# Descriptive statistics for the numeric columns, rounded to two decimals.
pmerge.describe().round(2)

### Define features for second regression model (un-comment to include/exclude GPA variables)

In [None]:
# Prerequisite GPA feature set (active): prerequisite GPA and overall science
# GPA, predicting the PANCE score.
X = pmerge.loc[:, ['prerequisite_required_coursework_gpa','gpa_overall_science_gpa']]
y = pmerge.loc[:, 'PANCE Score']

In [None]:
# #Overall total GPA
# X = pmerge[['overall_total_gpa', 'gpa_overall_science_gpa']]
# y = pmerge['PANCE Score']

In [None]:
# Prerequisite GPA variant: the two predictors plus the PANCE Score target,
# gathered into one frame for the correlation heatmap.
var = pmerge[['prerequisite_required_coursework_gpa', 'gpa_overall_science_gpa','PANCE Score']]

In [None]:
# #Overall total GPA
# var = pmerge[['overall_total_gpa', 'gpa_overall_science_gpa','PANCE Score']]

In [None]:
# Pairwise correlations between the predictors and the PANCE score, drawn as
# an annotated heatmap.
corr = var.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')

### Fit model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Unfitted linear regression estimator for the PANCE model.
pance_model = LinearRegression()

In [None]:
# Fit PANCE Score on the selected GPA features.
pance_model.fit(X, y)

### R Squared Value, Coefficients, Summary

In [None]:
# R-squared of the fitted model on the data it was trained on.
pance_model_r2 = pance_model.score(X, y)
print(f'R2: {pance_model_r2}')

In [None]:
# Fitted slope for each feature, in the column order of X.
print('Coefficients:', pance_model.coef_)

In [None]:
import statsmodels.api as sm

# Refit with statsmodels to get the full summary table; statsmodels does not
# add an intercept on its own, so a constant column is added first.
X_const = sm.add_constant(X)
model_sm = sm.OLS(endog=y, exog=X_const).fit()
print(model_sm.summary())

### Define function for calculating residuals

In [None]:
def calculate_residuals(model, features, label):
    """
    Creates predictions on the features with the model and calculates residuals.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    features : input passed unchanged to ``model.predict``.
    label : observed target values, one per prediction.

    Returns
    -------
    pandas.DataFrame with columns 'Actual', 'Predicted' and 'Residuals',
    where Residuals = Actual - Predicted.
    """
    predictions = model.predict(features)
    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
    # A residual is the signed difference observed - fitted. Taking abs() of
    # each term first only agrees with that when both values are non-negative.
    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']

    return df_results

# Display the actual / predicted / residual table for the PANCE model.
calculate_residuals(pance_model, X, y)

### Assumption 1: Linearity

In [None]:
# Calculating residuals for the plot
df_results = calculate_residuals(pance_model, X, y)

plt.figure(figsize=(8, 6))  # Set figure size
sns.scatterplot(x=df_results['Actual'], y=df_results['Predicted'], color='dodgerblue', alpha=0.7, edgecolor='black')

# Ideal prediction line (Predicted == Actual). It is built from the Actual and
# Predicted columns only -- Residuals are on a different scale -- and drawn
# between two endpoints so it spans the whole visible range.
axis_max = max(df_results['Actual'].max(), df_results['Predicted'].max()) + 0.5
plt.plot([0, axis_max], [0, axis_max], color='darkorange', linestyle='--', label='Ideal Fit')

# Set axis limits to start at 0
plt.xlim(0, df_results['Actual'].max() + 0.5)
plt.ylim(0, df_results['Predicted'].max() + 0.5)

# Titles & labels
plt.title('Actual vs. Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend()
plt.show()

### Assumption 2: Normality of residuals

In [None]:
from statsmodels.stats.diagnostic import normal_ad
    
# Calculating residuals for the Anderson-Darling test
df_results = calculate_residuals(pance_model, X, y)

# Performing the test on the residuals (normal_ad returns (statistic, p-value))
p_value = normal_ad(df_results['Residuals'])[1]
print('p-value from the test:', p_value)

# Plotting the residuals distribution
plt.subplots(figsize=(12, 6))
plt.title('Distribution of Residuals')
# sns.distplot is deprecated; histplot with a density-scaled KDE overlay is
# its documented replacement.
sns.histplot(df_results['Residuals'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

### Assumption 3: Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Annotated heatmap of the pairwise correlations between the features
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation of Variables')
plt.show()

In [None]:
# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so the design matrix must include a
# constant; without one the VIFs are inflated. VIF is reported for the
# features only, not for the constant.
X_vif = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(col))
    for col in X.columns
]

In [None]:
# Show the VIF table for the PANCE-model features.
print(vif_data)

### Assumption 4: Homoscedasticity

In [None]:
 df_results = calculate_residuals(pance_model, X, y)

# Plotting the residuals
plt.subplots(figsize=(12, 6))
ax = plt.subplot(111)
plt.scatter(x=df_results.index, y=df_results.Residuals, alpha=0.5)
plt.plot(np.repeat(0, df_results.index.max()), color='darkorange', linestyle='--')
ax.spines['right'].set_visible(False)  # Removing the right spine
ax.spines['top'].set_visible(False)  # Removing the top spine
plt.title('Homoscedasticity')
plt.show()  

### Research Question 3: Is First-Term GPA Predictive of PANCE score?

In [None]:
# Cast both ID columns to strings so the join keys share one dtype.
pmerge['CASPA ID'] = pmerge['CASPA ID'].astype(str)
df['cas_id'] = df['cas_id'].astype(str)

# Keep only students present in both the PANCE frame and the first-term frame.
fmerge = pmerge.merge(df, left_on='CASPA ID', right_on='cas_id', how='inner')

In [None]:
# Preview the first rows of the joined frame.
fmerge.head()

In [None]:
# Count missing values per column in the joined frame.
fmerge.isna().sum()

In [None]:
# Features for research question 3: first-term GPA alongside admissions GPAs
# and experience hours, predicting the PANCE score. The "_x" suffix selects
# the prerequisite GPA column that came from pmerge (left side of the join).
X2 = fmerge.loc[:, ['FirstTermGPA', 'baccalaureate_total_gpa', 'patient_care_experience_experience_hours_total', 'health_related_experience_experience_experience_hours_total', 'prerequisite_required_coursework_gpa_x']]
y2 = fmerge.loc[:, 'PANCE Score']

In [None]:
# Fit with statsmodels to get the full summary table; statsmodels does not
# add an intercept on its own, so a constant column is added first.
X2_const = sm.add_constant(X2)
model_sm2 = sm.OLS(endog=y2, exog=X2_const).fit()
print(model_sm2.summary())

In [None]:
# scikit-learn fit of the same regression, used for the residual checks below.
gpa_model = LinearRegression()
gpa_model.fit(X2, y2)

In [None]:
# R-squared of the fitted model on the data it was trained on.
gpa_model.score(X2, y2)

### Condensed Assumptions

In [None]:
# Actual / predicted / residual table for the first-term-GPA -> PANCE model.
df_results2 = calculate_residuals(gpa_model, X2, y2)


In [None]:
# Display the residual table.
df_results2

In [None]:
# Actual vs. predicted for the third model. Both axes must come from
# df_results2; df_results holds the predictions of the earlier PANCE model.
sns.scatterplot(x=df_results2['Actual'], y=df_results2['Predicted'], color='dodgerblue')

In [None]:
# Distribution of the residuals. sns.distplot is deprecated; histplot with a
# density-scaled KDE overlay is its documented replacement.
sns.histplot(df_results2['Residuals'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so the design matrix must include a
# constant; without one the VIFs are inflated. VIF is reported for the
# features only, not for the constant.
X2_vif = sm.add_constant(X2)
vif = pd.DataFrame()
vif['Feature'] = X2.columns
vif['VIF'] = [
    variance_inflation_factor(X2_vif.values, X2_vif.columns.get_loc(col))
    for col in X2.columns
]
vif

In [None]:
# Residuals plotted against the row index as a quick homoscedasticity check.
plt.scatter(df_results2.index, df_results2['Residuals'], alpha=0.5)