In [1]:
import pandas as pd # type: ignore
import statsmodels.api as sm # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.linear_model import LinearRegression # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.ensemble import RandomForestRegressor # type: ignore
import numpy as np # type: ignore

In [2]:
def get_df(file_path, sheet_name):
    """Read one worksheet of an Excel workbook via ``pd.read_excel``.

    Parameters
    ----------
    file_path
        Location of the workbook, forwarded unchanged to ``pd.read_excel``.
    sheet_name
        Sheet selector, forwarded unchanged as ``sheet_name``.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for that selector (a DataFrame when a
    single sheet name is given).
    """
    sheet_frame = pd.read_excel(file_path, sheet_name=sheet_name)
    return sheet_frame

# Workbooks holding the pre-split data; sheets "Program1".."Program4" are read
# from each of them.
TRAIN_XLSX = "Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TrainingData.xlsx"
TEST_XLSX = "Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TestData.xlsx"

# Training split, one frame per program sheet.
program1_train_df = get_df(TRAIN_XLSX, "Program1")
program2_train_df = get_df(TRAIN_XLSX, "Program2")
program3_train_df = get_df(TRAIN_XLSX, "Program3")
program4_train_df = get_df(TRAIN_XLSX, "Program4")

# Test split, same sheets.
program1_test_df = get_df(TEST_XLSX, "Program1")
program2_test_df = get_df(TEST_XLSX, "Program2")
program3_test_df = get_df(TEST_XLSX, "Program3")
program4_test_df = get_df(TEST_XLSX, "Program4")

# Show the column layout of the first training sheet.
print(program1_train_df.columns)

Index(['train', 'Gender', 'Nationality', 'PreEducation', 'Program', 'Year',
       'BSA', 'Credits-Y1', 'Course3-1', 'Course9-1', 'Course8-1', 'Course7-1',
       'Course3-R', 'Course9-R', 'Crd-B1B2', 'Course23', 'Course26', 'Course3',
       'Course16', 'Course8', 'Course9', 'Course22', 'Course7', 'Course25',
       'Course24', 'Course23-R', 'Course26-R', 'Course16-R', 'Course8-R',
       'Course22-R', 'Course7-R', 'Course25-R', 'Course24-R'],
      dtype='object')


In [5]:
# Function to perform the analysis
def program_analysis(train_df, test_df, program_name):
    """Print Credits-Y1 correlations and an OLS summary for one program.

    The two splits are stacked before feature engineering so that
    ``pd.get_dummies`` produces the same indicator columns regardless of
    which categories occur in which split. Only the training rows are then
    analysed. The input frames themselves are not modified.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows. Expected to contain the columns 'train', 'Program',
        'Credits-Y1', 'Gender', 'Nationality', 'PreEducation' and 'Year',
        and at least 14 columns so the positional block sums below resolve.
    test_df : pandas.DataFrame
        Test rows with the same layout. Here they only contribute their
        category levels to the dummy encoding.
    program_name : str
        Label inserted into the printed headings.

    Returns
    -------
    The fitted statsmodels OLS results object. Nothing was returned before,
    so callers that ignore the result are unaffected.
    """
    target_col = 'Credits-Y1'

    # Combine dfs (a new frame, keyed by split on the outer index level)
    combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])

    # Get dummies and add personal extra variables.
    # The block sums are selected by column POSITION. In the Program1 column
    # listing printed after loading, positions 8, 9, 12, 13 are
    # Course3-1, Course9-1, Course3-R, Course9-R and positions 10, 11 are
    # Course8-1, Course7-1.
    # NOTE(review): the other program sheets are assumed to share this
    # positional layout - TODO confirm.
    b1_index = [8, 9, 12, 13]
    b2_index = [10, 11]
    combined_df['Crd-B1'] = combined_df.iloc[:, b1_index].sum(axis=1)
    combined_df['Crd-B2'] = combined_df.iloc[:, b2_index].sum(axis=1)
    combined_df = pd.get_dummies(
        combined_df,
        columns=['Gender', 'Nationality', 'PreEducation', 'Year'],
        dtype=int,
    )

    # Keep only the training rows and exclude the unnecessary columns.
    # The test rows were needed solely for a consistent dummy encoding;
    # nothing below reads them.
    train_features = combined_df.xs('train').drop(columns=['train', 'Program'])

    # Identify numeric columns only (non-numeric columns are left out of
    # both the correlation table and the regression).
    numeric_cols = train_features.select_dtypes(include=[float, int]).columns.tolist()

    # Prepare the data: missing and infinite predictor values become 0.
    X_train = train_features[numeric_cols].drop(columns=[target_col])
    X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
    y_train = train_features[target_col]

    # Correlation Analysis (on the numeric training columns before filling)
    corr_matrix = train_features[numeric_cols].corr()
    corr_with_credits = corr_matrix[target_col].sort_values(ascending=False)
    print(f"Correlation with Credits-Y1 ({program_name} Training Data):")
    print(corr_with_credits)

    # Regression Analysis with statsmodels.
    # missing='drop' removes rows whose target is NaN; the default
    # ('none') performs no NaN checking, so a single missing Credits-Y1
    # would otherwise flow straight into the fit.
    X_train_sm = sm.add_constant(X_train)
    model_sm = sm.OLS(y_train, X_train_sm, missing='drop').fit()
    print(f"\nRegression Analysis ({program_name} statsmodels):")
    print(model_sm.summary())

    return model_sm

In [6]:
# One (label, training frame, test frame) triple per program, in sheet order.
program_labels = ("Program 1", "Program 2", "Program 3", "Program 4")
program_train_frames = (
    program1_train_df,
    program2_train_df,
    program3_train_df,
    program4_train_df,
)
program_test_frames = (
    program1_test_df,
    program2_test_df,
    program3_test_df,
    program4_test_df,
)
programs = list(zip(program_labels, program_train_frames, program_test_frames))

# Run the analysis for each program in turn; all output is printed by
# program_analysis itself.
for program_name, train_df, test_df in programs:
    program_analysis(train_df=train_df, test_df=test_df, program_name=program_name)

Correlation with Credits-Y1 (Program 1 Training Data):
Credits-Y1                             1.000000
Crd-B1B2                               0.872440
Crd-B2                                 0.821399
Course8                                0.819256
Course22                               0.797311
Course25                               0.770375
Course8-1                              0.756926
Course23                               0.751058
Course9                                0.729035
Course22-R                             0.711987
Course24                               0.706904
Course7                                0.705486
Course9-1                              0.685360
Course25-R                             0.678141
Crd-B1                                 0.676326
Course8-R                              0.661915
Course3-R                              0.657432
Course7-R                              0.635603
Course23-R                             0.620445
Course3                          