In [1]:
import pandas as pd # type: ignore
import statsmodels.api as sm # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.linear_model import LinearRegression # type: ignore
from sklearn.metrics import mean_squared_error, r2_score # type: ignore
from sklearn.ensemble import RandomForestRegressor # type: ignore
import numpy as np # type: ignore
from sklearn.decomposition import PCA # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
import matplotlib.pyplot as plt # type: ignore

In [2]:
def get_df(file_path, sheet_name):
    return pd.read_excel(file_path, sheet_name)

program1_train_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TrainingData.xlsx", "Program1")
program2_train_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TrainingData.xlsx", "Program2")
program3_train_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TrainingData.xlsx", "Program3")
program4_train_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TrainingData.xlsx", "Program4")

program1_test_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TestData.xlsx", "Program1")
program2_test_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TestData.xlsx", "Program2")
program3_test_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TestData.xlsx", "Program3")
program4_test_df = get_df("Z:/BSA Data/DataSplit/BSA-DataSet_2122_2223-TestData.xlsx", "Program4")

print(program1_train_df.columns)

Index(['train', 'Gender', 'Nationality', 'PreEducation', 'Program', 'Year',
       'BSA', 'Credits-Y1', 'Course3-1', 'Course9-1', 'Course8-1', 'Course7-1',
       'Course3-R', 'Course9-R', 'Crd-B1B2', 'Course23', 'Course26', 'Course3',
       'Course16', 'Course8', 'Course9', 'Course22', 'Course7', 'Course25',
       'Course24', 'Course23-R', 'Course26-R', 'Course16-R', 'Course8-R',
       'Course22-R', 'Course7-R', 'Course25-R', 'Course24-R'],
      dtype='object')


In [3]:
# Function to perform the analysis
def program_analysis(train_df, test_df, program_name):
    # Combine dfs
    combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])

    # Get dummies and add personal extra variables
    b1_index = [8,9,12,13]
    b2_index = [10,11]
    combined_df['Crd-B1'] = combined_df.iloc[:, b1_index].sum(axis=1)
    combined_df['Crd-B2'] = combined_df.iloc[:, b2_index].sum(axis=1)
    dummifiable_columns = ['Gender', 'Nationality', 'PreEducation','Year']
    dummies = pd.get_dummies(combined_df[dummifiable_columns], dtype=int)
    combined_df = combined_df.drop(columns=dummifiable_columns)
    combined_df = pd.concat([combined_df, dummies], axis = 1)

    # Split back into original dfs
    train_df = combined_df.xs('train')
    test_df = combined_df.xs('test')

    # Exclude the unnecessary columns
    train_df = train_df.drop(columns=['train', 'Program'])
    test_df = test_df.drop(columns=['train', 'Program'])
    
    # Identify numeric columns only
    numeric_cols = train_df.select_dtypes(include=[float, int]).columns.tolist()
    numeric_cols_2 = test_df.select_dtypes(include=[float, int]).columns.tolist()

    # Prepare the data
    X_train = train_df[numeric_cols].drop(columns=['Credits-Y1'])
    y_train = train_df['Credits-Y1']
    X_test = test_df[numeric_cols_2].drop(columns=['Credits-Y1'])
    y_test = test_df['Credits-Y1']

    X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
    X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Correlation Analysis
    corr_matrix = train_df[numeric_cols].corr()
    corr_with_credits = corr_matrix["Credits-Y1"].sort_values(ascending=False)
    print(f"Correlation with Credits-Y1 ({program_name} Training Data):")
    print(corr_with_credits)

    # Regression Analysis with statsmodels
    X_train_sm = sm.add_constant(X_train)
    model_sm = sm.OLS(y_train, X_train_sm).fit()
    print(f"\nRegression Analysis ({program_name} statsmodels):")
    print(model_sm.summary())

In [4]:
programs = [
    ("Program 1", program1_train_df, program1_test_df),
    ("Program 2", program2_train_df, program2_test_df),
    ("Program 3", program3_train_df, program3_test_df),
    ("Program 4", program4_train_df, program4_test_df)
]

for program_name, train_df, test_df in programs:
    program_analysis(train_df, test_df, program_name)

Correlation with Credits-Y1 (Program 1 Training Data):
Credits-Y1                             1.000000
Crd-B1B2                               0.872440
Crd-B2                                 0.821399
Course8                                0.819256
Course22                               0.797311
Course25                               0.770375
Course8-1                              0.756926
Course23                               0.751058
Course9                                0.729035
Course22-R                             0.711987
Course24                               0.706904
Course7                                0.705486
Course9-1                              0.685360
Course25-R                             0.678141
Crd-B1                                 0.676326
Course8-R                              0.661915
Course3-R                              0.657432
Course7-R                              0.635603
Course23-R                             0.620445
Course3                          

In [12]:
def data_prep(train_df, test_df):
    # Combine all training and testing dataframes
    combined_df = pd.concat([train_df, test_df], keys=['training', 'testing'])

    # Create dummies and exclude unnecessary variables
    dummifiable_columns = ['Gender', 'Nationality', 'PreEducation']
    extra_columns = ["BSA", "Year", "Program", "train"]
    dummies = pd.get_dummies(combined_df[dummifiable_columns], dtype=int)
    columns_to_drop = dummifiable_columns + extra_columns
    combined_df = combined_df.drop(columns=columns_to_drop)
    combined_df = pd.concat([combined_df.iloc[:, :1], dummies, combined_df.iloc[:, 1:]], axis = 1)

    # Split back into original dataframes
    train_df = combined_df.xs('training')
    test_df = combined_df.xs('testing')

    # Select relevant columns (up to "Crd-B1B2")
    columns_to_include = train_df.columns[:train_df.columns.tolist().index("Crd-B1B2") + 1]
    train_df = train_df[columns_to_include]
    test_df = test_df[columns_to_include]

    return train_df, test_df


# Function to prepare features of model
def feature_prep(train_df, test_df):
    X_train = train_df.drop(columns=['Credits-Y1'])
    y_train = train_df['Credits-Y1']
    X_test = test_df.drop(columns=['Credits-Y1'])
    y_test = test_df['Credits-Y1']

    X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
    X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)
    y_train = y_train.replace([np.inf, -np.inf], np.nan).fillna(0)
    y_test = y_test.replace([np.inf, -np.inf], np.nan).fillna(0)

    return X_train, X_test, y_train, y_test


# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    if isinstance(X_train, pd.Series):
        X_train = X_train.to_frame()
    if isinstance(X_test, pd.Series):
        X_test = X_test.to_frame()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2
    

# Analysis on relevant columns only
def relevant_analysis(train_df, test_df, program_name):
    # Prepare Data for Analysis
    train_df2, test_df2 = data_prep(train_df, test_df)

    # Separate features and target variable
    X_train, X_test, y_train, y_test = feature_prep(train_df2, test_df2)

    # Correlation Analysis
    corr_matrix = train_df2.corr()
    corr_with_credits = corr_matrix["Credits-Y1"].sort_values(ascending=False)
    print(f"Correlation with Credits-Y1 ({program_name} Training Data):")
    print(corr_with_credits)

    # Regression Analysis with statsmodels
    X_train_sm = sm.add_constant(X_train)
    model_sm = sm.OLS(y_train, X_train_sm).fit()
    print(f"\nRegression Analysis ({program_name} statsmodels):")
    print(model_sm.summary())

In [6]:
for program_name, train_df, test_df in programs:
    relevant_analysis(train_df, test_df, program_name)

Correlation with Credits-Y1 (Program 1 Training Data):
Credits-Y1                             1.000000
Crd-B1B2                               0.872440
Course8-1                              0.756926
Course9-1                              0.685360
Course3-R                              0.657432
Course9-R                              0.602428
Course7-1                              0.599292
Course3-1                              0.566055
PreEducation_Buitenlands               0.208107
Nationality_Azie                       0.151701
Nationality_EU                         0.133323
Gender_F                               0.115673
Nationality_Europa                     0.066351
Nationality_Oceanie                    0.046360
Nationality_Onbekend                   0.032768
Nationality_Mid-Zuid-Amerika           0.027756
Nationality_Afrika                     0.011906
PreEducation_Overig                   -0.004636
PreEducation_Hbo                      -0.006320
Nationality_AziÃ«                

In [7]:
def perform_pca_analysis(train_df, test_df, program_name):
    # Prepare Data for Analysis
    train_df2, test_df2 = data_prep(train_df, test_df)

    # Separate features and target variable
    X_train, X_test, y_train, y_test = feature_prep(train_df2, test_df2)

    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Apply PCA
    pca = PCA()
    pca.fit(X_train_scaled)

    # Explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    print(f"Explained Variance Ratio for {program_name}:")
    print(explained_variance)

    # Determine the number of components to retain (e.g., explain at least 95% variance)
    total_variance = 0
    num_components = 0
    for variance in explained_variance:
        total_variance += variance
        num_components += 1
        if total_variance >= 0.95:
            break

    print(f"Number of components to retain for {program_name}: {num_components}\n")

    # Transform data using the selected number of components
    X_train_pca = pca.transform(X_train_scaled)[:, :num_components]
    X_test_pca = pca.transform(X_test_scaled)[:, :num_components]

    # Create DataFrames with the principal components
    pca_columns = [f'PC{i+1}' for i in range(num_components)]
    X_train_pca_df = pd.DataFrame(X_train_pca, columns=pca_columns)
    X_test_pca_df = pd.DataFrame(X_test_pca, columns=pca_columns)

    # Ensure that y_train and y_test are series
    if isinstance(y_train, pd.DataFrame):
        y_train = y_train.iloc[:, 0]
    if isinstance(y_test, pd.DataFrame):
        y_test = y_test.iloc[:, 0]

    # Add the target variable 'Credits-Y1' back to the DataFrames
    X_train_pca_df['Credits-Y1'] = y_train.reset_index(drop=True)
    X_test_pca_df['Credits-Y1'] = y_test.reset_index(drop=True)

    return X_train_pca_df, X_test_pca_df

In [8]:
pca_results = {}

for program_name, train_df, test_df in programs:
    X_train_pca_df, X_test_pca_df = perform_pca_analysis(train_df, test_df, program_name)
    pca_results[program_name] = {
        'train': X_train_pca_df,
        'test': X_test_pca_df
    }

# Now you can access each program's PCA-transformed dataframes from the `pca_results` dictionary
# Example: Accessing Program 1's transformed data
program1_train_pca_df = pca_results['Program 1']['train']
program1_test_pca_df = pca_results['Program 1']['test']

Explained Variance Ratio for Program 1:
[1.72809030e-01 1.07582734e-01 7.68072310e-02 4.99262669e-02
 4.44369162e-02 4.32307743e-02 4.10515830e-02 3.96553575e-02
 3.90527738e-02 3.88281786e-02 3.87917823e-02 3.86119128e-02
 3.85623665e-02 3.81975102e-02 3.79045678e-02 3.75292942e-02
 3.52367773e-02 3.20892711e-02 1.66772937e-02 1.42790979e-02
 8.25354414e-03 7.11955696e-03 3.36617974e-03 6.41508072e-17
 0.00000000e+00 0.00000000e+00]
Number of components to retain for Program 1: 18

Explained Variance Ratio for Program 2:
[2.06018576e-01 1.48003743e-01 8.68781255e-02 6.77323563e-02
 5.24245452e-02 5.14244903e-02 4.92630964e-02 4.71168344e-02
 4.61120993e-02 4.53953512e-02 4.51843976e-02 4.25551805e-02
 3.85560676e-02 3.04625474e-02 1.29400454e-02 9.60500979e-03
 7.76606628e-03 7.16333043e-03 5.39813745e-03 2.61979234e-16
 8.19961584e-17 9.50610648e-18 0.00000000e+00]
Number of components to retain for Program 2: 14

Explained Variance Ratio for Program 3:
[2.03252390e-01 1.24211320e-01

In [9]:
for program_name in pca_results.keys():
    X_train = pca_results[program_name]["train"].drop(columns = ["Credits-Y1"])
    y_train = pca_results[program_name]["train"]["Credits-Y1"]
    X_test = pca_results[program_name]["test"].drop(columns = ["Credits-Y1"])
    y_test = pca_results[program_name]["test"]["Credits-Y1"]

    # Initialize and train the dataset
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Make predictions
    y_pred = lr_model.predict(X_test)

    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"{program_name}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 Score: {r2}")
    print(f"\n")

Program 1
Root Mean Squared Error: 10.288856285898163
R2 Score: 0.7990350542805081


Program 2
Root Mean Squared Error: 11.711455737537314
R2 Score: 0.7886054503825132


Program 3
Root Mean Squared Error: 8.959943506086013
R2 Score: 0.8016790431955461


Program 4
Root Mean Squared Error: 8.67238223535148
R2 Score: 0.6795611024837461




In [13]:
forest_results = {}
for program_name, train_df, test_df in programs:
    train_df2, test_df2 = data_prep(train_df, test_df)
    X_train, y_train, X_test, y_test = feature_prep(train_df2, test_df2)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Ensure X_train_scaled and X_test_scaled are dataframes
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    rf_model = RandomForestRegressor()
    rmse, r2 = evaluate_model(rf_model, X_train_scaled, y_train, X_test_scaled, y_test)
    print(f"{program_name}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 Score: {r2}")
    print(f"\n")



ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.