In [None]:
#import pandas etc
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Load the raw grade sheet for one study program.
file_path = '...'

# Zero-based sheet index (not a sheet title): 0 is program 1, 1 is program 2, etc.
sheet_name = 2

df = pd.read_excel(io=file_path, sheet_name=sheet_name)


def extract_course_columns(df):
    """Split the 'Course*' columns of ``df`` into three lists by name suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected; the data itself is not read.

    Returns
    -------
    tuple of (list, list, list)
        ``(first_try_columns, resit_columns, final_grade_columns)`` where
        first-try names end in '-1', resit names end in '-R', and every other
        'Course*' column is treated as a final grade. Each list keeps the
        column order of ``df``.
    """
    # Column labels read from a spreadsheet are not guaranteed to be strings
    # (numeric or date headers are possible), so skip anything that is not a
    # str instead of failing on .startswith.
    course_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('Course')
    ]
    first_try_columns = [col for col in course_columns if col.endswith('-1')]
    resit_columns = [col for col in course_columns if col.endswith('-R')]
    final_grade_columns = [
        col for col in course_columns if not col.endswith(('-1', '-R'))
    ]

    return first_try_columns, resit_columns, final_grade_columns

# Classify the course columns of the loaded sheet and show each group.
first_try_columns, resit_columns, final_grade_columns = extract_course_columns(df)

for label, group in (
    ("First Try Columns:", first_try_columns),
    ("Resit Columns:", resit_columns),
    ("Final Grade Columns:", final_grade_columns),
):
    print(label, group)



In [None]:
# resits_needed = number of first-attempt columns per row whose grade is
# below 5.5 or missing.
first_try_grades = df[first_try_columns]
needs_resit = first_try_grades.lt(5.5) | first_try_grades.isna()

# Row-wise count of flagged courses; the explicit cast keeps an integer
# column even when there are no first-attempt columns at all.
df['resits_needed'] = needs_resit.sum(axis=1).astype('int64')


In [None]:
# Build the ordered list of attempt columns: every first-attempt column,
# immediately followed by its resit column when one exists in the sheet.
known_resits = set(resit_columns)
block1and2_results = []

for first_try in first_try_columns:
    block1and2_results.append(first_try)

    # 'CourseX-1' -> 'CourseX-R' (text before the first hyphen is the course id)
    resit_name = first_try.split('-')[0] + "-R"
    if resit_name in known_resits:
        block1and2_results.append(resit_name)

print("block1and2_results:", block1and2_results)

In [None]:
# absent = number of attempts a row has no grade for.
#   - first attempt: counted whenever the grade is missing;
#   - resit: counted only when the resit grade is missing AND the first
#     attempt was below 5.5 or missing.
# A row with both grades missing for a course is therefore counted twice.
df['absent'] = 0

for column in block1and2_results:
    if column.endswith('-1'):
        missed = df[column].isna()
    elif column.endswith('-R'):
        first_try = df[f"{column.split('-')[0]}-1"]
        missed = df[column].isna() & (first_try.lt(5.5) | first_try.isna())
    else:
        continue

    df.loc[missed, 'absent'] += 1



In [None]:
# Per-column mean grade (pandas skips missing values by default).
column_names = block1and2_results
means = {column: df[column].mean() for column in column_names}

# Program-level mean = unweighted mean of the per-column means, 2 decimals.
mean_programB1B2 = round(sum(means.values()) / len(means), 2)

# Per-row average over the same columns, and its deviation from the
# program-level mean; both rounded to 2 decimals.
df['average_gradeB1B2'] = df[block1and2_results].mean(axis=1).round(2)
df['deviationB1B2'] = (df['average_gradeB1B2'] - mean_programB1B2).round(2)

for column, mean in means.items():
    print(f'Mean for {column}: {round(mean, 2)}')

print('Mean for Program B1B2:', mean_programB1B2)

In [None]:
# 'Dutch' is 1 when Nationality equals 'Nederland', otherwise 0.
df['Dutch'] = df['Nationality'].eq('Nederland').astype(int)

# 'Non-Dutch' is the complement: 1 for every other value, including a
# missing Nationality (a missing value never equals 'Nederland').
df['Non-Dutch'] = df['Nationality'].ne('Nederland').astype(int)

In [None]:
# Build one 0/1 'passed<course>' flag per course: 1 when the first attempt
# or the resit reached the 5.5 pass mark.
for col in block1and2_results:
    # Split on the LAST hyphen only, e.g. 'Course1-R' -> ('Course1', 'R').
    # The prefix already contains the word 'Course', so the flag column is
    # named e.g. 'passedCourse1'.
    course_num, attempt_type = col.rsplit('-', 1)
    passed_col = f'passed{course_num}'

    if attempt_type == '1':
        # Create the flag on first sight of the course; an existing column is
        # left as it is.
        if passed_col not in df.columns:
            df[passed_col] = 0

        # Passed on the first attempt.
        df.loc[df[col] >= 5.5, passed_col] = 1

    elif attempt_type == 'R':
        # Passed on the resit.
        df.loc[df[col] >= 5.5, passed_col] = 1

        # No resit grade but the first attempt was a pass. The first-attempt
        # column is '<course_num>-1'; the previous lookup prepended a second
        # 'Course' to the name, so it never matched and this branch was
        # silently skipped.
        initial_col = f'{course_num}-1'
        if initial_col in df.columns:
            df.loc[df[col].isna() & (df[initial_col] >= 5.5), passed_col] = 1



In [None]:
# YEAR = 6 points for every final-grade column with a grade of at least 5.5
# (presumably 6 study credits per passed course -- confirm against the
# program rules). Missing grades compare as False and contribute nothing.
passed_final = df[final_grade_columns].ge(5.5)
df['YEAR'] = (passed_final.sum(axis=1) * 6).astype('int64')


In [None]:
# Binary flags: did the row reach at least N points in YEAR?
for threshold in (42, 36, 48):
    df[f'passed{threshold}'] = (df['YEAR'] >= threshold).astype(int)

# One-hot encode the two categorical predictors as 0/1 integer columns
# (the original 'Gender' and 'PreEducation' columns are replaced).
df = pd.get_dummies(df, columns=['Gender', 'PreEducation'], dtype=int)

# Replace missing values with 0 everywhere except the listed columns.
# NOTE: this also turns missing grades into a grade of 0.
excluded_from_fill = {'Gender', 'Nationality', 'PreEducation', 'Program', 'Year', 'BSA'}
non_categorical_columns = [
    name for name in df.columns if name not in excluded_from_fill
]
df[non_categorical_columns] = df[non_categorical_columns].fillna(value=0)

In [None]:
# Remove columns that must not be used as predictors. 'YEAR' is the
# quantity the passedNN targets are derived from; the others look like
# administrative / outcome fields (NOTE(review): confirm their meaning).
columns_to_drop = ['Credits-Y1', 'BSA', 'Program', 'YEAR', 'Year', 'Nationality']

df = df.drop(columns=columns_to_drop)

In [None]:
# The final-grade columns fed the YEAR total, so they are removed from the
# frame before modelling.
df = df.drop(columns=final_grade_columns)

In [None]:
# The train/test partition is predefined in two separate workbooks; rows of
# df are assigned to a side by matching their 'train' column against the
# 'train' column of each workbook (presumably a row identifier -- confirm).
train_file_path = '...'
test_file_path = '...'

train_df = pd.read_excel(train_file_path, sheet_name=sheet_name)
test_df = pd.read_excel(test_file_path, sheet_name=sheet_name)

train_indices = df["train"].isin(train_df["train"])
test_indices = df["train"].isin(test_df["train"])

# All three threshold flags are removed from the predictors; passed42 is
# the target.
X = df.drop(columns=['passed36', 'passed42', 'passed48'])
y = df['passed42']

# The matching key itself is not a predictor.
predictors = X.drop(columns=["train"])

X_train, y_train = predictors.loc[train_indices], y.loc[train_indices]
X_test, y_test = predictors.loc[test_indices], y.loc[test_indices]

In [None]:
# Baseline: a random forest with 100 trees and otherwise default settings,
# seeded for repeatable results. fit() returns the estimator itself.
baseline_rf_classifier = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred_baseline = baseline_rf_classifier.predict(X_test)

accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
precision_baseline, recall_baseline, f1_baseline = (
    metric(y_test, y_pred_baseline, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

# Report
print('Baseline Model Performance:')
print(f'Accuracy: {accuracy_baseline}')
print(f'Precision: {precision_baseline}')
print(f'Recall: {recall_baseline}')
print(f'F1 Score: {f1_baseline}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_baseline))
print('Classification Report:')
print(classification_report(y_test, y_pred_baseline))

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is intentionally not listed under max_features: recent scikit-learn
# releases reject it for RandomForestClassifier, and where it was accepted it
# meant the same as 'sqrt', so it only added failing or duplicate fits.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 3-fold cross-validation on the training rows,
# using all available cores.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by Grid Search
print(f'Best parameters found: {grid_search.best_params_}')


In [None]:
# Rebuild a forest from the winning grid-search settings (same seed as the
# search) and train it on the training rows.
best_params = grid_search.best_params_
tuned_names = (
    'max_depth',
    'max_features',
    'min_samples_leaf',
    'min_samples_split',
    'n_estimators',
)
best_rf_classifier = RandomForestClassifier(
    random_state=42,
    **{name: best_params[name] for name in tuned_names},
)
best_rf_classifier.fit(X_train, y_train)

# Score the tuned model on the held-out rows.
y_pred_best = best_rf_classifier.predict(X_test)

accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best, recall_best, f1_best = (
    metric(y_test, y_pred_best, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

# Report
print('Best Model Performance:')
print(f'Accuracy: {accuracy_best}')
print(f'Precision: {precision_best}')
print(f'Recall: {recall_best}')
print(f'F1 Score: {f1_best}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_best))
print('Classification Report:')
print(classification_report(y_test, y_pred_best))

In [None]:
# Show the raw test-set predictions of the tuned model, under a heading.
print("Best Model Predictions:", y_pred_best, sep="\n")


In [None]:
# Pair each predictor name with the importance the tuned forest assigned
# to it, most important first.
feature_importances_best = best_rf_classifier.feature_importances_
feature_names = X_test.columns

importance_df_best = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances_best}
).sort_values(by='Importance', ascending=False)

print('Feature Importances of the Best Model:')
print(importance_df_best)


In [None]:
# Horizontal bar chart of the sorted feature importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df_best, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importances of the Best Model')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Pairwise Pearson correlation (the pandas default) between the training
# predictors.
correlation_matrix = X_train.corr(method='pearson')

print(correlation_matrix)

In [None]:
# Confusion matrix of the tuned model on the test rows, drawn as an
# annotated heatmap without a colour bar.
cm = confusion_matrix(y_test, y_pred_best)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', cbar=False, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Single summary figure: the plain sum of the four test-set metrics of the
# tuned model, so it ranges from 0 to 4.
total_score = sum((accuracy_best, recall_best, precision_best, f1_best))
print("Total score", round(total_score, 2))