In [None]:
#an attempt at hyperparameter tuning with RNNs

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Dropout, GRU
from sklearn.model_selection import GridSearchCV
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import GRU, Dense, Dropout, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array


In [2]:

class dataprep:

    def load_data(self):
        #loading the relevant csv files:
        student_info = pd.read_csv("E:\\DL project\\anonymisedData\\studentInfo.csv")
        columns_to_remove = ['gender', 'region', 'highest_education', 'imd_band','age_band','studied_credits' ]
        student_info = student_info.drop(columns = columns_to_remove)
        #student_info.head()

        #loading assessment scores
        student_assessment = pd.read_csv("E:\\DL project\\anonymisedData\\studentAssessment.csv")
        student_assessment = student_assessment.drop(columns = 'is_banked')
        #student_assessment.head()

        #loading assessments
        assessments = pd.read_csv("E:\\DL project\\anonymisedData\\assessments.csv")
        #assessment.head()

        return student_assessment, student_info, assessments


    #produces course-wise dataframes for each student's evaluations
    def sort_data(self, course):
        student_assessment, student_info, assessments = self.load_data()

        df = pd.DataFrame()
        df['id_student'] = student_info['id_student']
        df['disability'] = student_info['disability']

        filtered_course_result = student_info[student_info['code_module']== course]
        results = filtered_course_result['final_result']
        df['final_results'] = results

        filtered_assessments = assessments[assessments['code_module'] == course]
        list_assessments = filtered_assessments['id_assessment'].unique().tolist()
        #print(list_assessments)

        filtered_assessments = assessments[assessments['code_module'] == course]
        list_presentations = filtered_assessments['code_presentation'].unique().tolist()
        #print(list_presentations)

        #drop everything but this course
        filtered_st_assessments = student_assessment[student_assessment['id_assessment'].isin(list_assessments)]

        pivoted_st_assessments = filtered_st_assessments.pivot(index='id_student', columns='id_assessment', values='score').reset_index()
        pivoted_st_assessments.fillna(0, inplace=True)

        pivoted_st_assessments.head(200)

        df = pd.merge(df, pivoted_st_assessments, on='id_student', how='left')
        df.dropna(subset=['final_results'], inplace=True)

        df.fillna(0, inplace = True)

        #print(df.head())

        return df, list_assessments, list_presentations
    
    
    def update_assessment_list(self, df):
        column_names = df.columns.tolist()
        assessment_cols =  column_names[3:]

        return assessment_cols


    #as the name implies, it calculates the relevant averages for each course and puts them in a dataframe for the final score and score classes
    #def calculate_totals(self, df):
    
    def calculate_scores(self, course ):
        df, list_assess, list_pres = self.sort_data(course)
        assess_cols= self.update_assessment_list(df)

        #calculate average scores for each student

        df['average_score'] = df[assess_cols].mean(axis=1)
        final_results_scores = {"Distinction":2,"Pass": 1, "Fail": -1, "Withdrawn": 0}
        df['final_results'] = df['final_results'].map(final_results_scores)
        disability_status = {"Y":1 ,"N":0}
        df['disability'] = df['disability'].map(disability_status)

        df['total_score'] = (0.5 * df['final_results']) + (0.3 * df['disability']) + (0.2 * df['average_score'])

        #diving data into classes
        new_score_ranges = [(-1, 2), (3, 7), (8, 12), (13, 17), (18, 21)]  
        class_labels = [1, 2, 3, 4, 5]
        df['score_class'] = pd.cut(df['total_score'], bins=[range[0] for range in new_score_ranges] + [max(new_score_ranges[-1])], labels= class_labels)

        #dropping irrelevant columns
        first_three_cols = df.iloc[:, :3]

        # Get the last three columns
        last_three_cols = df.iloc[:, -3:]

        # Create a new DataFrame with the selected columns
        new_df = pd.concat([first_three_cols, last_three_cols], axis=1)

        return new_df

        #print(new_df)


    def create_final_dataset(self):
        #loading courses
        courses = pd.read_csv("E:\\DL project\\anonymisedData\\courses.csv")
        courses = courses.drop(columns = 'module_presentation_length')

        unique_course_modules = courses['code_module'].unique()

        final_df = pd.DataFrame()


        for course_module in unique_course_modules:
            final_df = pd.concat([final_df, self.calculate_scores(course_module)], ignore_index=True)

        #print(final_df)

        return final_df
    
    
    
    def visualize_data(self):
        df = self.create_final_dataset()

        class_labels = [1,2,3,4,5]

        #class wise visualization
        plt.figure(figsize=(12, 8))
        sns.boxplot(x='score_class', y='average_score', data=df, order=class_labels)
        plt.title('Distribution of Average Scores in Each Class')
        plt.xlabel('Score Class')
        plt.ylabel('Average Score')
        plt.show()

        #histogram
        plt.figure(figsize=(10, 6))
        plt.hist(df['total_score'], bins=30, color='skyblue', edgecolor='black')
        plt.title('Distribution of Final Scores')
        plt.xlabel('Final Score')
        plt.ylabel('Frequency')
        plt.show()

        #numeric class distribution
        plt.figure(figsize=(10, 6))
        plt.hist(df['total_score'], bins=5, color='lightcoral', edgecolor='black')
        plt.title('Distribution of Final Scores (Numeric Classes)')
        plt.xlabel('Numeric Class')
        plt.ylabel('Frequency')
        plt.show()


In [3]:
data_p = dataprep()
data = data_p.create_final_dataset()
#data_p.visualize_data()


In [4]:
def prepare_sequences(d, time_steps):
    sequences = []
    labels = []

    for i in range(len(d) - time_steps):
        seq = data.iloc[i:i+time_steps][['disability', 'final_results', 'average_score', 'total_score']].values
        label = data.iloc[i+time_steps]['score_class']
        sequences.append(seq)
        labels.append(label)

    return np.array(sequences), np.array(labels)

time_steps = 7
sequences, labels = prepare_sequences(data, time_steps)

# Split sequences and labels into training and testing sets
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(sequences, labels, test_size=0.2, random_state=42)

        

In [5]:
#one-hot encoding for the labels
y_train_seq = to_categorical(y_train_seq, num_classes=5)
y_test_seq = to_categorical(y_test_seq, num_classes=5)


print("Shape of y_train_seq:", y_train_seq.shape)
print("Shape of y_train_seq:", y_test_seq.shape)

Shape of y_train_seq: (26068, 5)
Shape of y_train_seq: (6518, 5)


In [6]:


class KerasRNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, units=50, dropout_rate=0.2, optimizer='adam', epochs=200, batch_size=32):
        self.units = units
        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        X, y = check_X_y(X, y, multi_output=True)
        
        self.model = Sequential()
        self.model.add(SimpleRNN(units=self.units, input_shape=(X.shape[1], X.shape[2])))
        self.model.add(Dropout(self.dropout_rate))
        self.model.add(Dense(units=5, activation='softmax'))
        
        self.model.compile(optimizer=self.optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        X = check_array(X)
        y_pred = self.model.predict(X)
        return np.argmax(y_pred, axis=1)

# Defining the hyperparameter grid
param_grid = {
    'units': [50, 64, 128],
    'dropout_rate': [0.2, 0.3, 0.4],
    'optimizer': ['adam', 'rmsprop']
}

rnn_model = KerasRNNClassifier(epochs=200, batch_size=32)
grid_search = GridSearchCV(estimator=rnn_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

grid_result = grid_search.fit(X_train_seq, y_train_seq)

print("Best Parameters: ", grid_result.best_params_)
print("Best Accuracy: ", grid_result.best_score_)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


ValueError: 
All the 54 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\AppData\Local\Temp\ipykernel_12084\595344198.py", line 18, in fit
    X, y = check_X_y(X, y, multi_output=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1146, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 951, in check_array
    raise ValueError(
ValueError: Found array with dim 3. None expected <= 2.
