In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

filename = 'dataSet.csv'

# Fallback for a missing CSV: ask the user to upload it through Colab.
# Note that `files` is only imported (and `uploaded` only bound) when this
# branch actually runs, i.e. when the CSV was not already on disk.
needs_upload = not os.path.exists(filename)
if needs_upload:
    from google.colab import files
    print(f"{filename} not found: please upload it now.")
    uploaded = files.upload()  # opens the Colab upload prompt

# Read the survey responses and show the first few rows as a sanity check.
df = pd.read_csv(filename)
print(df.head())


dataSet.csv not found: please upload it now.


Saving dataSet.csv to dataSet.csv
             Timestamp Year of Study Your Average Lecture attendance Rate  \
0  12/07/2025 16:50:00      3rd Year                            75% - 90%   
1  12/07/2025 16:50:29      3rd Year                            Above 90%   
2  12/07/2025 16:51:44      3rd Year                            Below 50%   
3  12/07/2025 16:51:49      3rd Year                            50% - 74%   
4  12/07/2025 16:55:43      2nd Year                            Above 90%   

  Recently Released Semester Results [Subject 1]  \
0                                             B+   
1                                             A-   
2                                             C-   
3                                              A   
4                                             B-   

  Recently Released Semester Results [Subject 2]  \
0                                             B+   
1                                             B+   
2                                 

In [3]:
# Remove columns that are not needed downstream: the response timestamp and
# the optional grade columns for Subjects 5 to 8.
extra_subject_columns = [
    f'If you had more than four subjects [Subject {n}]' for n in range(5, 9)
]
columns_to_drop = ['Timestamp'] + extra_subject_columns

df = df.drop(columns=columns_to_drop)
print("Columns after drop:\n", df.columns)

Columns after drop:
 Index(['Year of Study', 'Your Average Lecture attendance Rate',
       'Recently Released Semester Results [Subject 1]',
       'Recently Released Semester Results [Subject 2]',
       'Recently Released Semester Results [Subject 3]',
       'Recently Released Semester Results [Subject 4]',
       'What is your current GPA ? (If you don't know, you can leave this)',
       'How many hours do you study per week?',
       'How Often do You Complete Assignments on Time',
       'Are you currently doing a part time job?',
       'If yes, how many hours per week do you work?',
       'Are you involved in any sports or extracurricular activities(clubs, societies, volunteering, etc.)?',
       'If yes, how many hours per week do you spend on sports  or extracurricular activities  ?'],
      dtype='object')


In [4]:
# Clean the self-reported GPA column

gpa_col = "What is your current GPA ? (If you don't know, you can leave this)"

# Anything that cannot be parsed as a number becomes NaN (errors='coerce') ...
gpa_numeric = pd.to_numeric(df[gpa_col], errors='coerce')
# ... and every NaN is then replaced by the mean of the parseable values.
df[gpa_col] = gpa_numeric.fillna(gpa_numeric.mean())

In [5]:
# Keep only rows that have an attendance answer and a value in each of the
# Subject 1 to 4 grade columns.

subject_cols = [
    f'Recently Released Semester Results [Subject {n}]' for n in range(1, 5)
]
required_cols = ['Your Average Lecture attendance Rate', *subject_cols]

df = df.dropna(subset=required_cols)
print(f"Dataset shape after dropping rows with missing attendance or subject grades: {df.shape}")


Dataset shape after dropping rows with missing attendance or subject grades: (108, 13)


In [6]:
# Encode Categorical Variables

def _encode_with_map(series, mapping):
    """Return ``series`` mapped through ``mapping``, reporting unknown labels.

    ``Series.map`` with a dict turns every value that is not a key of the
    dict into NaN without any warning. A typo, stray whitespace or a new
    survey option would therefore silently become a missing feature value.
    This helper prints the offending labels first, then performs the same
    mapping, so the encoded result is unchanged.

    Parameters
    ----------
    series : pandas.Series
        Column of raw survey answers.
    mapping : dict
        Raw answer -> numeric code. Keys that map to NaN on purpose
        (e.g. 'Absent') count as known labels and are not reported.

    Returns
    -------
    pandas.Series
        ``series.map(mapping)``.
    """
    known_labels = set(mapping)
    # Already-missing answers are ignored; only real, unrecognised labels are reported.
    unexpected = sorted(
        str(value) for value in series.dropna().unique() if value not in known_labels
    )
    if unexpected:
        print(f"Warning: '{series.name}' has values with no encoding (set to NaN): {unexpected}")
    return series.map(mapping)


# Encode Attendance Rate (each answer bucket becomes a single representative fraction)
# NOTE(review): 'Below 50%' -> 0.45 is not the midpoint of its bucket, unlike
# the other three entries; confirm this value is intended.
attendance_map = {
    'Above 90%': 0.95,
    '75% - 90%': 0.825,
    '50% - 74%': 0.62,
    'Below 50%': 0.45
}
df['Attendance_Encoded'] = _encode_with_map(
    df['Your Average Lecture attendance Rate'], attendance_map
)

# Encode Study Hours per week (one representative hour count per bucket)
study_hours_map = {
    'Less than 10 hours': 5,
    '10 hours - 20 hours': 15,
    'More than 20 hours': 25
}
df['StudyHours_Encoded'] = _encode_with_map(
    df['How many hours do you study per week?'], study_hours_map
)

# Encode Assignment Completion Frequency (ordinal: higher = more often on time)
assignments_map = {
    'Always': 3,
    'Often': 2,
    'Sometimes': 1,
    'Rarely': 0
}
df['Assignments_Encoded'] = _encode_with_map(
    df['How Often do You Complete Assignments on Time'], assignments_map
)

# Encode Part-Time Job and Extracurricular activities
df['PartTimeJob_Encoded'] = _encode_with_map(
    df['Are you currently doing a part time job?'], {'Yes': 1, 'No': 0}
)
df['Extracurriculars_Encoded'] = _encode_with_map(
    df['Are you involved in any sports or extracurricular activities(clubs, societies, volunteering, etc.)?'],
    {
        'Yes': 1,
        'No': 0,
        'Occasionally': 0.5
    }
)


# Encode Grades for Subjects 1 to 4 as grade points.
# 'Absent' and 'Ineligible' are deliberately mapped to NaN (no grade point).
grade_map = {
    'A+': 4.0, 'A': 4.0, 'A-': 3.7,
    'B+': 3.3, 'B': 3.0, 'B-': 2.7,
    'C+': 2.3, 'C': 2.0, 'C-': 1.7,
    'D+': 1.3, 'D': 1.0, 'E': 0.0,
    'Absent': np.nan, 'Ineligible': np.nan
}

for subj in subject_cols:
    # '... [Subject 1]' -> '[Subject 1]_Encoded'
    encoded_col = subj.replace("Recently Released Semester Results ", "") + "_Encoded"
    df[encoded_col] = _encode_with_map(df[subj], grade_map)



In [7]:
# Normalizing GPA

from sklearn.preprocessing import MinMaxScaler

gpa_col = "What is your current GPA ? (If you don't know, you can leave this)"

# Coerce to numeric (unparseable answers become NaN) and impute NaN with the mean.
gpa_numeric = pd.to_numeric(df[gpa_col], errors='coerce')
df[gpa_col] = gpa_numeric.fillna(gpa_numeric.mean())

# Min-max scale the cleaned GPA into its own column; the raw column is kept.
scaler = MinMaxScaler().fit(df[[gpa_col]])
df['GPA_Normalized'] = scaler.transform(df[[gpa_col]])


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Keep a dedicated scaler fitted on the raw GPA column (not on GPA_Normalized)
# and recompute the normalized column from it.
gpa_scaler = MinMaxScaler().fit(df[[gpa_col]])
df['GPA_Normalized'] = gpa_scaler.transform(df[[gpa_col]])


In [9]:
# Target variable: mean grade point over the encoded Subject 1 to 4 columns

grade_prefix = "Recently Released Semester Results "
encoded_subject_cols = [
    f'{col.replace(grade_prefix, "")}_Encoded' for col in subject_cols
]

subject_grade_points = df[encoded_subject_cols]
# Row-wise mean. pandas skips NaN here by default, so a row with some NaN
# grades is averaged over the grades it does have.
df['Average_Grade'] = subject_grade_points.mean(axis=1)

# A row's Average_Grade is NaN only when all four encoded grades are NaN;
# drop those rows.
df = df.dropna(subset=['Average_Grade'])
print(f"Data shape after dropping missing Average_Grade: {df.shape}")


Data shape after dropping missing Average_Grade: (108, 24)


In [10]:
# Build the feature matrix (X) and target vector (y) as NumPy arrays

target = 'Average_Grade'
features = [
    'Attendance_Encoded', 'StudyHours_Encoded', 'Assignments_Encoded',
    'PartTimeJob_Encoded', 'Extracurriculars_Encoded', 'GPA_Normalized',
]

# Column order in X follows the order of `features`.
X = df[features].to_numpy()
y = df[target].to_numpy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

Feature matrix shape: (108, 6)
Target vector shape: (108,)


In [11]:
# Normalizing Features

from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the full feature matrix, then transform that same matrix.
# This rebinds `scaler`, which previously held the MinMaxScaler used for GPA.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [12]:
# Save the encoded features plus the target so the modelling step can reload them.
columns_to_save = features + [target]
output_path = 'preprocessed_model_data.csv'
df[columns_to_save].to_csv(output_path, index=False)

# Offer the file as a browser download when running in Google Colab.
# `files` is imported here rather than relied on from the loading cell:
# that cell only imports it when the CSV had to be uploaded, so using the
# bare name raised NameError on any run where the CSV was already on disk.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the CSV is still written to the working directory.
    print(f"Saved {output_path} (google.colab not available, skipping browser download).")
else:
    files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>