In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('jamb_exam_results.csv')
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
df.isnull().sum()

JAMB_Score                        0
Study_Hours_Per_Week              0
Attendance_Rate                   0
Teacher_Quality                   0
Distance_To_School                0
School_Type                       0
School_Location                   0
Extra_Tutorials                   0
Access_To_Learning_Materials      0
Parent_Involvement                0
IT_Knowledge                      0
Student_ID                        0
Age                               0
Gender                            0
Socioeconomic_Status              0
Parent_Education_Level          891
Assignments_Completed             0
dtype: int64

In [5]:
df.Parent_Education_Level.head()

0    Tertiary
1         NaN
2    Tertiary
3    Tertiary
4    Tertiary
Name: Parent_Education_Level, dtype: object

In [6]:
df.groupby(df["JAMB_Score"])['Study_Hours_Per_Week'].mean()

JAMB_Score
100    12.046512
101    14.097561
102    12.825000
103    15.864865
104    16.300000
         ...    
359    37.000000
360    33.000000
362    40.000000
366    40.000000
367    34.333333
Name: Study_Hours_Per_Week, Length: 220, dtype: float64

We can see a positive correlation between Jamb score and study hours per week, so we can assume it describes a high degree of the variance

In [7]:
df2 = df[['JAMB_Score','Study_Hours_Per_Week', 'Attendance_Rate', 'Teacher_Quality', 'Distance_To_School', 'Student_ID', 'Age', 'Assignments_Completed' ]]
df2.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,Student_ID,Age,Assignments_Completed
0,192,22,78,4,12.4,1,17,2
1,207,14,88,4,2.7,2,15,1
2,182,29,87,2,9.6,3,20,2
3,210,29,99,2,2.6,4,22,1
4,199,12,98,3,8.8,5,22,1


In [8]:
lls = []

def find_strings_in_df(df):
    """Return list of (column, index, value) for all string values in DataFrame."""
    results = []
    for col in df.columns:
        for idx, val in df[col].items():
            if isinstance(val, str):
                results.append((col, idx, val))
    return results


def find_strings(series):
    """When used with DataFrame.apply (axis=0), collects string values from a Series.

    - Returns a list of (column_name, index, value) found in this Series.
    - Also appends those occurrences to the global `lls` list so existing code that
      expects `lls` will keep working.
    """
    found = [(series.name, idx, val) for idx, val in series.items() if isinstance(val, str)]
    if found:
        lls.extend(found)
    return found

In [9]:
df2.corr()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,Student_ID,Age,Assignments_Completed
JAMB_Score,1.0,0.420061,0.280953,0.299505,-0.094142,0.014646,-0.028432,0.278079
Study_Hours_Per_Week,0.420061,1.0,0.129738,0.120297,-0.022266,-0.00642,-0.024121,0.618464
Attendance_Rate,0.280953,0.129738,1.0,0.080537,-0.057385,-0.000136,0.008382,0.08749
Teacher_Quality,0.299505,0.120297,0.080537,1.0,0.005902,-0.018112,-0.022594,0.068621
Distance_To_School,-0.094142,-0.022266,-0.057385,0.005902,1.0,-0.019205,0.001874,-0.017597
Student_ID,0.014646,-0.00642,-0.000136,-0.018112,-0.019205,1.0,0.015351,-0.015544
Age,-0.028432,-0.024121,0.008382,-0.022594,0.001874,0.015351,1.0,-0.000127
Assignments_Completed,0.278079,0.618464,0.08749,0.068621,-0.017597,-0.015544,-0.000127,1.0


In [10]:
#feature engineering and extraction
#standard scaler

In [11]:
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [12]:
df.drop(columns=["Student_ID"], inplace=True)
# It has little to no effect on the Jamb_score with a linear correlation of only 0.001

In [13]:
target = df["JAMB_Score"]
target

0       192
1       207
2       182
3       210
4       199
       ... 
4995    183
4996    179
4997    261
4998    183
4999    218
Name: JAMB_Score, Length: 5000, dtype: int64

In [14]:
for item in df.columns:
    print(f"{item}, {len(df[item].unique())}")

JAMB_Score, 220
Study_Hours_Per_Week, 41
Attendance_Rate, 51
Teacher_Quality, 5
Distance_To_School, 201
School_Type, 2
School_Location, 2
Extra_Tutorials, 2
Access_To_Learning_Materials, 2
Parent_Involvement, 3
IT_Knowledge, 3
Age, 8
Gender, 2
Socioeconomic_Status, 3
Parent_Education_Level, 4
Assignments_Completed, 5


In [15]:
ls = []
for item in df["Parent_Education_Level"].unique():
    i = 0
    for items in df["Parent_Education_Level"]:
        if items == item:
            i+=1
    ls.append(f"{item, i}")
ls

["('Tertiary', 1218)", '(nan, 0)', "('Primary', 1335)", "('Secondary', 1556)"]

In [16]:
df.groupby("Parent_Education_Level")["JAMB_Score"].mean()

Parent_Education_Level
Primary      169.112360
Secondary    176.573265
Tertiary     184.738095
Name: JAMB_Score, dtype: float64

In [17]:
df.groupby(df['Parent_Education_Level'].isna())['JAMB_Score'].mean()

Parent_Education_Level
False    176.569482
True     162.569024
Name: JAMB_Score, dtype: float64

So we can strongly infer that the missing values means that the parent were most likely uneducated and we have a strong relationship between education level and score and dropping that column would defintely bias the model

In [18]:
#systematic replacing
df["Parent_Education_Level"] = df["Parent_Education_Level"].fillna("Uneducated")

In [19]:
df.isna().sum()

JAMB_Score                      0
Study_Hours_Per_Week            0
Attendance_Rate                 0
Teacher_Quality                 0
Distance_To_School              0
School_Type                     0
School_Location                 0
Extra_Tutorials                 0
Access_To_Learning_Materials    0
Parent_Involvement              0
IT_Knowledge                    0
Age                             0
Gender                          0
Socioeconomic_Status            0
Parent_Education_Level          0
Assignments_Completed           0
dtype: int64

In [20]:
df.tail()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
4995,183,20,74,2,10.6,Public,Urban,Yes,No,Low,Low,16,Male,Medium,Primary,2
4996,179,0,80,2,20.0,Public,Rural,No,Yes,Medium,Medium,22,Male,Low,Secondary,1
4997,261,17,89,3,11.3,Public,Urban,No,No,Low,High,18,Male,Medium,Primary,3
4998,183,15,96,2,15.9,Public,Rural,No,No,Low,Medium,18,Male,Medium,Secondary,1
4999,218,34,100,1,7.0,Public,Urban,Yes,Yes,Medium,Medium,16,Female,High,Uneducated,2


In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [22]:
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
df['School_Location'] = df["School_Location"].apply(lambda x: 1 if x == "Urban" else 0)
df["School_Type"] = df["School_Type"].apply(lambda x: 1 if x == "Private" else 0)


In [23]:
for cols in df.columns:
    if len(df[cols].unique()) == 2:
        df[cols]= le.fit_transform(df[cols])
df

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,0,1,1,1,High,Medium,17,1,Low,Tertiary,2
1,207,14,88,4,2.7,0,0,0,1,High,High,15,1,High,Uneducated,1
2,182,29,87,2,9.6,0,0,1,1,High,Medium,20,0,High,Tertiary,2
3,210,29,99,2,2.6,0,1,0,1,Medium,High,22,0,Medium,Tertiary,1
4,199,12,98,3,8.8,0,1,0,1,Medium,Medium,22,0,Medium,Tertiary,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,183,20,74,2,10.6,0,1,1,0,Low,Low,16,1,Medium,Primary,2
4996,179,0,80,2,20.0,0,0,0,1,Medium,Medium,22,1,Low,Secondary,1
4997,261,17,89,3,11.3,0,1,0,0,Low,High,18,1,Medium,Primary,3
4998,183,15,96,2,15.9,0,0,0,0,Low,Medium,18,1,Medium,Secondary,1


In [24]:
cat_var = ["Parent_Involvement", "IT_Knowledge", "Socioeconomic_Status", "Parent_Education_Level"]

In [25]:
target = df["JAMB_Score"]
target


0       192
1       207
2       182
3       210
4       199
       ... 
4995    183
4996    179
4997    261
4998    183
4999    218
Name: JAMB_Score, Length: 5000, dtype: int64

In [26]:
df.drop(columns=["JAMB_Score"], inplace=True)
df

Unnamed: 0,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,22,78,4,12.4,0,1,1,1,High,Medium,17,1,Low,Tertiary,2
1,14,88,4,2.7,0,0,0,1,High,High,15,1,High,Uneducated,1
2,29,87,2,9.6,0,0,1,1,High,Medium,20,0,High,Tertiary,2
3,29,99,2,2.6,0,1,0,1,Medium,High,22,0,Medium,Tertiary,1
4,12,98,3,8.8,0,1,0,1,Medium,Medium,22,0,Medium,Tertiary,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,20,74,2,10.6,0,1,1,0,Low,Low,16,1,Medium,Primary,2
4996,0,80,2,20.0,0,0,0,1,Medium,Medium,22,1,Low,Secondary,1
4997,17,89,3,11.3,0,1,0,0,Low,High,18,1,Medium,Primary,3
4998,15,96,2,15.9,0,0,0,0,Low,Medium,18,1,Medium,Secondary,1


In [27]:
X = pd.get_dummies(df, columns=cat_var, drop_first=True, dtype=int)
X.iloc[1, :]

Study_Hours_Per_Week                 14.0
Attendance_Rate                      88.0
Teacher_Quality                       4.0
Distance_To_School                    2.7
School_Type                           0.0
School_Location                       0.0
Extra_Tutorials                       0.0
Access_To_Learning_Materials          1.0
Age                                  15.0
Gender                                1.0
Assignments_Completed                 1.0
Parent_Involvement_Low                0.0
Parent_Involvement_Medium             0.0
IT_Knowledge_Low                      0.0
IT_Knowledge_Medium                   0.0
Socioeconomic_Status_Low              0.0
Socioeconomic_Status_Medium           0.0
Parent_Education_Level_Secondary      0.0
Parent_Education_Level_Tertiary       0.0
Parent_Education_Level_Uneducated     1.0
Name: 1, dtype: float64

In [28]:
target

0       192
1       207
2       182
3       210
4       199
       ... 
4995    183
4996    179
4997    261
4998    183
4999    218
Name: JAMB_Score, Length: 5000, dtype: int64

WE CAN DO USER CONVERSION INTO 1,2,3,4 etc or 0,1 in javascript after user select option from drop down

In [29]:
"""from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_var', OneHotEncoder(sparse_output=False, drop='first'), cat_var)
    ],
    remainder="passthrough"
)
df = preprocessor.fit_transform(df)
df"""


'from sklearn.preprocessing import OneHotEncoder\nfrom sklearn.compose import ColumnTransformer\npreprocessor = ColumnTransformer(\n    transformers=[\n        (\'cat_var\', OneHotEncoder(sparse_output=False, drop=\'first\'), cat_var)\n    ],\n    remainder="passthrough"\n)\ndf = preprocessor.fit_transform(df)\ndf'

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

X_norm = scaler.fit_transform(X)
X_norm

array([[ 0.25730762, -0.65739287,  1.50040784, ..., -0.67216087,
         1.76212664, -0.46566198],
       [-0.57311877,  0.39693236,  1.50040784, ..., -0.67216087,
        -0.5674961 ,  2.14748045],
       [ 0.9839307 ,  0.29149984, -0.52909012, ..., -0.67216087,
         1.76212664, -0.46566198],
       ...,
       [-0.26170888,  0.50236488,  0.48565886, ..., -0.67216087,
        -0.5674961 , -0.46566198],
       [-0.46931547,  1.24039254, -0.52909012, ...,  1.48773909,
        -0.5674961 , -0.46566198],
       [ 1.50294719,  1.66212264, -1.5438391 , ..., -0.67216087,
        -0.5674961 ,  2.14748045]], shape=(5000, 20))

In [32]:
from sklearn.model_selection import cross_val_score


In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import _multilayer_perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, target, test_size=0.2, random_state=42)

In [53]:
# Grid-search-ready models dictionary for regression tasks
# Each key maps to {'model': estimator_or_pipeline, 'params': parameter_grid}

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Build model+param grids. Adjust scoring when running GridSearchCV (e.g., 'r2' or 'neg_mean_squared_error').
models_grid = {
    'linear_reg': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False]
        }
    },

    'random_forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 6, 12],
            'min_samples_split': [2, 5]
        }
    },

    'knn': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [3, 5, 8],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    },

    'mlp': {
        'model': MLPRegressor(max_iter=1000, random_state=42),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [1e-4, 1e-3], 
            'learning_rate_init': [1e-3, 1e-2]
        }
    }
}

# Example usage (uncomment to run):
# for name, spec in models_grid.items():
#     print(f"Grid-search for: {name}")
#     gs = GridSearchCV(spec['model'], spec['params'], cv=5, scoring='r2', n_jobs=-1)
#     gs.fit(X_train, y_train)
#     print(name, "best score:", gs.best_score_, "best params:", gs.best_params_)

models_grid

{'linear_reg': {'model': LinearRegression(),
  'params': {'fit_intercept': [True, False]}},
 'random_forest': {'model': RandomForestRegressor(random_state=42),
  'params': {'n_estimators': [100, 200],
   'max_depth': [None, 6, 12],
   'min_samples_split': [2, 5]}},
 'knn': {'model': KNeighborsRegressor(),
  'params': {'n_neighbors': [3, 5, 8],
   'weights': ['uniform', 'distance'],
   'p': [1, 2]}},
 'mlp': {'model': MLPRegressor(max_iter=1000, random_state=42),
  'params': {'hidden_layer_sizes': [(50,), (100,), (50, 50)],
   'activation': ['relu', 'tanh'],
   'alpha': [0.0001, 0.001],
   'learning_rate_init': [0.001, 0.01]}}}

In [54]:
store_score = []

for name, item in models_grid.items():
    print(f"Grid_Search for: {name}")
    gs = GridSearchCV(
        estimator= item['model'],
        param_grid= item['params'],
        n_jobs=-1,
        cv=5,
        scoring="r2",
        return_train_score=True
    )
    gs.fit(X_train, y_train)
    gs.best_estimator_
    store_score.append(f"model: {name}, best_estimator:{gs.best_estimator_}, score:{gs.score(X_test, y_test)}, best_score:{gs.best_score_}")

Grid_Search for: linear_reg
Grid_Search for: random_forest
Grid_Search for: knn
Grid_Search for: mlp




In [55]:
store_score

['model: linear_reg, best_estimator:LinearRegression(), score:0.3671809164160805, best_score:0.3292955694293226',
 'model: random_forest, best_estimator:RandomForestRegressor(max_depth=6, min_samples_split=5, n_estimators=200,\n                      random_state=42), score:0.30248685024342525, best_score:0.28137843366891924',
 "model: knn, best_estimator:KNeighborsRegressor(n_neighbors=8, weights='distance'), score:0.2249698562210133, best_score:0.1807119771154683",
 'model: mlp, best_estimator:MLPRegressor(alpha=0.001, hidden_layer_sizes=(50,), max_iter=1000,\n             random_state=42), score:0.31456328085170604, best_score:0.27201573974230736']

At this point I'll have to analyze features myself and combine them creatively to form new features that can show stronger correlation with the jamb score in a new notebook