# Student Performance Factors

## Random Forest Regressor

#### Install dependencies

In [12]:
import sys
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install tabulate
!{sys.executable} -m pip install openpyxl
!{sys.executable} -m pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


#### Utility function for getting MAE for a given dataset

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

#### Load the .csv file with student performance data, output summary statistics

In [48]:
import pandas as pd
data = pd.read_csv('StudentPerformanceFactors.csv')
data.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


#### Split training and validation data, categorical and numerical columns

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
data = data.dropna(axis=0)
y = data['Exam_Score']
X = data.drop(columns=['Exam_Score'], axis=1)

train_X_full, val_X_full, train_y, valid_y = train_test_split(X, y,random_state = 0)

categorical_cols = [cname for cname in train_X_full.columns if train_X_full[cname].nunique() < 10 and 
                        train_X_full[cname].dtype == "object"]
numerical_cols = [cname for cname in train_X_full.columns if train_X_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
train_X = train_X_full[my_cols].copy()
valid_X = val_X_full[my_cols].copy()

#### Let's try encoding categorical variables ordinally to see if this helps our MAE

In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data 
label_train_X = train_X.copy()
label_valid_X = valid_X.copy()

# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
label_train_X[categorical_cols] = ordinal_encoder.fit_transform(train_X[categorical_cols])
label_valid_X[categorical_cols] = ordinal_encoder.transform(valid_X[categorical_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_train_X, label_valid_X, train_y, valid_y))

MAE from Approach 2 (Ordinal Encoding):
1.2350595611285267


#### Let's try encoding categorical variables with one-hot to see if this helps our MAE

In [17]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh_train_cols = pd.DataFrame(oh_encoder.fit_transform(train_X[categorical_cols]))
oh_valid_cols = pd.DataFrame(oh_encoder.transform(valid_X[categorical_cols]))

# One-hot encoding removed index; put it back
oh_train_cols.index = train_X.index
oh_valid_cols.index = valid_X.index

# Remove categorical columns (will replace with one-hot encoding)
num_train_X = train_X.drop(categorical_cols, axis=1)
num_valid_X = valid_X.drop(categorical_cols, axis=1)

# Add one-hot encoded columns to numerical features
oh_train_x = pd.concat([num_train_X, oh_train_cols], axis=1)
oh_valid_x = pd.concat([num_valid_X, oh_valid_cols], axis=1)

# Ensure all columns have string type
oh_train_x.columns = oh_train_x.columns.astype(str)
oh_valid_x.columns = oh_valid_x.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(oh_train_x, oh_valid_x, train_y, valid_y))

MAE from Approach 3 (One-Hot Encoding):
1.192075235109718


#### Create a pipeline using One-Hot encoding and double-check our MAE

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = RandomForestRegressor(n_estimators=100, random_state=0)
data_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
    ]
)

# Preprocessing of training data, fit model 
data_pipeline.fit(train_X, train_y)

# Preprocessing of validation data, get predictions
preds = data_pipeline.predict(valid_X)

# Evaluate the model
score = mean_absolute_error(valid_y, preds)
print('MAE:', score)

MAE: 1.192075235109718


#### Run a cross-validation with 5 folds and get a mean MAE

In [None]:
from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(data_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("Mean MAE:", scores.mean())

Mean MAE: 1.141177564693589


## XGB Regressor

#### Run an XGBRegressor model & compare the MAE

In [51]:
from xgboost import XGBRegressor

X[categorical_cols] = X[categorical_cols].astype("category")

train_X_full_xgb, valid_X_full_xgb, train_y, valid_y = train_test_split(X, y,random_state = 0)

my_cols = categorical_cols + numerical_cols
train_X_xgb = train_X_full_xgb[my_cols].copy()
valid_X_xgb = valid_X_full_xgb[my_cols].copy()

xgb_model = XGBRegressor(tree_method="hist", enable_categorical=True, n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
xgb_model.fit(train_X_xgb, train_y, eval_set=[(valid_X_xgb, valid_y)], verbose=False)

predictions = xgb_model.predict(valid_X_xgb)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, valid_y)))

Mean Absolute Error: 0.9041096568107605
