# Student Performance Factors

## Random Forest Regressor

#### Install dependencies

In [45]:
import sys
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install tabulate
!{sys.executable} -m pip install openpyxl
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   - -------------------------------------- 2.6/56.8 MB 19.5 MB/s eta 0:00:03
   ------- -------------------------------- 10.2/56.8 MB 29.4 MB/s eta 0:00:02
   ------------ --------------------------- 18.1/56.8 MB 32.0 MB/s eta 0:00:02
   ----------------- ---------------------- 24.9/56.8 MB 33.1 MB/s eta 0:00:01
   ----------------------- ---------------- 33.8/56.8 MB 34.4 MB/s eta 0:00:01
   ---------------------------- ----------- 40.9/56.8 MB 34.1 MB/s eta 0:00:01
   ---------------------------------- ----- 49.0/56.8 MB 35.0 MB/s eta 0:00:01
   ---------------------------------------  56.1/56.8 MB 35.3 MB/s eta 0:00:01
   ---------------------------------------- 56.8/56.8 MB 32.8 MB/s  0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5


#### Utility function for getting MAE for a given dataset

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    float
        Mean absolute error of the validation predictions (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return mean_absolute_error(y_valid, model.predict(X_valid))

#### Load the .csv file with student performance data, output summary statistics

In [15]:
import pandas as pd
# Read the student-performance table from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='StudentPerformanceFactors.csv')

# Last expression of the cell, so the summary statistics are rendered.
# NOTE(review): the rendered summary reports an Exam_Score maximum of 101 --
# confirm whether scores above 100 are valid or a data-entry error.
data.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,67.235659
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,3.890456
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,101.0


#### Split the data into training and validation sets, and identify the categorical and numerical columns

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# Discard every row that has at least one missing value, then separate the
# prediction target from the feature columns.
data = data.dropna(axis=0)
y = data['Exam_Score']
X = data.drop(columns=['Exam_Score'])

# Hold out a validation set; the fixed seed keeps the split reproducible.
train_X_full, val_X_full, train_y, val_y = train_test_split(X, y, random_state=0)

# Low-cardinality text columns are treated as categorical. The dtype is checked
# with the pandas type predicates rather than `== "object"` so that text columns
# are still found when pandas stores them with its dedicated string dtype.
categorical_cols = [
    cname
    for cname, dtype in train_X_full.dtypes.items()
    if (pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype))
    and train_X_full[cname].nunique() < 10
]

# Plain 64-bit integer and float columns are treated as numerical.
numerical_cols = [
    cname
    for cname, dtype in train_X_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Keep only the selected columns; copies protect the full frames from later edits.
my_cols = categorical_cols + numerical_cols
train_X = train_X_full[my_cols].copy()
valid_X = val_X_full[my_cols].copy()

#### Let's try encoding the categorical variables ordinally and see what MAE we get

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data 
# Work on copies so the un-encoded frames stay available to later cells.
label_train_X = train_X.copy()
label_valid_X = valid_X.copy()

# Fit the encoder on the training split only, then reuse it for validation.
# A category that appears only in the validation split is mapped to -1 instead
# of raising, matching the tolerant behaviour of the one-hot approach.
# NOTE(review): by default the integer codes follow the sorted order of the
# category labels, which may not match their semantic order -- pass an explicit
# `categories=` list if the ordering matters.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_train_X[categorical_cols] = ordinal_encoder.fit_transform(train_X[categorical_cols])
label_valid_X[categorical_cols] = ordinal_encoder.transform(valid_X[categorical_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_train_X, label_valid_X, train_y, val_y))

MAE from Approach 2 (Ordinal Encoding):
1.2350595611285267


#### Let's try one-hot encoding the categorical variables to see if this improves our MAE over ordinal encoding

In [37]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
# Fit the one-hot encoder on the training split only; categories that first
# appear in the validation split are encoded as all zeros.
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh_train_values = oh_encoder.fit_transform(train_X[categorical_cols])
oh_valid_values = oh_encoder.transform(valid_X[categorical_cols])

# Label each encoded column with the encoder's generated feature name and
# reuse the original row index so the frames align when concatenated below.
oh_feature_names = oh_encoder.get_feature_names_out()
oh_train_cols = pd.DataFrame(oh_train_values, columns=oh_feature_names, index=train_X.index)
oh_valid_cols = pd.DataFrame(oh_valid_values, columns=oh_feature_names, index=valid_X.index)

# Remove categorical columns (will replace with one-hot encoding)
num_train_X = train_X.drop(categorical_cols, axis=1)
num_valid_X = valid_X.drop(categorical_cols, axis=1)

# Add one-hot encoded columns to numerical features
oh_train_x = pd.concat([num_train_X, oh_train_cols], axis=1)
oh_valid_x = pd.concat([num_valid_X, oh_valid_cols], axis=1)

# Ensure all columns have string type
oh_train_x.columns = oh_train_x.columns.astype(str)
oh_valid_x.columns = oh_valid_x.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(oh_train_x, oh_valid_x, train_y, val_y))

MAE from Approach 3 (One-Hot Encoding):
1.192075235109718


#### Create a pipeline using One-Hot encoding and double-check our MAE

In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Preprocessing for numerical data
# Numerical columns: fill any missing entries with a constant value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Chain the preprocessing and the regressor so both are fitted together.
model = RandomForestRegressor(n_estimators=100, random_state=0)
data_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split; the preprocessing is learned from it as well.
data_pipeline.fit(train_X, train_y)

# The validation split passes through the same fitted preprocessing.
preds = data_pipeline.predict(valid_X)

# Report the mean absolute error on the validation split.
score = mean_absolute_error(val_y, preds)
print('MAE:', score)

MAE: 1.192075235109718


#### Run a cross-validation with 5 folds and get a mean MAE

In [44]:
from sklearn.model_selection import cross_val_score

# The scorer returns negated MAE (higher is better), so flip the sign to get
# one positive MAE value per fold.
scores = -cross_val_score(
    data_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

print("Mean MAE:", scores.mean())

Mean MAE: 1.141177564693589


## XGB Regressor