<a href="https://colab.research.google.com/github/NIDURANGANI/Colab/blob/main/Lab_Sheet_Regression_Based_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Import Required Libraries


In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Step 2: Load and Inspect Dataset

In [9]:
import pandas as pd
df = pd.read_excel("/content/Students_Performance_data_set (1).xlsx")
df.head()

Unnamed: 0,University Admission year,Gender,Age,H.S.C passing year,Program,Current Semester,Do you have meritorious scholarship ?,Do you use University transportation?,How many hour do you study daily?,How many times do you seat for study in a day?,...,What is you interested area?,What is your relationship status?,Are you engaged with any co-curriculum activities?,With whom you are living with?,Do you have any health issues?,What was your previous SGPA?,Do you have any physical disabilities?,What is your current CGPA?,How many Credit did you have completed?,What is your monthly family income?
0,2018,Male,24,2016,BCSE,12,Yes,No,3,2,...,Data Schince,Single,Yes,Bachelor,No,2.68,No,3.15,75,25000
1,2021,Male,22,2020,BCSE,4,Yes,Yes,3,2,...,Event management,Single,Yes,Family,No,2.68,No,3.15,36,100000
2,2020,Female,21,2019,BCSE,5,No,No,3,3,...,Software,Single,No,Bachelor,No,2.68,No,3.15,50,50000
3,2021,Male,20,2020,BCSE,4,Yes,No,1,3,...,Artificial Intelligence,Single,No,Bachelor,Yes,2.68,No,3.15,36,62488
4,2021,Male,22,2019,BCSE,4,Yes,No,3,1,...,Software,Relationship,No,Bachelor,Yes,2.68,No,3.15,36,50000


# Step 3: Data Preprocessing (Handle missing values, encode, normalize)

In [11]:
# Identify target and feature
target_col = 'What is your current CGPA?'
one_hot_col = 'Status of your English language proficiency'

# Separate column types
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols.remove(target_col)
categorical_cols.remove(one_hot_col)

# Convert 'Average attendance on class' to numeric and re-impute
if 'Average attendance on class' in categorical_cols:
    df['Average attendance on class'] = pd.to_numeric(df['Average attendance on class'], errors='coerce')
    categorical_cols.remove('Average attendance on class')
    numerical_cols.append('Average attendance on class')


# Impute missing values
df[numerical_cols] = SimpleImputer(strategy='median').fit_transform(df[numerical_cols])
df[categorical_cols + [one_hot_col]] = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols + [one_hot_col]])

# Inspect data types in categorical columns before encoding
print("Data types in categorical columns before encoding:")
for col in categorical_cols:
    print(f"{col}: {df[col].apply(type).unique()}")

# Encode categorical data
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])
df = pd.get_dummies(df, columns=[one_hot_col], prefix="English")

# Normalize
scaler = MinMaxScaler()
df[numerical_cols + [target_col]] = scaler.fit_transform(df[numerical_cols + [target_col]])

Data types in categorical columns before encoding:
Gender: [<class 'str'>]
Program: [<class 'str'>]
Do you have meritorious scholarship ?: [<class 'str'>]
Do you use University transportation?: [<class 'str'>]
What is your preferable learning mode?: [<class 'str'>]
Do you use smart phone?: [<class 'str'>]
Do you have personal Computer?: [<class 'str'>]
Did you ever fall in probation?: [<class 'str'>]
Did you ever got suspension?: [<class 'str'>]
Do you attend in teacher consultancy for any kind of academical problems?: [<class 'str'>]
What are the skills do you have ?: [<class 'str'>]
What is you interested area?: [<class 'str'>]
What is your relationship status?: [<class 'str'>]
Are you engaged with any co-curriculum activities?: [<class 'str'>]
With whom you are living with?: [<class 'str'>]
Do you have any health issues?: [<class 'str'>]
Do you have any physical disabilities?: [<class 'str'>]


# Step 4: Train-Test Split

In [12]:
X = df.drop(columns=[target_col])
y = df[target_col]
X_train, X_temp, y_train, y_temp = train_test_split(X, y,
test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp,
test_size=0.50, random_state=42)


# Step 5: Model Training and Evaluation using XGBoost, LightGBM, MLP, Random Forest, SVM

In [13]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"{name} => MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return {"Model": name, "MSE": mse, "RMSE": rmse, "R2": r2}

## Train Models

In [14]:
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
results = []
# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=4)
xgb_model.fit(X_train, y_train)
results.append(evaluate_model("XGBoost", y_test,
xgb_model.predict(X_test)))
# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.05)
lgb_model.fit(X_train, y_train)
results.append(evaluate_model("LightGBM", y_test,
lgb_model.predict(X_test)))
# MLP
mlp_model = MLPRegressor(hidden_layer_sizes=(64,), max_iter=1000)
mlp_model.fit(X_train, y_train)
results.append(evaluate_model("MLP", y_test,
mlp_model.predict(X_test)))
# Random Forest

XGBoost => MSE: 0.0091, RMSE: 0.0956, R²: 0.7104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 30
[LightGBM] [Info] Start training from score 0.788756
LightGBM => MSE: 0.0092, RMSE: 0.0961, R²: 0.7075
MLP => MSE: 0.0258, RMSE: 0.1605, R²: 0.1839
