In [1]:
# Basic Imports
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling

from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings



In [2]:
df = pd.read_csv('college_student.csv') # To create dataframe
df

Unnamed: 0,student_id,age,gender,major,GPA,course_load,avg_course_grade,attendance_rate,enrollment_status,lms_logins_past_month,avg_session_duration_minutes,assignment_submission_rate,forum_participation_count,video_completion_rate,risk_level
0,S001,24,Other,Computer Science,2.42,5,67.2,0.71,Graduated,32,33,0.70,8,0.74,High
1,S002,21,Male,Arts,3.73,6,64.4,0.84,Leave,29,53,0.91,13,0.85,Medium
2,S003,22,Male,Computer Science,2.80,3,95.3,0.89,Graduated,34,69,0.58,18,0.99,Medium
3,S004,24,Male,Arts,2.59,4,73.7,0.98,Graduated,22,18,0.91,6,0.72,Medium
4,S005,20,Other,Computer Science,2.30,4,87.4,0.95,Active,9,11,0.77,15,0.76,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,S1541,24,Male,Business,3.61,6,70.5,0.77,Leave,26,64,0.87,13,0.51,Medium
1541,S1542,24,Other,Business,2.80,4,77.4,0.96,Graduated,12,19,0.61,2,0.55,Medium
1542,S1543,21,Male,Computer Science,3.61,5,66.3,0.99,Leave,1,89,0.92,18,0.65,Low
1543,S1544,22,Male,Arts,3.18,4,97.9,0.85,Graduated,20,15,0.71,19,0.78,Low


In [3]:
df.head() # It will shows top 5 records.

Unnamed: 0,student_id,age,gender,major,GPA,course_load,avg_course_grade,attendance_rate,enrollment_status,lms_logins_past_month,avg_session_duration_minutes,assignment_submission_rate,forum_participation_count,video_completion_rate,risk_level
0,S001,24,Other,Computer Science,2.42,5,67.2,0.71,Graduated,32,33,0.7,8,0.74,High
1,S002,21,Male,Arts,3.73,6,64.4,0.84,Leave,29,53,0.91,13,0.85,Medium
2,S003,22,Male,Computer Science,2.8,3,95.3,0.89,Graduated,34,69,0.58,18,0.99,Medium
3,S004,24,Male,Arts,2.59,4,73.7,0.98,Graduated,22,18,0.91,6,0.72,Medium
4,S005,20,Other,Computer Science,2.3,4,87.4,0.95,Active,9,11,0.77,15,0.76,High


In [4]:
x = df.drop(columns = ['age'], axis = 1)
x.head()

Unnamed: 0,student_id,gender,major,GPA,course_load,avg_course_grade,attendance_rate,enrollment_status,lms_logins_past_month,avg_session_duration_minutes,assignment_submission_rate,forum_participation_count,video_completion_rate,risk_level
0,S001,Other,Computer Science,2.42,5,67.2,0.71,Graduated,32,33,0.7,8,0.74,High
1,S002,Male,Arts,3.73,6,64.4,0.84,Leave,29,53,0.91,13,0.85,Medium
2,S003,Male,Computer Science,2.8,3,95.3,0.89,Graduated,34,69,0.58,18,0.99,Medium
3,S004,Male,Arts,2.59,4,73.7,0.98,Graduated,22,18,0.91,6,0.72,Medium
4,S005,Other,Computer Science,2.3,4,87.4,0.95,Active,9,11,0.77,15,0.76,High


In [5]:
# Define the how many categarical values in each categary :

print("Categaries in 'gender' variable:", end = " ")
print(df['gender'].unique())

print("Categaries in 'major' variable:", end = " ")
print(df['major'].unique())

print("Categaries in 'enrollment_status' variable:", end = " ")
print(df['enrollment_status'].unique())

print("Categaries in 'risk_level' variable:", end = " ")
print(df['risk_level'].unique())


Categaries in 'gender' variable: ['Other' 'Male' 'Female']
Categaries in 'major' variable: ['Computer Science' 'Arts' 'Engineering' 'Business']
Categaries in 'enrollment_status' variable: ['Graduated' 'Leave' 'Active']
Categaries in 'risk_level' variable: ['High' 'Medium' 'Low']


In [6]:
y = df['age']

In [7]:
# Create column transformer types :

num_features = x.select_dtypes(exclude ='object').columns
cat_features = x.select_dtypes(include = 'object').columns


In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_transform = StandardScaler()
obj_transform = OneHotEncoder()

preprocessor = ColumnTransformer(
    [('OneHotEncoder', obj_transform, cat_features),
    ('StandardScaler', num_transform, num_features),
    ]
)

In [9]:
x = preprocessor.fit_transform(x)
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21630 stored elements and shape (1545, 1567)>

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)
x_train.shape, x_test.shape

((1236, 1567), (309, 1567))

# Create an Evaluate Function to give all metrics after model training :


In [11]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    
    return mae,rmse,r2_square

In [12]:
models = {
    "Linear_Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighbors_Regressor": KNeighborsRegressor(),
    "Decision_Tree": DecisionTreeRegressor(),
    "RandomForest_Regressor": RandomForestRegressor(),
    "XGB_Regressor": XGBRegressor(),
    "CatBoost_Regressor": CatBoostRegressor(verbose=False)
}

models_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train Model
    
    # Make Predictions :
    
    y_trained_predict = model.predict(x_train)
    y_test_predict = model.predict(x_test)
    
    # Evaluate Train and Test Dataset :
    
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_trained_predict)

    
     
    test_mae, test_rmse,  test_r2 =  evaluate_model(y_test, y_test_predict)
    
    
    print(list(models.keys())[i])
    
    models_list.append(list(models.keys())[i])
    
    print('Model Performance for Training Set:')
    
    
    
    print('_ Root Mean Squared Error: {:.4f}'.format(train_rmse))
    print('_ Mean Absolute Error: {:.4f}'.format(train_mae))
    print('_ R2_Score: {:.4f}'.format(train_r2))
    
    print('.....................')
    
    
    print('Model Performance for Test Set:')
    
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("_ Mean Absolute Error: {:.4f}".format(test_mae))
    print("_ R2_Score: {:.4f}".format(test_r2))    
    
    r2_list.append(test_r2)
    
    print('*'*35)
    print('\n')
    

Linear_Regression
Model Performance for Training Set:
_ Root Mean Squared Error: 0.0002
_ Mean Absolute Error: 0.0001
_ R2_Score: 1.0000
.....................
Model Performance for Test Set:
- Root Mean Squared Error: 2.3074
_ Mean Absolute Error: 1.9954
_ R2_Score: -0.0291
***********************************


Lasso
Model Performance for Training Set:
_ Root Mean Squared Error: 2.3054
_ Mean Absolute Error: 2.0073
_ R2_Score: 0.0000
.....................
Model Performance for Test Set:
- Root Mean Squared Error: 2.2765
_ Mean Absolute Error: 1.9822
_ R2_Score: -0.0017
***********************************


Ridge
Model Performance for Training Set:
_ Root Mean Squared Error: 1.1414
_ Mean Absolute Error: 0.9821
_ R2_Score: 0.7549
.....................
Model Performance for Test Set:
- Root Mean Squared Error: 2.3072
_ Mean Absolute Error: 1.9953
_ R2_Score: -0.0289
***********************************


KNeighbors_Regressor
Model Performance for Training Set:
_ Root Mean Squared Error: 2

# Results :

In [None]:
pd.Dataframe(list(zip(models_list, r2_list)), columns = ['Model_Name', 'R2_Score']).sort_values(by = ['R2_Score'], ascending = False)

AttributeError: module 'pandas' has no attribute 'dataframe'