In [102]:
# import librearies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import missingno as msno

import warnings
warnings.filterwarnings("ignore")


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler


In [103]:
path = r"G:\VS CODE\students_marks-24\notebook\prepare_train.csv"
df_train = pd.read_csv(path)
df_train.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [104]:
# Separate features (X) and target (y)
X = df_train.drop(columns=['math_score'])
y = df_train['math_score']

from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [98]:
df_train.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

In [105]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Define transformers for numeric and categorical columns
numeric_features = ['reading_score', 'writing_score']
cat_one_features = ['gender','test_preparation_course']
cat_label_features = ['race_ethnicity', 'parental_level_of_education', 'lunch']

# Combine transformers for numeric and categorical features using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('one', OneHotEncoder(), cat_one_features),
        ('ord', OrdinalEncoder(), cat_label_features),
    ],
    remainder='passthrough'
)


# 1. LogisticRegression

In [106]:
from sklearn.linear_model import LinearRegression

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the pipeline
lr_pipeline.fit(X_train, y_train)

In [107]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = lr_pipeline.predict(X_train)
test_predictions = lr_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)


# Print the results
print(f'Training Set R-squared score for LR: {train_r2}')
print(f'Test Set R-squared score for LR: {test_r2}')
print(f'Training Set Mean Squared Error for LR: {train_mse}')
print(f'Test Set Mean Squared Error for LR: {test_mse}')

print(f'Training Set Root Mean Squared Error for LR: {train_rmse}')
print(f'Test Set Root Mean Squared Error for LR: {test_rmse}')

Training Set R-squared score for LR: 0.8675137097173807
Test Set R-squared score for LR: 0.8617897267279007
Training Set Mean Squared Error for LR: 30.527278821846846
Test Set Mean Squared Error for LR: 31.318669059894948
Training Set Root Mean Squared Error for LR: 5.525149665108344
Test Set Root Mean Squared Error for LR: 5.596308520792519


In [108]:
from sklearn.model_selection import cross_val_score, KFold

# Use KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get R-squared scores
cross_val_results = cross_val_score(lr_pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the R-squared scores for each fold
print("Cross-Validation R-squared scores for LR:", cross_val_results)

# Print the mean and standard deviation of R-squared scores
print("Mean R-squared for LR:", cross_val_results.mean())
print("Standard Deviation of R-squared for LR:", cross_val_results.std())

Cross-Validation R-squared scores for LR: [0.86515818 0.8783954  0.85489636 0.8480901  0.86777367]
Mean R-squared for LR: 0.8628627441953698
Standard Deviation of R-squared for LR: 0.010510411735958795


# Lasso

In [109]:
from sklearn.linear_model import Lasso
# Lasso Regression Pipeline
la_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])

# Fit the pipeline
la_pipeline.fit(X_train, y_train)

In [110]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = la_pipeline.predict(X_train)
test_predictions = la_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

# Print the results
print(f'Training Set R-squared score for La: {train_r2}')
print(f'Test Set R-squared score for La: {test_r2}')
print(f'Training Set Mean Squared Error for La: {train_mse}')
print(f'Test Set Mean Squared Error for La: {test_mse}')
print(f'Training Set Root Mean Squared Error for La: {train_rmse}')
print(f'Test Set Root Mean Squared Error for La: {test_rmse}')

Training Set R-squared score for La: 0.8146852099249361
Test Set R-squared score for La: 0.8025934363089279
Training Set Mean Squared Error for La: 42.699937135877754
Test Set Mean Squared Error for La: 44.732643182898855
Training Set Root Mean Squared Error for La: 6.534518890926688
Test Set Root Mean Squared Error for La: 6.688246644891234


In [111]:
from sklearn.model_selection import cross_val_score, KFold

# Use KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get R-squared scores
cross_val_results = cross_val_score(la_pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the R-squared scores for each fold
print("Cross-Validation R-squared scores for La:", cross_val_results)

# Print the mean and standard deviation of R-squared scores
print("Mean R-squared for La:", cross_val_results.mean())
print("Standard Deviation of R-squared for La:", cross_val_results.std())

Cross-Validation R-squared scores for La: [0.84132021 0.80247965 0.79673972 0.80962323 0.80568642]
Mean R-squared for La: 0.8111698463535104
Standard Deviation of R-squared for La: 0.01565408770903558


# Ridge

In [112]:
from sklearn.linear_model import Ridge

# Ridge Regression Pipeline
ri_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

# Fit the pipeline
ri_pipeline.fit(X_train, y_train)

In [113]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = ri_pipeline.predict(X_train)
test_predictions = ri_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

# Print the results
print(f'Training Set R-squared score for Ri: {train_r2}')
print(f'Test Set R-squared score for La: {test_r2}')
print(f'Training Set Mean Squared Error for Ri: {train_mse}')
print(f'Test Set Mean Squared Error for Ri: {test_mse}')
print(f'Training Set Root Mean Squared Error for Ri: {train_rmse}')
print(f'Test Set Root Mean Squared Error for Ri: {test_rmse}')

Training Set R-squared score for Ri: 0.8675083988034364
Test Set R-squared score for La: 0.8618034630776855
Training Set Mean Squared Error for Ri: 30.528502554132142
Test Set Mean Squared Error for Ri: 31.315556381055536
Training Set Root Mean Squared Error for Ri: 5.5252604060018875
Test Set Root Mean Squared Error for Ri: 5.596030412806522


In [114]:
from sklearn.model_selection import cross_val_score, KFold

# Use KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get R-squared scores
cross_val_results = cross_val_score(ri_pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the R-squared scores for each fold
print("Cross-Validation R-squared scores for Ri:", cross_val_results)

# Print the mean and standard deviation of R-squared scores
print("Mean R-squared for Ri:", cross_val_results.mean())
print("Standard Deviation of R-squared for Ri:", cross_val_results.std())

Cross-Validation R-squared scores for Ri: [0.8657029  0.87818891 0.85468591 0.84834818 0.86760051]
Mean R-squared for Ri: 0.862905281115842
Standard Deviation of R-squared for Ri: 0.010420444949726106


#  RandomForestRegressor

In [115]:
from sklearn.ensemble import RandomForestRegressor

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

rf_pipeline.fit(X_train, y_train)

In [117]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = rf_pipeline.predict(X_train)
test_predictions = rf_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

# Print the results
print(f'Training Set R-squared score for RF: {train_r2}')
print(f'Test Set R-squared score for RF: {test_r2}')
print(f'Training Set Mean Squared Error for RF: {train_mse}')
print(f'Test Set Mean Squared Error for RF: {test_mse}')
print(f'Training Set Root Mean Squared Error for RF: {train_rmse}')
print(f'Test Set Root Mean Squared Error for RF: {test_rmse}')

Training Set R-squared score for RF: 0.9769253585316069
Test Set R-squared score for RF: 0.8523690687875892
Training Set Mean Squared Error for RF: 5.316821931666667
Test Set Mean Squared Error for RF: 33.453405222222216
Training Set Root Mean Squared Error for RF: 2.3058234823304815
Test Set Root Mean Squared Error for RF: 5.783891875045921


In [118]:
from sklearn.model_selection import cross_val_score, KFold

# Use KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get R-squared scores
cross_val_results = cross_val_score(rf_pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the R-squared scores for each fold
print("Cross-Validation R-squared scores for RF:", cross_val_results)

# Print the mean and standard deviation of R-squared scores
print("Mean R-squared for RF:", cross_val_results.mean())
print("Standard Deviation of R-squared for RF:", cross_val_results.std())

Cross-Validation R-squared scores for RF: [0.82258531 0.82417398 0.82343044 0.78587505 0.85173237]
Mean R-squared for RF: 0.8215594288868463
Standard Deviation of R-squared for RF: 0.020953137140832887


# GradientBoostingRegressor

In [119]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a pipeline with the preprocessor and a GradientBoostingRegressor
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())
])

# Fit the pipeline on the training data
gb_pipeline.fit(X_train, y_train)

In [120]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = gb_pipeline.predict(X_train)
test_predictions = gb_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

# Print the results
print(f'Training Set R-squared score for GB: {train_r2}')
print(f'Test Set R-squared score for GB: {test_r2}')
print(f'Training Set Mean Squared Error for GB: {train_mse}')
print(f'Test Set Mean Squared Error for GB: {test_mse}')
print(f'Training Set Root Mean Squared Error for GB: {train_rmse}')
print(f'Test Set Root Mean Squared Error for GB: {test_rmse}')

Training Set R-squared score for GB: 0.9077499644369027
Test Set R-squared score for GB: 0.8640240896885281
Training Set Mean Squared Error for GB: 21.2561054502513
Test Set Mean Squared Error for GB: 30.812358838036044
Training Set Root Mean Squared Error for GB: 4.610434410145242
Test Set Root Mean Squared Error for GB: 5.550888112548842


In [121]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to search
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5],
    # Exclude min_child_weight for GradientBoostingRegressor
    # 'regressor__min_child_weight': [1, 2, 3],
}


# Create the GridSearchCV object
grid_search = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
test_predictions = best_model.predict(X_test)

Best Hyperparameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 100}


In [122]:
from sklearn.model_selection import cross_val_score, KFold

# Use KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get R-squared scores
cross_val_results = cross_val_score(gb_pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the R-squared scores for each fold
print("Cross-Validation R-squared scores for GB:", cross_val_results)

# Print the mean and standard deviation of R-squared scores
print("Mean R-squared for GB:", cross_val_results.mean())
print("Standard Deviation of R-squared for GB:", cross_val_results.std())

Cross-Validation R-squared scores for GB: [0.86558797 0.86332946 0.82659807 0.8167091  0.8646807 ]
Mean R-squared for GB: 0.84738106042734
Standard Deviation of R-squared for GB: 0.021250043831646623


#  XGBRegressor

In [123]:
from xgboost import XGBRegressor

# Create a pipeline with the preprocessor and an XGBRegressor
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', XGBRegressor())])

# Fit the pipeline on the training data
xgb_pipeline.fit(X_train, y_train)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions on both the training and test sets
train_predictions = xgb_pipeline.predict(X_train)
test_predictions = xgb_pipeline.predict(X_test)

# Calculate R-squared scores
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Evaluate Mean Squared Errors
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# Evaluate Root Mean Squared Errors
train_rmse = mean_squared_error(y_train, train_predictions, squared=False)
test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

# Print the results
print(f'Training Set R-squared score for XGB: {train_r2}')
print(f'Test Set R-squared score for XGB: {test_r2}')
print(f'Training Set Mean Squared Error for XGB: {train_mse}')
print(f'Test Set Mean Squared Error for XGB: {test_mse}')
print(f'Training Set Root Mean Squared Error for XGB: {train_rmse}')
print(f'Test Set Root Mean Squared Error for XGB: {test_rmse}')

# 6. GradientBoostingClassifier

In [87]:
from sklearn.ensemble import GradientBoostingClassifier

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

gb_pipeline.fit(X_train, y_train)

In [88]:
y_pred_train_gb = gb_pipeline.predict(X_train)
y_pred_test_gb = gb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)

print(f"Accuracy_train_GB: {accuracy_train_gb}\nAccuracy_test_gb: {accuracy_test_gb}")

cm = confusion_matrix(y_test, y_pred_test_gb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_gb = classification_report(y_test, y_pred_test_gb)

print(f"Classification_report:\n{classification_report_gb}")

Accuracy_train_GB: 0.8322222222222222
Accuracy_test_gb: 0.68
Confusion_matrix:
 [[54  9]
 [23 14]]
Classification_report:
              precision    recall  f1-score   support

           0       0.70      0.86      0.77        63
           1       0.61      0.38      0.47        37

    accuracy                           0.68       100
   macro avg       0.65      0.62      0.62       100
weighted avg       0.67      0.68      0.66       100



In [23]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(gb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.70 (+/- 0.01)


In [24]:
# Define the hyperparameter grid to search
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the best hyperparameters on the entire training set
best_gb_model = gb_pipeline.set_params(**best_params)
best_gb_model.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = best_gb_model.predict(X_train)
accuracy = accuracy_score(y_train, y_val_pred)
print(f"Validation Accuracy: {accuracy}")

# Test the final model on the test set
y_test_pred = best_gb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Best Hyperparameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Validation Accuracy: 0.84
Test Accuracy: 0.65
