In [21]:
import os.path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [22]:
import warnings
warnings.simplefilter('ignore')

In [23]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [24]:
print(os.path.exists("../data/study_performance.csv"))

True


In [25]:
ds = pd.read_csv("../data/study_performance.csv")

In [26]:
print('columns count - ',len(ds.columns), '\n')
print('columns: ',list(ds.columns))

columns count -  8 

columns:  ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'math_score', 'reading_score', 'writing_score']


## Data engineering

In [27]:
# Numerical data
numeric_columns = ds.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    if ds[col].isnull().any():
        median_value = ds[col].mode()[0]
        ds[col].fillna(median_value, inplace=True)

In [28]:
ds.isnull().sum().sum()

0

In [29]:
# Categorical
categorical_columns = ds.select_dtypes(include=['object']).columns

map_dicts = dict()
for column in categorical_columns:
    ds[column] = ds[column].astype('category')
    map_dicts[column] = dict(zip(ds[column], ds[column].cat.codes))
    ds[column] = ds[column].cat.codes

In [30]:
ds.isnull().sum().sum()

0

# Model tuning


#### Define target and features columns

In [31]:
y_column = ['math_score'] # target variable
X_columns = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'reading_score', 'writing_score']
X = ds[X_columns]
y = ds[y_column]

In [32]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# In the first step we will split the data in training and remaining dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(800, 7)
(800, 1)
(200, 7)
(200, 1)


#### Building a Baseline Linear Regression Model

In [34]:
rf = LinearRegression()

In [35]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [37]:
rf.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

#### Hyperparameter Tuning

In [39]:
# Define the parameter grid for Linear Regression
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]  
}

# Instantiate the Linear Regression model
lr = LinearRegression()

# Create GridSearchCV
grid = GridSearchCV(estimator=lr,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',  # Use a regression metric
                    cv=5,
                    verbose=3,
                    return_train_score=True)

# Fit the grid search to the data
grid.fit(X_train, y_train)

# Print the best parameters and best score
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, -grid.best_score_))  # Use negative mean squared error as the score

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END fit_intercept=True, positive=True;, score=(train=-29.114, test=-35.075) total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=True;, score=(train=-29.160, test=-35.358) total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=True;, score=(train=-30.785, test=-28.398) total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=True;, score=(train=-30.936, test=-27.638) total time=   0.0s
[CV 5/5] END fit_intercept=True, positive=True;, score=(train=-30.689, test=-28.838) total time=   0.0s
[CV 1/5] END fit_intercept=True, positive=False;, score=(train=-29.114, test=-35.075) total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=False;, score=(train=-29.160, test=-35.358) total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=False;, score=(train=-30.785, test=-28.398) total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=False;, score=(train=-30.936, test=-27.638) total time=   0.0s


In [40]:
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),
                          pd.DataFrame(grid.cv_results_["mean_test_score"],
                          columns=["precision"])],
                          axis=1)

grid_results

Unnamed: 0,fit_intercept,positive,precision
0,True,True,-31.06121
1,True,False,-31.06121
2,False,True,-34.282032
3,False,False,-34.291093


In [41]:
grid_results.columns

Index(['fit_intercept', 'positive', 'precision'], dtype='object')

In [42]:
grid_contour = grid_results.groupby([  'max_features', 'min_samples_leaf',
       'min_samples_split', 'n_estimators']).mean()
grid_contour

Unnamed: 0,fit_intercept,positive,precision
max_features,1.0,1.0,-31.06121
min_samples_leaf,1.0,0.0,-31.06121
min_samples_split,0.0,1.0,-34.282032
n_estimators,0.0,0.0,-34.291093
