In [54]:
import os.path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [55]:
import warnings
warnings.simplefilter('ignore')

In [56]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [57]:
print(os.path.exists("../data/study_performance.csv"))

True


In [58]:
ds = pd.read_csv("../data/study_performance.csv")

In [59]:
print('columns count - ',len(ds.columns), '\n')
print('columns: ',list(ds.columns))

columns count -  8 

columns:  ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'math_score', 'reading_score', 'writing_score']


## Data engineering

In [60]:
# Numerical data
numeric_columns = ds.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    if ds[col].isnull().any():
        median_value = ds[col].mode()[0]
        ds[col].fillna(median_value, inplace=True)

In [61]:
ds.isnull().sum().sum()

0

In [62]:
# Categorical
categorical_columns = ds.select_dtypes(include=['object']).columns

map_dicts = dict()
for column in categorical_columns:
    ds[column] = ds[column].astype('category')
    map_dicts[column] = dict(zip(ds[column], ds[column].cat.codes))
    ds[column] = ds[column].cat.codes

In [63]:
ds.isnull().sum().sum()

0

# Modelling and Validation

#### Define target and features columns

In [64]:
y_column = ['math_score'] # target variable
X_columns = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'reading_score', 'writing_score']
X = ds[X_columns]
y = ds[y_column]

### Definition of Train-Valid-Test Split

In [65]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# In the first step we will split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

# Now since we want the valid and test size to be equal (10% each of overall data).
# we have to define valid_size=0.5 (that is 50% of remaining data)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)
print(y_test.shape)

(800, 7)
(800, 1)
(100, 7)
(100, 1)
(100, 7)
(100, 1)


In [69]:
models = { 'LinearRegression' : LinearRegression() }

In [70]:
regressors = dict()
for name, model in models.items():
    print('training ',name)
    regressor = model
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print(metrics.mean_absolute_error(y_test, y_pred))
    print(metrics.mean_absolute_percentage_error(y_test, y_pred))
    regressors[name] = regressor


training  LinearRegression
4.050674890416868
224884019630211.88


# Random permutations cross-validation

In [73]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in ss.split(X):
    regressor = LinearRegression()
    regressor.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = regressor.predict(X.iloc[test_index])
    
    # Evaluate regression metrics
    mae = metrics.mean_absolute_error(y.iloc[test_index], y_pred)
    mse = metrics.mean_squared_error(y.iloc[test_index], y_pred)
    r2 = metrics.r2_score(y.iloc[test_index], y_pred)
    
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R-squared:', r2)


Mean Absolute Error: 4.402780807752258
Mean Squared Error: 31.318669059894845
R-squared: 0.8617897267279011
