In [8]:
#Required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [9]:
#Procedure 
#Step 1: import required libraries
#Step 2: Load dataset
#Step 3: Rename the blank column to 'model'
#Step 4: Display first few rows of the dataframe
#Step 5: Define target variable and features
#Step 6: Split dataset into training and testing sets
#Step 7: Evaluate model using cross-validation
#Step 8: Calculate and display the mean cross-validation score by evaluating different feature combinations
#Step 9: Build final model using the best feature combination identified through cross-validation
#Step 10: Use polynomial regression to capture non-linear relationships (done for a single feature combination as an example)

#This procedure can be used to validate different datasets and models by following the same steps.

#Steps 2-3:
# Read the mtcars CSV and, in the same expression, give the unnamed first
# column (which holds the car names) the label 'model'. Chaining keeps the
# cell idempotent: re-running it always starts from the file on disk.
dfcars = pd.read_csv('mtcars.csv').rename(columns={'Unnamed: 0': 'model'})

#Step 4:
# Preview the first few rows of the dataframe
dfcars.head()


Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [10]:
#Step 5:
# Separate the target variable from the candidate features

# 'mpg' is the quantity we want to predict from the other columns.
y = dfcars.loc[:, 'mpg']

# Everything except the car name ('model') and the target ('mpg') is kept as a
# candidate feature; those two are the first two columns of the dataframe.
allX = dfcars.drop(['model', 'mpg'], axis='columns')
allX.head()


Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [11]:
#Step 6:
# Hold out 20% of the rows as a test set and keep the remaining 80% for training.
# random_state is fixed so the same split is produced on every run.
allX_train, allX_test, y_train, y_test = train_test_split(
    allX,
    y,
    test_size=0.2,
    random_state=42,
)

## Validation.

Below we use cross-validation on the training set to compare linear regression models built from different feature sets.

In [12]:
#Step 7:
# Evaluate a linear regression model with K-fold cross-validation.
# Cross-validation is run on the training set only, so the test set stays
# untouched while we compare candidate feature sets.
from sklearn.model_selection import cross_val_score

# First candidate: use the single feature 'wt' to predict 'mpg'
X_train = allX_train[['wt']]
X_test = allX_test[['wt']]

# cv=5 requests 5-fold cross-validation explicitly rather than relying on the
# library default. `scores` holds one value per fold, computed with the
# estimator's default scorer.
model = LinearRegression()
scores = cross_val_score(model, X_train, y_train, cv=5)
scores


array([0.10736385, 0.49488149, 0.86879651, 0.65179584, 0.73272638])

In [13]:
# Average the per-fold scores into a single summary value
np.mean(scores)

np.float64(0.5711128154689776)

In [14]:
# Second candidate: two features, 'wt' and 'hp', to predict 'mpg'
X_train = allX_train[['wt', 'hp']]
X_test = allX_test[['wt', 'hp']]

# Fresh estimator, scored on the training set with the fold count stated
# explicitly (cv=5) instead of left to the library default.
model2 = LinearRegression()
scores = cross_val_score(model2, X_train, y_train, cv=5)
scores


array([0.1262545 , 0.6744817 , 0.76712479, 0.76448681, 0.83404541])

In [15]:
# Mean cross-validation score for the ['wt', 'hp'] model
np.mean(scores)

np.float64(0.6332786406507414)

In [16]:
# Third candidate: three features, 'wt', 'hp' and 'carb', to predict 'mpg'
X_train = allX_train[['wt', 'hp', 'carb']]
X_test = allX_test[['wt', 'hp', 'carb']]

# Fresh estimator, scored on the training set with the fold count stated
# explicitly (cv=5) instead of left to the library default.
model3 = LinearRegression()
scores = cross_val_score(model3, X_train, y_train, cv=5)
scores

array([0.07601333, 0.67448205, 0.74864389, 0.70817736, 0.81098304])

In [17]:
# Mean cross-validation score for the ['wt', 'hp', 'carb'] model
np.mean(scores)

np.float64(0.6036599344425266)

In [18]:
# Fourth candidate: swap 'carb' for 'drat', i.e. use 'wt', 'hp' and 'drat'
X_train = allX_train[['wt', 'hp', 'drat']]
X_test = allX_test[['wt', 'hp', 'drat']]

# Fresh estimator, scored on the training set with the fold count stated
# explicitly (cv=5) instead of left to the library default; only the mean of
# the per-fold scores is displayed.
model4 = LinearRegression()
scores = cross_val_score(model4, X_train, y_train, cv=5)
scores.mean()

np.float64(0.6571279909053492)

### About validation

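From the results above, adding a relevant feature can improve the model: the mean cross-validation score rose from 0.571 
with 'wt' alone to 0.633 with 'wt' and 'hp', and to 0.657 with 'wt', 'hp', and 'drat'. Not every extra feature helps, 
however: adding 'carb' to 'wt' and 'hp' lowered the mean score to 0.604. This suggests that 'wt', 'hp', and 'drat' are 
important predictors for 'mpg' in this dataset.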
Validation tests confirm that incorporating multiple relevant features enhances the predictive accuracy of the 
linear regression model for 'mpg'.
Validation tests involve assessing the model's performance using cross-validation scores across different feature combinations.
The training data is split into multiple folds, and the model is trained and evaluated on these folds to ensure 
robustness and generalizability.
This process helps in identifying the best feature combinations for predicting 'mpg' while minimizing overfitting.

Our best so far (according to the mean cross-validation scores) has been model4, the one with ["wt","hp","drat"]. So let's fit that model on the full training set and then evaluate it on the test set.

In [19]:
#Step 9:

# Final model: refit on the whole training set using the feature combination
# with the highest mean cross-validation score, then evaluate on the held-out
# test set.
X_train = allX_train.loc[:, ['wt', 'hp', 'drat']]
X_test = allX_test.loc[:, ['wt', 'hp', 'drat']]

# fit() returns the fitted estimator, so construction and fitting can be chained
model_final = LinearRegression().fit(X_train, y_train)
y_pred = model_final.predict(X_test)

# Displayed value is the model's score on the test set
model_final.score(X_test, y_test)


0.79004928438052

### Polynomial regression 


In [20]:
#Step 10:
### Polynomial regression
#We will use polynomial regression to capture non-linear relationships, if any, between features and target variable.

from sklearn.preprocessing import PolynomialFeatures
#And train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Use 'wt' to predict 'mpg' with polynomial regression.
# The training set is split again into a smaller training set and a validation
# set, so the polynomial degree can be compared without touching the test set.

# Select the single feature from the existing train/test partitions
X_train = allX_train[['wt']]
X_test = allX_test[['wt']]

# Split the training data into training and validation sets.
# random_state is fixed, as in the earlier train/test split, so this split
# (and every result derived from it) is reproducible on a fresh run.
X_v_train, X_v_valid, y_v_train, y_v_valid = train_test_split(
    X_train, y_train, random_state=42
)

In [24]:
# Check the (rows, columns) shape of the reduced training split
X_v_train.shape

(18, 1)

In [None]:
# Fit polynomial regressions of 'mpg' on 'wt' for every degree from 0 to max_p
# inclusive (max_p + 1 models in total).
max_p = 10
degrees = range(max_p+1)
print(len(degrees))
# One slot per degree for the MSE and the score on each split
error_train=np.empty(len(degrees))
error_valid=np.empty(len(degrees))
score_train=np.empty(len(degrees))
score_valid=np.empty(len(degrees))
# For each degree we fit on the smaller training set and predict on the validation set,
# accumulating the MSE in error_train / error_valid and the score in score_train / score_valid.
# These arrays can then be compared to find the degree that minimizes the validation MSE.
for d in degrees:  # increasing polynomial degrees 0, 1, 2, ...
    # Expand the feature into polynomial terms. The transformer is fitted on the
    # training split only and then reused to transform the validation split.
    poly = PolynomialFeatures(d)
    X_c = poly.fit_transform(X_v_train)
    X_c_val = poly.transform(X_v_valid)
    # Fit a model that is linear in the polynomial coefficients
    est = LinearRegression()
    est.fit(X_c, y_v_train)
    # mean_squared_error takes its arguments as (y_true, y_pred)
    error_train[d] = mean_squared_error(y_v_train, est.predict(X_c))
    error_valid[d] = mean_squared_error(y_v_valid, est.predict(X_c_val))
    score_train[d] = est.score(X_c, y_v_train)
    score_valid[d] = est.score(X_c_val, y_v_valid)
    

11


In [27]:
score_train

array([0.        , 0.77779438, 0.86649409, 0.87845174, 0.89382362,
       0.90147355, 0.90230484, 0.90497915, 0.91429584, 0.91675412,
       0.91878451])