## Model evaluation

In [3]:
import os
!pip install seaborn   ## install seaborn package 
import seaborn as sns  ## import seaborn package
import matplotlib.pyplot as plt ## import matplotlib package
# Set the working directory
os.chdir('/Users/giorgiocavallo/Desktop/Python/coursera/IBM_data_analysis')

# Get and print the new current working directory
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")  ## f is used to format the string


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Current working directory: /Users/giorgiocavallo/Desktop/Python/coursera/IBM_data_analysis


In [4]:
## read the csv data
import pandas as pd
df = pd.read_csv('automobile.csv')
print(df.head())

   symboling  normalized-losses         make fuel-type aspiration  \
0          3                NaN  alfa-romero       gas        std   
1          3                NaN  alfa-romero       gas        std   
2          1                NaN  alfa-romero       gas        std   
3          2              164.0         audi       gas        std   
4          2              164.0         audi       gas        std   

  num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0          two  convertible          rwd           front        88.6  ...   
1          two  convertible          rwd           front        88.6  ...   
2          two    hatchback          rwd           front        94.5  ...   
3         four        sedan          fwd           front        99.8  ...   
4         four        sedan          4wd           front        99.4  ...   

   engine-size  fuel-system  bore  stroke compression-ratio horsepower  \
0          130         mpfi  3.47    2.68       

In [5]:
# In sample evaluation: In-sample evaluation is the process of evaluating the model using the same data that was used to train the model.
# Out of sample evaluation: Out-of-sample evaluation is the process of evaluating the model using a different data set than the data set used to train the model.
from sklearn.model_selection import train_test_split ## import train_test_split function from sklearn.model_selection
# Split the data into training and testing sets
# The test_size parameter sets the proportion of data that is split into the testing set. In this example, 0.3 indicates that 30% of the data is used for testing.
# The random_state parameter sets a seed for the random number generator, which allows for reproducibility.
# what should I assign to x an y here? to x all variables except price and to y price
x_data = df.drop('price', axis=1) ## assign all the variables except price to X, 1 means column and 0 means row. 
y_data = df['price'] ## assign the price column to y

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0) ## split the data into training and testing sets

## x_data: features or independent variables
## y_data: dataset target or dependent variable df['price']
## X_train: training set features


In [6]:
## Generalization performance: Generalization performance is the ability of a model to perform well on new, unseen data.
## Overfitting: Overfitting occurs when a model learns the detail and noise in the training data to the extent that it 
## negatively impacts the performance of the model on new data.
## Generalization error:Generalization error refers to the difference in performance between a machine learning model on the training data and 
## its performance on unseen data (test/validation data). It measures how well the model can generalize its learned patterns to new, unseen examples.


In [7]:
## cross validation: Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
## The procedure has a single parameter called k that refers to the number of groups that a given data sample is to be split into.
## As such, the procedure is often called k-fold cross-validation. When a specific value for k is chosen, it is used to split the data into k groups.
## Each group is called a fold. The model is trained on k-1 of the folds and evaluated on the kth fold.
  

In [8]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Step 1: Remove all rows with missing values from the original dataframe
df_clean = df.dropna()  # This removes any row that has at least one missing value

# Step 2: Create new x_data and y_data from cleaned dataset
x_data = df_clean.drop('price', axis=1)   ## 1 is column and 0 is row
y_data = df_clean['price']

# Step 3: Separate numeric and categorical columns
# select_dtypes helps us filter columns by their data type
numeric_features = x_data.select_dtypes(include=['int64', 'float64'])    # Get all numeric columns
categorical_features = x_data.select_dtypes(include=['object'])          # Get all string/object columns

# Step 4: Create dummy variables using pandas get_dummies
# drop_first=True removes one category to avoid perfect multicollinearity
# This transforms categorical variables into binary (0/1) columns
dummy_variables = pd.get_dummies(categorical_features, drop_first=True)

# Step 5: Combine numeric features with dummy variables
# axis=1 means we're concatenating horizontally (adding columns)
x_data_encoded = pd.concat([numeric_features, dummy_variables], axis=1) ## is like cbind in R  


# Step 6: Perform cross-validation with the encoded features
# LinearRegression(): Creates a new linear regression model
# x_data_encoded: Our features including both numeric and encoded categorical variables
# y_data: Our target variable (price)
# cv=3: Splits the data into 3 folds for cross-validation
scores = cross_val_score(LinearRegression(), x_data_encoded, y_data, cv=3)

# Print the results
print("Shape of data before cleaning:", df.shape)
print("Shape of data after cleaning:", df_clean.shape)
print("Number of rows removed:", df.shape[0] - df_clean.shape[0])
print("\nCross-validation scores:", scores)           # Shows score for each fold
print("Average CV score:", scores.mean())           # Shows average performance
print("Number of features:", x_data_encoded.shape[1])  # Shows total number of features after encoding



Shape of data before cleaning: (205, 26)
Shape of data after cleaning: (159, 26)
Number of rows removed: 46

Cross-validation scores: [ 0.7992442  -0.12911674 -0.69016955]
Average CV score: -0.006680699095501767
Number of features: 54


In [9]:
## Function cross_val_predict() returns the predicted values for each data point when it's in the testing set.
## The function takes the following parameters:
## model: The machine learning model
## x_data: The features
## y_data: The target variable
## cv: The number of folds
## The function returns an array of predicted values for each data point when it's in the testing set.
## Import the required libraries

from sklearn.model_selection import cross_val_predict  ## from sklearn.model_selection import cross_val_predict
yhat = cross_val_predict(LinearRegression(), x_data_encoded, y_data, cv=3) ## cross_val_predict() function returns the predicted values for each data
## point when it's in the testing set.

print(yhat) ## print the predicted values for each data point when it's in the testing set.



[10977.47449519 14792.70805292 17190.59779579 19763.86876839
 11532.73702454 12163.46063011 13933.56402982 14753.51473473
  3729.32269908  7559.81751971  8087.43955752  8088.1833249
  8904.79706316 13165.84393595 10123.21329967 10479.58143707
 10479.58143707 10728.41091986 15928.3686113   7595.31947877
  9621.34692832  8944.98562839 10293.6158748  10359.59457109
  9815.15834779  9629.24637504 10506.34541796 10724.89984939
 10200.47586722  8556.57349846 10198.74985578 11166.03766625
 25064.74021176  7424.05169538  7910.26697121  7930.8853138
  7978.07011892  7897.58856767 10412.58002724 10731.33607458
 10412.58002724 10731.33607458 10527.54367234 15171.77555123
 21359.37382182 22041.29151217 20517.18216041 23310.1193315
 27558.02534887  5828.89521227  6747.05954629  6994.47965735
  8787.17473353  9250.94585218  8725.01840436  8743.45718323
  8947.06872021  9249.02347736  6332.1084937  10054.92859534
  7323.62228815 10202.54695965 11142.33538028 11894.87617943
 10370.52647766 11131.43348

## Overfitting, underfitting and model selection


In [10]:
# Import required libraries for polynomial regression and data splitting
from sklearn.preprocessing import PolynomialFeatures  # For creating polynomial features
from sklearn.linear_model import LinearRegression    # For linear regression model
from sklearn.model_selection import train_test_split # For splitting data into train/test sets

# Clean data by removing rows with missing values
df_clean = df.dropna()

## Separate features (X) and target variable (y)
## Extract numeric features excluding the price column
numeric_features = df_clean.select_dtypes(include=['int64', 'float64']).drop('price', axis=1)
# Extract categorical features
categorical_features = df_clean.select_dtypes(include=['object'])

# Convert categorical variables into binary (dummy) variables
# drop_first=True removes one category to avoid multicollinearity
dummy_variables = pd.get_dummies(categorical_features, drop_first=True)

# Combine numeric and dummy variables into final feature matrix
X = pd.concat([numeric_features, dummy_variables], axis=1)
y = df_clean['price']  # Target variable (car price)

# Split data into training (70%) and testing (30%) sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize empty list to store R-squared values for each polynomial degree
Rsqu_test = []

# Test different polynomial degrees (1 through 4)
order = [1, 2, 3, 4]
for n in order:
    # Create polynomial features of degree n
    pr = PolynomialFeatures(degree=n)
    # Transform training data into polynomial features
    x_train_pr = pr.fit_transform(x_train)
    # Transform test data using the same polynomial features
    x_test_pr = pr.transform(x_test)
    
    # Create and train linear regression model
    lr = LinearRegression()
    lr.fit(x_train_pr, y_train)
    
    # Calculate and store R-squared score on test set
    r2_score = lr.score(x_test_pr, y_test)
    Rsqu_test.append(r2_score)
    
    # Print R-squared score for current polynomial degree
    print(f"Polynomial degree {n}: R-squared = {r2_score:.4f}")

# Print final summary of R-squared values for all polynomial degrees
print("\nR-squared values for each polynomial degree:")
for degree, r2 in zip(order, Rsqu_test):
    print(f"Degree {degree}: {r2:.4f}")

Polynomial degree 1: R-squared = 0.9162
Polynomial degree 2: R-squared = -8.6131
Polynomial degree 3: R-squared = -22.7896
Polynomial degree 4: R-squared = -17.8442

R-squared values for each polynomial degree:
Degree 1: 0.9162
Degree 2: -8.6131
Degree 3: -22.7896
Degree 4: -17.8442
