In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv('/content/car_fuel_efficiency.csv')

In [6]:
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [7]:
df.isnull().sum()

Unnamed: 0,0
engine_displacement,0
num_cylinders,482
horsepower,708
vehicle_weight,0
acceleration,930
model_year,0
origin,0
fuel_type,0
drivetrain,0
num_doors,502


In [8]:
numeric_df = df.select_dtypes(include=np.number)
numeric_df.median(axis=1)

Unnamed: 0,0
0,88.350000
1,57.400000
2,78.000000
3,20.200000
4,77.200000
...,...
9699,140.000000
9700,154.000000
9701,77.593293
9702,98.200000


In [9]:
median_horsepower = df['horsepower'].median()
print(f"The median horsepower is: {median_horsepower}")

The median horsepower is: 149.0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Define features (X) and target (y)
X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
categorical_cols = ['origin', 'fuel_type', 'drivetrain']
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
# Remove the target variable from numerical columns if it exists
if 'fuel_efficiency_mpg' in numerical_cols:
    numerical_cols.remove('fuel_efficiency_mpg')


# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = SimpleImputer(strategy='median') # Impute missing numerical values with the median
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # One-hot encode categorical features

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Apply preprocessing to the training and validation sets
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

print("Preprocessing applied successfully.")

Preprocessing applied successfully.


In [10]:
from sklearn.model_selection import train_test_split

X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train['horsepower'] = X_train['horsepower'].fillna(0)
X_val['horsepower'] = X_val['horsepower'].fillna(0)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model_0_imputation = LinearRegression()
model_0_imputation.fit(X_train_processed, y_train)
y_pred_0_imputation = model_0_imputation.predict(X_val_processed)

rmse_0_imputation = np.sqrt(mean_squared_error(y_val, y_pred_0_imputation))
print(f"RMSE with 0 imputation: {rmse_0_imputation:.2f}")

RMSE with 0 imputation: 0.40


In [29]:
model_0_imputation = LinearRegression()
model_0_imputation.fit(X_train_processed, y_train)
y_pred_0_imputation = model_0_imputation.predict(X_val_processed)

rmse_0_imputation = np.sqrt(mean_squared_error(y_val, y_pred_0_imputation))
print(f"RMSE with 0 imputation: {rmse_0_imputation:.2f}")

RMSE with 0 imputation: 0.40


In [14]:
for col in ['num_cylinders', 'acceleration', 'num_doors']:
    median_val = X_train[col].median()
    X_train[col] = X_train[col].fillna(median_val)
    X_val[col] = X_val[col].fillna(median_val)

model_0_imputation = LinearRegression()
model_0_imputation.fit(X_train, y_train)
y_pred_0_imputation = model_0_imputation.predict(X_val)

rmse_0_imputation = np.sqrt(mean_squared_error(y_val, y_pred_0_imputation))
print(f"RMSE with 0 imputation: {rmse_0_imputation:.2f}")

RMSE with 0 imputation: 0.47


In [15]:
mean_horsepower = X_train['horsepower'].mean()
X_train['horsepower'] = X_train['horsepower'].fillna(mean_horsepower)
X_val['horsepower'] = X_val['horsepower'].fillna(mean_horsepower)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model_mean_imputation = LinearRegression()
model_mean_imputation.fit(X_train, y_train)
y_pred_mean_imputation = model_mean_imputation.predict(X_val)

rmse_mean_imputation = np.sqrt(mean_squared_error(y_val, y_pred_mean_imputation))
print(f"RMSE with mean imputation: {rmse_mean_imputation:.2f}")

RMSE with mean imputation: 0.47


In [30]:
print(f"RMSE with 0 imputation: {rmse_0_imputation:.2f}")
print(f"RMSE with mean imputation: {rmse_mean_imputation:.2f}")

if rmse_0_imputation < rmse_mean_imputation:
    print("Imputation with 0 resulted in a lower RMSE.")
elif rmse_mean_imputation < rmse_0_imputation:
    print("Imputation with the mean resulted in a lower RMSE.")
else:
    print("Both imputation methods resulted in the same RMSE.")

RMSE with 0 imputation: 0.40
RMSE with mean imputation: 0.47
Imputation with 0 resulted in a lower RMSE.


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming 'df' is your pre-loaded DataFrame
# df = pd.read_csv('/content/car_fuel_efficiency.csv')

# Define features (X) and target (y)
X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# Create a preprocessing pipeline
# Use SimpleImputer to fill NAs with 0 for numerical columns
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)
# Use OneHotEncoder for categorical columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# List of r values to test
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_results = {}

# Loop through each r value to find the best one
for r in r_values:
    # Define the model with the current r value
    model = Ridge(alpha=r, random_state=42)

    # Create the full pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('regressor', model)])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = pipeline.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_results[r] = round(rmse, 2)

for r, rmse in rmse_results.items():
    print(f"For r = {r}, RMSE = {rmse}")

For r = 0, RMSE = 0.52
For r = 0.01, RMSE = 0.51
For r = 0.1, RMSE = 0.51
For r = 1, RMSE = 0.51
For r = 5, RMSE = 0.51
For r = 10, RMSE = 0.51
For r = 100, RMSE = 0.51


In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming 'df' is your pre-loaded DataFrame
# df = pd.read_csv('/content/car_fuel_efficiency.csv')

# Define features (X) and target (y)
X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# Define the preprocessing pipeline
# Numerical features: Fill missing values with 0
# Categorical features: One-hot encode
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='constant', fill_value=0), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

# Define the model (unregularized)
model = LinearRegression()

# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', model)])

# Iterate through each seed
for seed in seeds:
    # Split data: 60% train, 20% validation, 20% test
    # 1. Split into 80% for train/val and 20% for test
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    # 2. Split the 80% into train (60% of total) and validation (20% of total)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.25, random_state=seed # 0.25 * 0.80 = 0.20
    )

    # Train the entire pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred_val = pipeline.predict(X_val)

    # Calculate RMSE and add it to our list of scores
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_scores.append(rmse)

# Calculate the standard deviation of the RMSE scores
std_deviation = np.std(rmse_scores)

# Round the result to 3 decimal digits
rounded_std = round(std_deviation, 3)

print(f"Standard Deviation of RMSE scores: {rounded_std}")

Standard Deviation of RMSE scores: 0.007


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming 'df' is your pre-loaded DataFrame
# df = pd.read_csv('/content/car_fuel_efficiency.csv')

# Define features (X) and target (y)
X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

# Define the seed and regularization parameter
seed = 9
r_value = 0.001

# Split the data into a combined training/validation set (80%) and a test set (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='constant', fill_value=0), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

# Define the Ridge regression model
model = Ridge(alpha=r_value, random_state=seed)

# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', model)])

# Train the model on the combined 80% of data
pipeline.fit(X_train_val, y_train_val)

# Make predictions on the test set
y_pred_test = pipeline.predict(X_test)

# Calculate the final RMSE on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"RMSE on the test dataset is: {rmse_test:.3f}")

RMSE on the test dataset is: 0.514
