# 1. Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# 2. Load and Explore the Data

In [2]:
# Read the dataset from disk into a DataFrame
file_path = "4.csv"  # Replace with the correct path to your CSV file
data = pd.read_csv(file_path)

# Show the raw frame (pandas truncates long frames when printing)
print(data)

# Number of null entries in each column
missing_counts = data.isnull().sum()
print("Missing Values:\n", missing_counts, "\n")

# dtype of every column
column_dtypes = data.dtypes
print("Data Types:\n", column_dtypes, "\n")

# Descriptive statistics for the numeric columns
numeric_summary = data.describe()
print("Summary of Numerical Columns:\n", numeric_summary, "\n")

# Descriptive statistics for the object (string) columns
categorical_summary = data.describe(include=['object'])
print("Summary of Categorical Columns:\n", categorical_summary, "\n")

# Distinct-value count for each object column
print("Unique Values in Categorical Columns:\n")
object_columns = data.select_dtypes(include=['object']).columns
for col in object_columns:
    print(f"{col}: {data[col].nunique()} unique values")

# Count rows that are exact repeats of an earlier row
duplicates = data.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")


      MODELYEAR   MAKE       MODEL    VEHICLECLASS  ENGINESIZE  CYLINDERS  \
0          2014  ACURA         ILX         COMPACT         2.0          4   
1          2014  ACURA         ILX         COMPACT         2.4          4   
2          2014  ACURA  ILX HYBRID         COMPACT         1.5          4   
3          2014  ACURA     MDX 4WD     SUV - SMALL         3.5          6   
4          2014  ACURA     RDX AWD     SUV - SMALL         3.5          6   
...         ...    ...         ...             ...         ...        ...   
1062       2014  VOLVO    XC60 AWD     SUV - SMALL         3.0          6   
1063       2014  VOLVO    XC60 AWD     SUV - SMALL         3.2          6   
1064       2014  VOLVO    XC70 AWD     SUV - SMALL         3.0          6   
1065       2014  VOLVO    XC70 AWD     SUV - SMALL         3.2          6   
1066       2014  VOLVO    XC90 AWD  SUV - STANDARD         3.2          6   

     TRANSMISSION FUELTYPE  FUELCONSUMPTION_CITY  FUELCONSUMPTION_HWY  \
0 

# 3. Handle Outliers

In [3]:
# Detect and cap outliers using IQR
def cap_outliers(df, column):
    """Cap the values of ``df[column]`` at the 1.5 * IQR fences, in place.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence and
    values above ``Q3 + 1.5 * IQR`` by that upper fence, where Q1/Q3 are the
    25th/75th percentiles of the column. The column is overwritten on ``df``
    itself and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Label of the column to cap.
    """
    quartiles = df[column].quantile([0.25, 0.75])
    q_low, q_high = quartiles.loc[0.25], quartiles.loc[0.75]
    spread = q_high - q_low

    floor = q_low - 1.5 * spread
    ceiling = q_high + 1.5 * spread

    # Raise the low tail first, then lower the high tail.
    raised = np.where(df[column] < floor, floor, df[column])
    df[column] = np.where(raised > ceiling, ceiling, raised)

# Numeric columns to cap at their IQR fences.
# NOTE(review): this list includes 'CO2EMISSIONS', which is used as the
# prediction target further down, so the target itself is capped as well.
columns_with_outliers = [
    'ENGINESIZE',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
    'FUELCONSUMPTION_COMB_MPG',
    'CO2EMISSIONS',
]

# cap_outliers overwrites each column on `data` in place.
for col in columns_with_outliers:
    cap_outliers(df=data, column=col)


# 4. Feature Engineering

### a. Encode Categorical Features

In [4]:
# Categorical columns to expand into indicator (dummy) columns
categorical_features = [
    'MAKE',
    'VEHICLECLASS',
    'TRANSMISSION',
    'FUELTYPE',
]

# drop_first=True omits the first level of each feature, so k categories
# become k-1 indicator columns.
data_encoded = pd.get_dummies(
    data=data,
    columns=categorical_features,
    drop_first=True,
)


### b. Drop High Cardinality/Uninformative Columns

In [5]:
# Drop 'MODEL' and 'MODELYEAR' as they are high cardinality or constant.
# Rebinding the result (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: executing it a second time
# is a no-op instead of raising KeyError for the already-removed columns.
data_encoded = data_encoded.drop(columns=['MODEL', 'MODELYEAR'], errors='ignore')


### c. Scale Numerical Features


In [6]:
# Numeric predictor columns to standardise (the target is not in this list)
numerical_features = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
    'FUELCONSUMPTION_COMB_MPG',
]

# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later in the notebook, so test rows influence the fitted
# mean and variance.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_encoded[numerical_features])
data_encoded[numerical_features] = scaled_values


### d. Handle Skewness in Target Variable

In [7]:
# Measure the skewness of the target; only when it exceeds 1 is the target
# replaced by log(1 + x). When that branch runs, every downstream metric is
# computed on the log scale.
target_skew = data_encoded['CO2EMISSIONS'].skew()
if target_skew > 1:
    log_target = np.log1p(data_encoded['CO2EMISSIONS'])
    data_encoded['CO2EMISSIONS'] = log_target


# 5. Train-Test Split

In [8]:
# Target column on one side, every remaining column as predictors on the other
y = data_encoded['CO2EMISSIONS']
X = data_encoded.drop(columns=['CO2EMISSIONS'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


Training set shape: (853, 83)
Test set shape: (214, 83)


# 6. Model Training and Evaluation
### a. Define Model Evaluation Function

In [9]:
# Function to train and evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training data.
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.

    Returns
    -------
    dict
        ``'Model'`` (the estimator's class name) followed by R², MSE and MAE,
        each reported for the training split and then the test split.
    """
    model.fit(X_train, y_train)

    # Pair each split's true values with the model's predictions for it.
    splits = {
        'Train': (y_train, model.predict(X_train)),
        'Test': (y_test, model.predict(X_test)),
    }
    scorers = (
        ('R²', r2_score),
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
    )

    metrics = {'Model': type(model).__name__}
    # Metric-major, split-minor order keeps the Train/Test columns adjacent.
    for label, scorer in scorers:
        for split_name, (actual, predicted) in splits.items():
            metrics[f'{split_name} {label}'] = scorer(actual, predicted)

    return metrics


### b. Train Models and Collect Results

In [10]:
# Candidate regressors: three linear models and two tree ensembles
models = [
    LinearRegression(),
    Ridge(alpha=1.0),
    Lasso(alpha=0.01),
    RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoostingRegressor(n_estimators=100, random_state=42),
]

# Fit and score every candidate on the same train/test split
results = []
for model in models:
    results.append(evaluate_model(model, X_train, y_train, X_test, y_test))

# One row per model, best test R² first
results_df = pd.DataFrame(results).sort_values(by='Test R²', ascending=False)
print(results_df)


                       Model  Train R²   Test R²  Train MSE   Test MSE  \
4  GradientBoostingRegressor  0.997598  0.995506   9.444736  18.502193   
0           LinearRegression  0.993613  0.994235  25.110448  23.739139   
2                      Lasso  0.993241  0.994060  26.572738  24.459644   
1                      Ridge  0.992621  0.993622  29.009580  26.263313   
3      RandomForestRegressor  0.998759  0.989975   4.876763  41.279868   

   Train MAE  Test MAE  
4   1.969064  2.823175  
0   2.740565  2.767126  
2   2.552233  2.437421  
1   3.046807  2.879904  
3   0.620320  1.953137  


# CONCLUSION:

## **Model Performance Comparison and Conclusion**

The table below summarizes the performance of various regression models based on key evaluation metrics such as R², Mean Squared Error (MSE), and Mean Absolute Error (MAE) for both training and testing datasets.

| **Model**                  | **Train R²** | **Test R²** | **Train MSE** | **Test MSE** | **Train MAE** | **Test MAE** |
|----------------------------|--------------|-------------|---------------|--------------|---------------|--------------|
| GradientBoostingRegressor  | 0.9976       | 0.9955      | 9.4447        | 18.5022      | 1.9691        | 2.8232       |
| LinearRegression           | 0.9936       | 0.9942      | 25.1104       | 23.7391      | 2.7406        | 2.7671       |
| Lasso                      | 0.9932       | 0.9941      | 26.5727       | 24.4596      | 2.5522        | 2.4374       |
| Ridge                      | 0.9926       | 0.9936      | 29.0096       | 26.2633      | 3.0468        | 2.8799       |
| RandomForestRegressor      | 0.9988       | 0.9900      | 4.8768        | 41.2799      | 0.6203        | 1.9531       |

### **Key Observations:**
1. **GradientBoostingRegressor** achieved the best performance overall, with a **Test R² of 0.9955** and the lowest **Test MSE of 18.50**, while maintaining a reasonable **Train MAE of 1.97** and **Test MAE of 2.82**. This model is well-balanced in terms of training and testing performance, indicating minimal overfitting.
   
2. **LinearRegression** and **Lasso** also performed well, achieving **Test R² values of 0.9942** and **0.9941**, respectively. Their **Test MSE** (23.74 and 24.46) is slightly higher than GradientBoostingRegressor's 18.50, but their **Test MAE** (2.77 and 2.44) is actually lower than its 2.82.

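3. **RandomForestRegressor**, despite having the lowest **Train MSE of 4.88** and **Train MAE of 0.62**, underperformed on the test set with a relatively lower **Test R² of 0.9900** and a significantly higher **Test MSE of 41.28**, indicating possible overfitting. It nevertheless has the lowest **Test MAE (1.95)** of all models, which suggests its test error is dominated by a small number of large misses rather than by uniformly worse predictions.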

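4. **Ridge Regression**, while stable, lagged slightly behind the other linear models and GradientBoostingRegressor (**Test R² of 0.9936**, **Test MSE of 26.26**) and has the highest **Test MAE (2.88)**, although it still beats RandomForestRegressor on Test R² and Test MSE.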

### **Conclusion:**
Based on the evaluation metrics, **GradientBoostingRegressor** is the most suitable model for this problem as it provides the best balance between training and testing performance. It is recommended to use this model for predicting **CO2EMISSIONS** in production.
