


# **Model Development**


Train several regression models and compare their predictive performance using error metrics such as RMSE (Root Mean Squared Error) and MAE (Mean Absolute Error).

# **Mount Google Drive**
Mount Google Drive to access files stored in 'MyDrive'

Use `force_remount=True` so that the drive is remounted even if it was already mounted in this session.

In [None]:

import os

from google.colab import drive

# Mount Google Drive; force_remount=True remounts even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Check only for the file this notebook needs. Printing the whole MyDrive
# listing puts every unrelated (possibly private) file name into the saved
# notebook output.
cleaned_data_csv = '/content/drive/MyDrive/car_price_cleaned_data.csv'
print(f"{cleaned_data_csv} exists: {os.path.exists(cleaned_data_csv)}")


Mounted at /content/drive
['Getting started.pdf', 'Unlock Indusoft.rar', 'Wonderware InduSoft Web Studio v8.0 KegGen.rar', 'InduSoft.Web.Studio.v8.0.patch.3.x64.Crack.Only_pd.rar', 'New recording 1.m4a', 'Instructions for Application Form Urdu.pdf', 'ASF Registration Form & Medical, Physical Test Slips (BPS 01-15).pdf', 'Classroom', 'Capture.PNG', 'WhatsApp Image 2021-02-01 at 22.38.28.jpeg', 'Bramd Hunting (2).xlsx', 'Untitled spreadsheet (16).gsheet', 'Bramd Hunting (1).xlsx', 'Bramd Hunting (1).gsheet', 'Bramd Hunting.gsheet', 'Bramd Hunting.xlsx', 'UAE working', 'brand hunting.xlsx', 'product hunting.xlsx', 'Product hunting.xlsx', 'Keepa_ASIN_Export.2021_08_17.2_products.xlsx', 'Keepa_ASIN_Export.2021_08_17.100_products.xlsx', 'Product_Finder.2021_08_17 (2).products.csv', 'Product_Finder.2021_08_17.products (1) (4).csv', 'Product_Finder.2021_08_17.products (1) (3).csv', 'Product_Finder.2021_08_17.products (1) (2).csv', 'Product_Finder.2021_08_17.products (1) (1).csv', 'Product_Find

# **Import Libraries**

In [None]:
# Import the necessary libraries for data processing, modeling, and evaluation
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# **Load the cleaned data from Google Drive**

In [None]:
import pickle

import pandas as pd

# Read the cleaned car-price CSV from the mounted Drive into a DataFrame.
file_path = '/content/drive/MyDrive/car_price_cleaned_data.csv'
cleaned_data = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check that the load worked.
print(cleaned_data.head())

# Keep a pickled copy of the loaded frame (cleaned_data) on Drive.
with open('/content/drive/MyDrive/df_cleaned.pkl', 'wb') as pkl_out:
    pickle.dump(cleaned_data, pkl_out)

print("cleaned_data saved to Google Drive as pickle file.")


   it      ft         bt        km transmission  ownerNo      oem  \
0   0  Petrol  Hatchback  120000.0       Manual        3   Maruti   
1   0  Petrol        SUV   32706.0       Manual        2     Ford   
2   0  Petrol  Hatchback   11949.0       Manual        1     Tata   
3   0  Petrol      Sedan   17794.0       Manual        1  Hyundai   
4   0  Diesel        SUV   60000.0       Manual        1   Maruti   

                model  modelYear  centralVariantId               variantName  \
0      Maruti Celerio       2015              3979                       VXI   
1       Ford Ecosport       2018              6087  1.5 Petrol Titanium BSIV   
2          Tata Tiago       2018              2983           1.2 Revotron XZ   
3       Hyundai Xcent       2014              1867        1.2 Kappa S Option   
4  Maruti SX4 S Cross       2015              4277             DDiS 200 Zeta   

      price  mileage  Seats  
0  400000.0    23.10    5.0  
1  811000.0    17.00    5.0  
2  585000.0   

# **Apply One-Hot Encoding**

In [None]:
# One-hot encode the categorical columns. In the preview above, 'oem' holds the
# brand, 'ft' values look like fuel types (Petrol/Diesel) and 'bt' values look
# like body types (Hatchback/SUV/Sedan).
# NOTE(review): 'variantName' and 'model' may have many distinct values, which
# would produce a very wide frame - confirm this is intended.
categorical_columns = ['oem', 'variantName', 'model', 'ft', 'bt', 'transmission']

# This cell overwrites `cleaned_data`, so on a second run the source columns
# are already gone and get_dummies would raise a KeyError. Encode only the
# columns that are still present so the cell can be re-run safely.
columns_to_encode = [col for col in categorical_columns if col in cleaned_data.columns]
skipped_columns = [col for col in categorical_columns if col not in cleaned_data.columns]

if columns_to_encode:
    cleaned_data = pd.get_dummies(cleaned_data, columns=columns_to_encode, drop_first=True)
if skipped_columns:
    # Make the skip visible so a genuinely missing column is not silently ignored.
    print(f"Columns not found (already encoded or missing): {skipped_columns}")

# Verify the encoding
print(cleaned_data.head())



   it        km  ownerNo  modelYear  centralVariantId     price  mileage  \
0   0  120000.0        3       2015              3979  400000.0    23.10   
1   0   32706.0        2       2018              6087  811000.0    17.00   
2   0   11949.0        1       2018              2983  585000.0    23.84   
3   0   17794.0        1       2014              1867  462000.0    19.10   
4   0   60000.0        1       2015              4277  790000.0    23.65   

   Seats  oem_BMW  oem_Chevrolet  ...  ft_Petrol  bt_Coupe  bt_Hatchback  \
0    5.0    False          False  ...       True     False          True   
1    5.0    False          False  ...       True     False         False   
2    5.0    False          False  ...       True     False          True   
3    5.0    False          False  ...       True     False         False   
4    5.0    False          False  ...      False     False         False   

   bt_MUV  bt_Minivans  bt_Pickup Trucks  bt_SUV  bt_Sedan  bt_Wagon  \
0   False     

# **Prepare Data for Training**

In [None]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y).
# The encoded frame in this notebook is `cleaned_data`; the name `df_cleaned`
# was never defined and made this cell (and every later cell) fail with a
# NameError.
X = cleaned_data.drop(columns=['price'])  # All features except 'price'
y = cleaned_data['price']  # Target variable (price)

# Split the data into training and testing sets (80% train, 20% test).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


NameError: name 'df_cleaned' is not defined

# **Initialize the models**

In [None]:
# Both tree ensembles share the same size and seed, so define them once.
ensemble_params = {'n_estimators': 100, 'random_state': 42}

# Candidate regressors to compare: a linear baseline and two tree ensembles.
linear_regression_model = LinearRegression()
random_forest_model = RandomForestRegressor(**ensemble_params)
gradient_boosting_model = GradientBoostingRegressor(**ensemble_params)


# **Train the models**

In [None]:
# Fit every candidate on the same training split, in the original order.
for estimator in (linear_regression_model, random_forest_model, gradient_boosting_model):
    estimator.fit(X_train, y_train)

# The original cell ended on the gradient-boosting fit() call; keep that
# model as the cell's last expression so it is still what the cell displays.
gradient_boosting_model


NameError: name 'X_train' is not defined

# **Make predictions on the test set**

In [None]:
# Predict on the held-out test features with each fitted model
# (order: linear regression, random forest, gradient boosting).
y_pred_lr, y_pred_rf, y_pred_gb = (
    fitted_model.predict(X_test)
    for fitted_model in (linear_regression_model, random_forest_model, gradient_boosting_model)
)


NameError: name 'X_test' is not defined

# **Calculate RMSE and MAE for each model**

In [None]:
# Helper that scores one set of predictions with RMSE and MAE
def evaluate_model(y_test, y_pred):
    """Return ``(rmse, mae)`` for predictions ``y_pred`` against ``y_test``.

    RMSE is the square root of ``mean_squared_error``; MAE comes straight
    from ``mean_absolute_error``.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    return np.sqrt(squared_error), mean_absolute_error(y_test, y_pred)

# Score each model's predictions against the same test targets
# (order: linear regression, random forest, gradient boosting).
(rmse_lr, mae_lr), (rmse_rf, mae_rf), (rmse_gb, mae_gb) = (
    evaluate_model(y_test, predictions)
    for predictions in (y_pred_lr, y_pred_rf, y_pred_gb)
)

# Report both error metrics per model, rounded to two decimals
print(f"Linear Regression - RMSE: {rmse_lr:.2f}, MAE: {mae_lr:.2f}")
print(f"Random Forest - RMSE: {rmse_rf:.2f}, MAE: {mae_rf:.2f}")
print(f"Gradient Boosting - RMSE: {rmse_gb:.2f}, MAE: {mae_gb:.2f}")


NameError: name 'y_test' is not defined

# **Compare model performance using bar plots for RMSE and MAE**

In [None]:
# Side-by-side bar charts comparing the three models on RMSE and MAE
import matplotlib.pyplot as plt

# One bar per model, in the same order for both metrics
models = ['Linear Regression', 'Random Forest', 'Gradient Boosting']
rmse_values = [rmse_lr, rmse_rf, rmse_gb]
mae_values = [mae_lr, mae_rf, mae_gb]

# One row, two panels: RMSE on the left, MAE on the right
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# (axes, values, bar colour, title, y-axis label) for each panel
panels = (
    (ax[0], rmse_values, 'skyblue', 'Root Mean Squared Error (RMSE)', 'RMSE'),
    (ax[1], mae_values, 'lightgreen', 'Mean Absolute Error (MAE)', 'MAE'),
)
for axis, values, bar_color, title, y_label in panels:
    axis.bar(models, values, color=bar_color)
    axis.set_title(title)
    axis.set_ylabel(y_label)
    # Log scale on the y-axis, as in the original, for better visualization
    axis.set_yscale('log')

plt.tight_layout()
plt.show()


NameError: name 'rmse_lr' is not defined