In [155]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
from google.colab import drive
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [156]:
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/car_data.csv'
data = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [157]:
data.shape

(80572, 10)

In [158]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80572 entries, 0 to 80571
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Make             80572 non-null  object
 1   Model            80572 non-null  object
 2   Version          73800 non-null  object
 3   Price            80572 non-null  object
 4   Make_Year        80572 non-null  int64 
 5   CC               80572 non-null  int64 
 6   Assembly         80572 non-null  object
 7   Mileage          80572 non-null  int64 
 8   Registered City  80572 non-null  object
 9   Transmission     80572 non-null  object
dtypes: int64(3), object(7)
memory usage: 6.1+ MB


In [159]:
data['Vehicle_age'] = datetime.now().year - data['Make_Year']
data['Vehicle_age'] = data['Vehicle_age'].round().astype('int64')
data.drop(['Make_Year'], axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80572 entries, 0 to 80571
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Make             80572 non-null  object
 1   Model            80572 non-null  object
 2   Version          73800 non-null  object
 3   Price            80572 non-null  object
 4   CC               80572 non-null  int64 
 5   Assembly         80572 non-null  object
 6   Mileage          80572 non-null  int64 
 7   Registered City  80572 non-null  object
 8   Transmission     80572 non-null  object
 9   Vehicle_age      80572 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 6.1+ MB


In [160]:
data.describe()

Unnamed: 0,CC,Mileage,Vehicle_age
count,80572.0,80572.0,80572.0
mean,1404.083267,85653.66008,12.275791
std,684.458171,82241.870901,6.953399
min,1.0,1.0,3.0
25%,1000.0,36500.0,7.0
50%,1300.0,73000.0,11.0
75%,1600.0,110520.0,17.0
max,10000.0,999999.0,34.0


In [161]:
data.isnull().sum()

Make                  0
Model                 0
Version            6772
Price                 0
CC                    0
Assembly              0
Mileage               0
Registered City       0
Transmission          0
Vehicle_age           0
dtype: int64

In [162]:
data.nunique()

Make                 66
Model               389
Version            1339
Price              2229
CC                  113
Assembly              2
Mileage            8852
Registered City     187
Transmission          2
Vehicle_age          32
dtype: int64

In [163]:
TargetVariables = 'Price'
CategoricalVariables = []
ContinousVariables = []
RedundantVariables = []
PredictorVariables = []

for column in data.columns:
    if data[column].isnull().sum() > 200:
        RedundantVariables.append(column)
    else:
        if data[column].name != 'Price':
            if data[column].dtype == "object":
                CategoricalVariables.append(column)
            elif data[column].dtype == "float64":
                ContinousVariables.append(column)
            elif data[column].dtype == 'int64':
                if data[column].nunique() < 10:
                    CategoricalVariables.append(column)
                else:
                    ContinousVariables.append(column)

In [164]:
CategoricalVariables

['Make', 'Model', 'Assembly', 'Registered City', 'Transmission']

In [165]:
ContinousVariables

['CC', 'Mileage', 'Vehicle_age']

In [166]:
RedundantVariables

['Version']

In [167]:
data[TargetVariables].describe()

count              80572
unique              2229
top       Call for price
freq                1209
Name: Price, dtype: object

In [168]:
experimental_data = data[data['Price'] == 'Call for price']
experimental_data.head()

Unnamed: 0,Make,Model,Version,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
25,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
27,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
43,Toyota,Hilux,Revo V Automatic 2.8,Call for price,2800,Local,5,Un-Registered,Automatic,3
50,Honda,Civic,Oriel 1.8 i-VTEC CVT,Call for price,1800,Local,63000,Islamabad,Automatic,6
78,Toyota,Aqua,S,Call for price,1500,Imported,45000,Un-Registered,Automatic,7


In [169]:
data = data[data['Price'] != 'Call for price']
data.drop(RedundantVariables, axis=1, inplace=True)

In [170]:
data.isnull().sum()

Make               0
Model              0
Price              0
CC                 0
Assembly           0
Mileage            0
Registered City    0
Transmission       0
Vehicle_age        0
dtype: int64

In [171]:
ordinalEncoder = OrdinalEncoder()
data[CategoricalVariables] = ordinalEncoder.fit_transform(data[CategoricalVariables])

In [172]:
data.head(5)

Unnamed: 0,Make,Model,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
0,25.0,168.0,7400000.0,1500,0.0,2000,180.0,0.0,5
1,43.0,212.0,1065000.0,660,0.0,68000,97.0,0.0,5
2,1.0,17.0,9300000.0,1800,1.0,70000,97.0,0.0,9
3,61.0,31.0,2375000.0,1500,0.0,99900,66.0,0.0,10
4,25.0,84.0,2600000.0,1300,1.0,55000,66.0,1.0,7


In [173]:
data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
data['Price'] = data['Price'].round().astype('int64')

In [174]:
sample_size = 2000
sample_data = data.sample(n=sample_size, random_state=42)
features = ContinousVariables + CategoricalVariables
model = RandomForestClassifier()
model.fit(sample_data[features], sample_data[TargetVariables])
feature_importances = model.feature_importances_
feature_importance_dict = dict(zip(features, feature_importances))
sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_feature_importances:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: Mileage, Importance: 0.37111096674837835
Feature: Vehicle_age, Importance: 0.21731782981359915
Feature: Registered City, Importance: 0.14835057781040104
Feature: Model, Importance: 0.10318827029500008
Feature: CC, Importance: 0.0826260776635769
Feature: Make, Importance: 0.04189002547653954
Feature: Assembly, Importance: 0.020758697799882403
Feature: Transmission, Importance: 0.014757554392622677


In [175]:
for feature, _ in sorted_feature_importances:
    PredictorVariables.append(feature)
print("Predictor Variables:", PredictorVariables)

Predictor Variables: ['Mileage', 'Vehicle_age', 'Registered City', 'Model', 'CC', 'Make', 'Assembly', 'Transmission']


In [176]:
data[PredictorVariables].shape

(79363, 8)

In [177]:
data[TargetVariables].shape

(79363,)

In [178]:
data.shape

(79363, 9)

In [179]:
data.head(5)

Unnamed: 0,Make,Model,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
0,25.0,168.0,7400000,1500,0.0,2000,180.0,0.0,5
1,43.0,212.0,1065000,660,0.0,68000,97.0,0.0,5
2,1.0,17.0,9300000,1800,1.0,70000,97.0,0.0,9
3,61.0,31.0,2375000,1500,0.0,99900,66.0,0.0,10
4,25.0,84.0,2600000,1300,1.0,55000,66.0,1.0,7


In [180]:
scaler = MinMaxScaler(feature_range=(0, 1))
target_variable = data[TargetVariables].values.reshape(-1, 1)
scaled_target_variable = scaler.fit_transform(target_variable)
data[TargetVariables] = scaled_target_variable
data.head(10)

Unnamed: 0,Make,Model,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
0,25.0,168.0,0.076923,1500,0.0,2000,180.0,0.0,5
1,43.0,212.0,0.010169,660,0.0,68000,97.0,0.0,5
2,1.0,17.0,0.096944,1800,1.0,70000,97.0,0.0,9
3,61.0,31.0,0.023973,1500,0.0,99900,66.0,0.0,10
4,25.0,84.0,0.026344,1300,1.0,55000,66.0,1.0,7
5,14.0,160.0,0.010643,660,0.0,136000,80.0,1.0,13
6,61.0,94.0,0.02529,1300,1.0,80000,97.0,0.0,7
7,59.0,103.0,0.010854,1000,1.0,51000,97.0,1.0,12
8,61.0,240.0,0.017492,1000,0.0,82000,66.0,0.0,10
9,61.0,256.0,0.02529,1800,0.0,121400,80.0,0.0,13


In [181]:
X = data[PredictorVariables]
y = data[TargetVariables]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)
predictions = rf_regressor.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print("R-squared:", r_squared)


Mean Squared Error: 9.152149975099591e-05
R-squared: 0.9448439744333299


In [182]:
residuals = y_test - predictions
print(residuals.head())

24427    0.001436
71683   -0.001602
61853   -0.000443
56348    0.000273
5652    -0.000918
Name: Price, dtype: float64


In [183]:
residuals.mean()

0.00015375653981978152

In [184]:
experimental_data_original = experimental_data.copy()
experimental_data_original = experimental_data.dropna()
experimental_data_original.head()

Unnamed: 0,Make,Model,Version,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
25,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
27,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
43,Toyota,Hilux,Revo V Automatic 2.8,Call for price,2800,Local,5,Un-Registered,Automatic,3
50,Honda,Civic,Oriel 1.8 i-VTEC CVT,Call for price,1800,Local,63000,Islamabad,Automatic,6
78,Toyota,Aqua,S,Call for price,1500,Imported,45000,Un-Registered,Automatic,7


In [185]:
experimental_data = experimental_data.dropna()
ordinalEncoder = OrdinalEncoder()
experimental_data[CategoricalVariables] = ordinalEncoder.fit_transform(experimental_data[CategoricalVariables])
X_experimental = experimental_data[PredictorVariables]
predictions_experimental = rf_regressor.predict(X_experimental)

In [186]:
predictions_unstandardized = scaler.inverse_transform(predictions_experimental.reshape(-1, 1))
predictions_unstandardized_flat = predictions_unstandardized.flatten()
results = pd.DataFrame({'Predicted Price': predictions_unstandardized_flat})
results = pd.concat([results, experimental_data_original.reset_index(drop=True)], axis=1)
results['Predicted Price'] = results['Predicted Price'].apply(lambda x: round(x))
results.head(10)

Unnamed: 0,Predicted Price,Make,Model,Version,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
0,10633740,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
1,6372383,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
2,40877950,Toyota,Hilux,Revo V Automatic 2.8,Call for price,2800,Local,5,Un-Registered,Automatic,3
3,8308750,Honda,Civic,Oriel 1.8 i-VTEC CVT,Call for price,1800,Local,63000,Islamabad,Automatic,6
4,3982110,Toyota,Aqua,S,Call for price,1500,Imported,45000,Un-Registered,Automatic,7
5,40841950,Toyota,Fortuner,2.8 Sigma 4,Call for price,2800,Local,20,Un-Registered,Automatic,3
6,3220410,Honda,BR-V,i-VTEC S,Call for price,1500,Local,20,Un-Registered,Automatic,3
7,10633740,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
8,22004250,Toyota,Hilux,Revo V Automatic 3.0,Call for price,3000,Local,50263,Karachi,Automatic,7
9,2986382,Toyota,Corolla,Axio G,Call for price,1500,Imported,45000,Un-Registered,Automatic,6


In [187]:
results[results["Model"] =="HS"]

Unnamed: 0,Predicted Price,Make,Model,Version,Price,CC,Assembly,Mileage,Registered City,Transmission,Vehicle_age
0,10633740,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
1,6372383,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
7,10633740,MG,HS,1.5 Turbo,Call for price,1500,Imported,5,Un-Registered,Automatic,3
49,6437473,MG,HS,1.5 Turbo,Call for price,1500,Local,50,Un-Registered,Automatic,3
57,6233610,MG,HS,1.5 Turbo,Call for price,1500,Local,20,Un-Registered,Automatic,3
62,6372383,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
71,6170458,MG,HS,1.5 Turbo,Call for price,1500,Local,10,Un-Registered,Automatic,3
203,6372383,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
206,6372383,MG,HS,1.5 Turbo,Call for price,1500,Local,5,Un-Registered,Automatic,3
220,6170458,MG,HS,1.5 Turbo,Call for price,1500,Local,10,Un-Registered,Automatic,3
