In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import zscore
from scipy.stats import boxcox
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
import re


In [13]:
# Adjust display options to show full content
pd.set_option('display.max_colwidth', None)  # No truncation for column content
pd.set_option('display.max_rows', None)      # Display all rows (set a limit if data is large)
pd.set_option('display.max_columns', None)   # Display all columns

In [4]:
# Define the output file path
output_path = r"D:\DataScience\GUVI\DataScience_GUVI_Projects\CarDheko_Project\DataSets\New_Structured_Data\Temp_preprocessed_data.csv"

# Write the DataFrame to a CSV file
df_HandlingOutliers = pd.read_csv(output_path)

In [5]:
# Select numerical columns
numerical_columns = df_HandlingOutliers.select_dtypes(include=['float64', 'int64']).columns
numerical_columns

Index(['km', 'ownerNo', 'modelYear', 'centralVariantId', 'price',
       'Registration Year', 'Features', 'Comfort & Convenience', 'Interior',
       'Exterior', 'Safety', 'Entertainment & Communication', 'Mileage',
       'Max Power', 'Torque', 'Displacement', 'No of Cylinder',
       'Values per Cylinder', 'Length', 'Width', 'Height', 'Wheel Base',
       'Front Tread', 'Rear Tread', 'Kerb Weight', 'Gear Box',
       'Seating Capacity', 'Turning Radius', 'Top Speed', 'Acceleration',
       'Cargo Volumn', 'Wheel Size'],
      dtype='object')

In [6]:
# Function to calculate IQR bounds and outlier counts
def detect_outliers_iqr(df_HandlingOutliers, columns):
    outlier_stats = []
    for col in columns:
        Q1 = df_HandlingOutliers[col].quantile(0.25)
        Q3 = df_HandlingOutliers[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Count outliers
        outliers = df_HandlingOutliers[(df_HandlingOutliers[col] < lower_bound) | (df_HandlingOutliers[col] > upper_bound)][col].count()
        outlier_stats.append({
            "Column": col,
            "Lower Bound": lower_bound,
            "Upper Bound": upper_bound,
            "Outliers": outliers
        })
    
    return pd.DataFrame(outlier_stats)

# Detect outliers using IQR
outlier_iqr_stats = detect_outliers_iqr(df_HandlingOutliers, numerical_columns)

outlier_iqr_stats

Unnamed: 0,Column,Lower Bound,Upper Bound,Outliers
0,km,-45000.0,155000.0,48
1,ownerNo,-0.5,3.5,95
2,modelYear,2006.5,2026.5,129
3,centralVariantId,-5529.0,14711.0,0
4,price,-462000.0,1842000.0,912
5,Registration Year,2006.5,2026.5,127
6,Features,9.0,9.0,1091
7,Comfort & Convenience,-2.5,33.5,19
8,Interior,4.5,16.5,138
9,Exterior,-3.5,24.5,24


In [7]:
def trim_outliers_iqr(df, columns):
    trimmed_df = df.copy()
    for col in columns:
        Q1 = trimmed_df[col].quantile(0.25)
        Q3 = trimmed_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Remove rows with outliers
        trimmed_df = trimmed_df[(trimmed_df[col] >= lower_bound) & (trimmed_df[col] <= upper_bound)]
    
    return trimmed_df

# Apply trimming to remove outliers
trimmed_data = trim_outliers_iqr(df_HandlingOutliers, numerical_columns)

# Verify the trimmed dataset
print(f"Original data shape: {df_HandlingOutliers.shape}")
print(f"Trimmed data shape: {trimmed_data.shape}")


Original data shape: (8277, 52)
Trimmed data shape: (2464, 52)


In [8]:
def cap_outliers_iqr(df, columns):
    capped_df = df.copy()
    for col in columns:
        Q1 = capped_df[col].quantile(0.25)
        Q3 = capped_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Cap outliers
        capped_df[col] = capped_df[col].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)
    
    return capped_df

# Apply capping to handle outliers
capped_data = cap_outliers_iqr(df_HandlingOutliers, numerical_columns)

# Verify the dataset after capping
print(f"Original data shape: {df_HandlingOutliers.shape}")
print(f"Capped data shape: {capped_data.shape}")


Original data shape: (8277, 52)
Capped data shape: (8277, 52)


In [9]:
# Compute the correlation matrix
correlation_matrix = df_HandlingOutliers[numerical_columns].corr()

# Extract correlations with the 'price' column, sorting them by magnitude
price_correlation = correlation_matrix['price'].sort_values(ascending=False)

# Display the correlations with 'price'
price_correlation

price                            1.000000
ownerNo                          0.131124
Acceleration                     0.086711
km                               0.037748
Cargo Volumn                     0.035572
Turning Radius                  -0.001804
Mileage                         -0.027921
Seating Capacity                -0.039360
No of Cylinder                  -0.044938
Height                          -0.051352
Displacement                    -0.070572
Gear Box                        -0.070717
Front Tread                     -0.078812
centralVariantId                -0.079006
Rear Tread                      -0.082431
Values per Cylinder             -0.083930
Max Power                       -0.086466
Top Speed                       -0.087861
Kerb Weight                     -0.091205
Torque                          -0.092822
Length                          -0.104184
Entertainment & Communication   -0.106625
Wheel Base                      -0.118598
Exterior                        -0

In [None]:
# Plot the full correlation heatmap for all numeric columns
plt.figure(figsize=(18, 14))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Full Correlation Heatmap')
plt.show()

In [10]:
df1 = df_HandlingOutliers.copy()

encoder = LabelEncoder()

for i in df1.select_dtypes(include="object").columns:
  df1[i] = encoder.fit_transform(df1[i])

In [14]:
df1.head(10)

Unnamed: 0,city,bt,km,ownerNo,oem,model,modelYear,centralVariantId,variantName,price,Registration Year,Insurance Validity,Fuel Type,RTO,Transmission,Features,Comfort & Convenience,Interior,Exterior,Safety,Entertainment & Communication,Mileage,Max Power,Torque,Color,Engine Type,Displacement,No of Cylinder,Values per Cylinder,Value Configuration,Fuel Suppy System,Turbo Charger,Super Charger,Length,Width,Height,Wheel Base,Front Tread,Rear Tread,Kerb Weight,Gear Box,Drive Type,Seating Capacity,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,Cargo Volumn,Wheel Size
0,0,2,120000.0,3,17,160,2015,3979,1806,400000.0,2015.0,2,4,165,1,8,10.0,7.0,5.0,13.0,7.0,23.1,67.04,90.0,37,277,998.0,3.0,4.0,1,35,0,0,3715.0,1635.0,1565.0,2425.0,1420.0,1410.0,835.0,5.0,10,5.0,3,4.7,12,4,150.0,15.05,10,235.0,16.0
1,0,8,32706.0,2,6,50,2018,6087,288,811000.0,2018.0,0,4,135,1,9,17.0,12.0,12.0,27.0,9.0,17.0,121.31,150.0,37,386,1497.0,3.0,4.0,1,14,0,0,3998.0,1765.0,1647.0,2519.0,1519.0,1524.0,1242.0,5.0,10,5.0,3,5.3,12,4,171.43,12.51,10,352.0,16.0
2,0,2,11949.0,1,27,273,2018,2983,129,585000.0,2018.0,0,4,133,1,9,14.0,12.0,13.0,24.0,8.0,23.84,84.0,114.0,29,353,1199.0,3.0,4.0,1,35,0,0,3746.0,1647.0,1535.0,2400.0,1400.0,1420.0,1012.0,5.0,10,5.0,3,4.9,2,4,150.0,14.3,6,242.0,14.0
3,0,7,17794.0,1,9,87,2014,1867,101,462000.0,2014.0,0,4,167,1,9,16.0,10.0,10.0,18.0,7.0,19.1,81.86,113.75,23,304,1197.0,4.0,4.0,1,14,0,0,3995.0,1660.0,1520.0,2425.0,1479.0,1493.0,1180.0,5.0,11,5.0,3,4.7,2,4,172.0,14.2,10,407.0,14.0
4,0,8,60000.0,1,17,178,2015,4277,987,790000.0,2015.0,2,1,134,1,9,21.0,11.0,13.0,22.0,8.0,23.65,88.5,200.0,16,202,1248.0,4.0,4.0,1,35,2,0,4300.0,1785.0,1595.0,2600.0,1295.0,1290.0,1230.0,5.0,10,5.0,3,5.2,12,2,190.0,12.0,10,353.0,16.0
5,0,8,20000.0,1,12,101,2020,5931,485,1900000.0,2020.0,2,1,134,1,9,13.0,11.0,15.0,22.0,8.0,17.1,170.0,350.0,23,111,1956.0,4.0,4.0,1,9,2,0,4395.0,1818.0,1640.0,2636.0,1295.0,1290.0,1551.0,6.0,5,5.0,3,5.3,2,3,165.0,10.03,2,408.0,17.0
6,0,2,37772.0,1,4,37,2017,5223,1684,345000.0,2017.0,0,4,135,1,9,11.0,8.0,5.0,17.0,5.0,20.63,67.0,104.0,16,346,1198.0,3.0,4.0,1,20,0,0,3785.0,1635.0,1490.0,2450.0,1440.0,1445.0,1070.0,5.0,10,5.0,3,4.6,2,4,150.0,13.3,6,265.0,16.0
7,0,8,30000.0,1,9,85,2021,6946,1581,1200000.0,2021.0,2,4,165,0,9,20.0,11.0,12.0,31.0,6.0,18.15,118.35,171.6,23,298,998.0,3.0,4.0,1,30,2,0,3995.0,1770.0,1605.0,2500.0,1295.0,1290.0,1440.0,7.0,10,5.0,3,5.3,2,4,165.0,12.36,0,350.0,16.0
8,0,7,37000.0,1,17,163,2018,6555,834,960000.0,2018.0,0,4,133,0,9,22.0,12.0,11.0,29.0,8.0,20.28,103.25,138.0,17,290,1462.0,4.0,4.0,1,35,0,0,4490.0,1730.0,1485.0,2650.0,1495.0,1505.0,1105.0,4.0,10,5.0,3,5.4,12,4,190.0,14.0,10,510.0,16.0
9,0,2,11949.0,1,27,273,2017,2985,133,585000.0,2018.0,0,4,133,1,9,14.0,11.0,13.0,24.0,8.0,23.84,84.0,114.0,29,353,1199.0,3.0,4.0,1,35,0,0,3746.0,1647.0,1535.0,2400.0,1400.0,1420.0,1012.0,5.0,10,5.0,3,4.9,2,4,150.0,14.3,6,242.0,14.0


In [11]:
# Compute the correlation matrix
correlation_matrix1 = df1.corr()

# Extract correlations with the 'price' column, sorting them by magnitude
price_correlation1 = correlation_matrix1['price'].sort_values(ascending=False)

# Display the correlations with 'price'
price_correlation1

price                            1.000000
ownerNo                          0.131124
Value Configuration              0.118391
Acceleration                     0.086711
Fuel Suppy System                0.077096
Transmission                     0.066150
Insurance Validity               0.054739
Fuel Type                        0.050803
Front Brake Type                 0.038664
km                               0.037748
city                             0.036722
Cargo Volumn                     0.035572
Steering Type                    0.035569
Rear Brake Type                  0.016584
Tyre Type                        0.007356
Engine Type                      0.006606
model                            0.002980
Color                            0.002924
oem                             -0.001641
Turning Radius                  -0.001804
Super Charger                   -0.004912
Drive Type                      -0.005815
variantName                     -0.007621
RTO                             -0