In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import zscore
from scipy.stats import boxcox
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
import re


In [39]:
# Lift pandas' display limits so frames are rendered without truncation
pd.options.display.max_colwidth = None  # show the full text of every cell
pd.options.display.max_rows = None      # show every row (slow for very large frames)
pd.options.display.max_columns = None   # show every column

In [33]:
# Path of the pre-processed CSV that this notebook loads as its input.
# NOTE(review): despite its name, `output_path` is only read from in this cell;
# the absolute Windows path ties the notebook to one machine.
output_path = r"D:\DataScience\GUVI\DataScience_GUVI_Projects\CarDheko_Project\DataSets\New_Structured_Data\Temp_preprocessed_data.csv"

# Read the CSV file into the DataFrame used for outlier handling
df_HandlingOutliers = pd.read_csv(output_path)

In [34]:
# Select the names of the float64/int64 columns; this Index is what the
# Z-score outlier functions further down are called with
numerical_columns = df_HandlingOutliers.select_dtypes(include=['float64', 'int64']).columns
numerical_columns

Index(['km', 'ownerNo', 'modelYear', 'centralVariantId', 'price',
       'Registration Year', 'Features', 'Comfort & Convenience', 'Interior',
       'Exterior', 'Safety', 'Entertainment & Communication', 'Mileage',
       'Max Power', 'Torque', 'Displacement', 'No of Cylinder',
       'Values per Cylinder', 'Length', 'Width', 'Height', 'Wheel Base',
       'Front Tread', 'Rear Tread', 'Kerb Weight', 'Gear Box',
       'Seating Capacity', 'Turning Radius', 'Top Speed', 'Acceleration',
       'Cargo Volumn', 'Wheel Size'],
      dtype='object')

# Detecting Outliers Using Z-score and Populating the Counts in a Table

In [35]:
from scipy.stats import zscore
import pandas as pd

# Function to calculate Z-score bounds and outlier counts
def detect_outliers_zscore(df_HandlingOutliers, columns, threshold=3):
    """Count, per column, the values whose absolute Z-score exceeds ``threshold``.

    Z-scores are computed with ``scipy.stats.zscore`` using
    ``nan_policy='omit'`` so that missing values are ignored when the mean and
    standard deviation are estimated. Without it, a single NaN makes every
    Z-score of the column NaN and the column reports zero outliers.

    Parameters
    ----------
    df_HandlingOutliers : pandas.DataFrame
        Frame holding the columns to inspect. It is not modified.
    columns : iterable of str
        Names of the numeric columns to check.
    threshold : float, default 3
        A value is counted as an outlier when ``abs(z) > threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per inspected column with the fields ``"Column"`` and
        ``"Outliers"`` (the outlier count). The two fields are present even
        when ``columns`` is empty.
    """
    outlier_stats = []
    for col in columns:
        # Work on a plain float array so the result does not depend on how
        # the installed scipy version treats pandas objects.
        values = df_HandlingOutliers[col].to_numpy(dtype=float, na_value=np.nan)

        if values.size == 0:
            outliers = 0
        else:
            z_scores = np.asarray(zscore(values, nan_policy='omit'), dtype=float)
            # NaN Z-scores (missing values, constant columns) compare False,
            # so they are never counted as outliers.
            with np.errstate(invalid='ignore'):
                outliers = int((np.abs(z_scores) > threshold).sum())

        outlier_stats.append({
            "Column": col,
            "Outliers": outliers
        })

    return pd.DataFrame(outlier_stats, columns=["Column", "Outliers"])

# Count Z-score outliers (default threshold of 3) for every numerical column
outlier_zscore_stats = detect_outliers_zscore(df_HandlingOutliers, numerical_columns)

# Display the per-column outlier counts
outlier_zscore_stats


Unnamed: 0,Column,Outliers
0,km,5
1,ownerNo,95
2,modelYear,65
3,centralVariantId,0
4,price,107
5,Registration Year,65
6,Features,108
7,Comfort & Convenience,10
8,Interior,35
9,Exterior,24


Trimming the outliers using Z-score

In [36]:
def trim_outliers_zscore(df, columns, threshold=3):
    """Return a copy of ``df`` without the rows that are Z-score outliers.

    Columns are processed one after another: for each column the Z-scores are
    computed on the rows that survived the previous columns, and rows with
    ``abs(z) > threshold`` are removed. The result therefore depends on the
    order of ``columns``.

    Only rows that are positively identified as outliers are removed. A row
    whose Z-score is NaN (the value is missing, or the column is constant so
    no Z-score is defined) is kept. The earlier ``abs(z) <= threshold`` test
    dropped such rows, which could empty the whole frame on a constant column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of str
        Names of the numeric columns to trim on.
    threshold : float, default 3
        Rows with ``abs(z) > threshold`` in any processed column are removed.

    Returns
    -------
    pandas.DataFrame
        The trimmed frame with a fresh ``0..n-1`` index.
    """
    trimmed_df = df.copy()
    for col in columns:
        # Plain float array: keeps the mask positional and independent of how
        # the installed scipy version treats pandas objects.
        values = trimmed_df[col].to_numpy(dtype=float, na_value=np.nan)
        if values.size == 0:
            # Nothing left to trim.
            break

        # Calculate Z-scores, ignoring missing values in the statistics
        z_scores = np.asarray(zscore(values, nan_policy='omit'), dtype=float)

        # NaN compares False, so undefined Z-scores are not flagged.
        with np.errstate(invalid='ignore'):
            is_outlier = np.abs(z_scores) > threshold

        # Remove rows with outliers
        trimmed_df = trimmed_df[~is_outlier]

    # Reset index after trimming
    return trimmed_df.reset_index(drop=True)
       

# Drop the Z-score outlier rows across all numerical columns
trimmed_data_zscore = trim_outliers_zscore(df_HandlingOutliers, numerical_columns)

# Report how many rows survived the trimming
print(
    f"Original data shape: {df_HandlingOutliers.shape}",
    f"Trimmed data shape (Z-Score): {trimmed_data_zscore.shape}",
    sep="\n",
)


Original data shape: (8277, 52)
Trimmed data shape (Z-Score): (6352, 52)


In [37]:
encoder = LabelEncoder()

# Replace every object-dtype column of the trimmed frame with integer labels.
# The single encoder is re-fitted for each column, so afterwards it only holds
# the classes of the last column processed.
for column_name in trimmed_data_zscore.select_dtypes(include="object").columns:
    encoded_labels = encoder.fit_transform(trimmed_data_zscore[column_name])
    trimmed_data_zscore[column_name] = encoded_labels

In [40]:
# Preview the first 10 rows of the trimmed, label-encoded frame
trimmed_data_zscore.head(10)

Unnamed: 0,city,bt,km,ownerNo,oem,model,modelYear,centralVariantId,variantName,price,Registration Year,Insurance Validity,Fuel Type,RTO,Transmission,Features,Comfort & Convenience,Interior,Exterior,Safety,Entertainment & Communication,Mileage,Max Power,Torque,Color,Engine Type,Displacement,No of Cylinder,Values per Cylinder,Value Configuration,Fuel Suppy System,Turbo Charger,Super Charger,Length,Width,Height,Wheel Base,Front Tread,Rear Tread,Kerb Weight,Gear Box,Drive Type,Seating Capacity,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,Cargo Volumn,Wheel Size
0,0,0,120000.0,3,13,77,2015,3979,1117,400000.0,2015.0,2,3,129,1,8,10.0,0.352941,0.153846,0.255319,7.0,23.1,67.04,90.0,31,172,998.0,3.0,4.0,0,20,0,0,3715.0,1635.0,1565.0,2425.0,1420.0,1410.0,835.0,5.0,7,5.0,2,4.7,3,2,150.0,15.05,6,235.0,16.0
1,0,4,32706.0,2,6,22,2018,6087,250,811000.0,2018.0,0,3,101,1,9,17.0,0.647059,0.423077,0.553191,9.0,17.0,121.31,150.0,31,239,1497.0,3.0,4.0,0,8,0,0,3998.0,1765.0,1647.0,2519.0,1519.0,1524.0,1242.0,5.0,7,5.0,2,5.3,3,2,171.43,12.51,6,352.0,16.0
2,0,0,11949.0,1,21,133,2018,2983,119,585000.0,2018.0,0,3,99,1,9,14.0,0.647059,0.461538,0.489362,8.0,23.84,84.0,114.0,23,217,1199.0,3.0,4.0,0,20,0,0,3746.0,1647.0,1535.0,2400.0,1400.0,1420.0,1012.0,5.0,7,5.0,2,4.9,1,2,150.0,14.3,4,242.0,14.0
3,0,3,17794.0,1,8,51,2014,1867,92,462000.0,2014.0,0,3,131,1,9,16.0,0.529412,0.346154,0.361702,7.0,19.1,81.86,113.75,19,195,1197.0,4.0,4.0,0,8,0,0,3995.0,1660.0,1520.0,2425.0,1479.0,1493.0,1180.0,5.0,8,5.0,2,4.7,1,2,172.0,14.2,6,407.0,14.0
4,0,4,60000.0,1,13,88,2015,4277,587,790000.0,2015.0,2,1,100,1,9,21.0,0.588235,0.461538,0.446809,8.0,23.65,88.5,200.0,13,125,1248.0,4.0,4.0,0,20,2,0,4300.0,1785.0,1595.0,2600.0,1295.0,1290.0,1230.0,5.0,7,5.0,2,5.2,3,0,190.0,12.0,6,353.0,16.0
5,0,4,20000.0,1,9,57,2020,5931,395,1900000.0,2020.0,2,1,100,1,9,13.0,0.588235,0.538462,0.446809,8.0,17.1,170.0,350.0,19,90,1956.0,4.0,4.0,0,4,2,0,4395.0,1818.0,1640.0,2636.0,1295.0,1290.0,1551.0,6.0,3,5.0,2,5.3,1,1,165.0,10.03,1,408.0,17.0
6,0,0,37772.0,1,4,11,2017,5223,1032,345000.0,2017.0,0,3,101,1,9,11.0,0.411765,0.153846,0.340426,5.0,20.63,67.0,104.0,13,210,1198.0,3.0,4.0,0,9,0,0,3785.0,1635.0,1490.0,2450.0,1440.0,1445.0,1070.0,5.0,7,5.0,2,4.6,1,2,150.0,13.3,4,265.0,16.0
7,0,4,30000.0,1,8,49,2021,6946,956,1200000.0,2021.0,2,3,129,0,9,20.0,0.588235,0.423077,0.638298,6.0,18.15,118.35,171.6,19,189,998.0,3.0,4.0,0,17,2,0,3995.0,1770.0,1605.0,2500.0,1295.0,1290.0,1440.0,7.0,7,5.0,2,5.3,1,2,165.0,12.36,0,350.0,16.0
8,0,3,37000.0,1,13,79,2018,6555,495,960000.0,2018.0,0,3,99,0,9,22.0,0.647059,0.384615,0.595745,8.0,20.28,103.25,138.0,14,182,1462.0,4.0,4.0,0,20,0,0,4490.0,1730.0,1485.0,2650.0,1495.0,1505.0,1105.0,4.0,7,5.0,2,5.4,3,2,190.0,14.0,6,510.0,16.0
9,0,0,11949.0,1,21,133,2017,2985,123,585000.0,2018.0,0,3,99,1,9,14.0,0.588235,0.461538,0.489362,8.0,23.84,84.0,114.0,23,217,1199.0,3.0,4.0,0,20,0,0,3746.0,1647.0,1535.0,2400.0,1400.0,1420.0,1012.0,5.0,7,5.0,2,4.9,1,2,150.0,14.3,4,242.0,14.0


In [21]:
# Compute the pairwise correlation matrix of the trimmed frame
correlation_matrix_trimdata = trimmed_data_zscore.corr()

# Extract the correlations with the 'price' column, sorted by signed value from
# most positive to most negative (not by absolute magnitude)
price_correlation_trimdata = correlation_matrix_trimdata['price'].sort_values(ascending=False)

# Display the correlations with 'price'
price_correlation_trimdata

price                            1.000000
Max Power                        0.703753
Kerb Weight                      0.685041
Exterior                         0.669369
Safety                           0.653389
Width                            0.627003
Comfort & Convenience            0.626374
Wheel Base                       0.620752
Torque                           0.616395
Wheel Size                       0.608869
Gear Box                         0.581642
Entertainment & Communication    0.574330
modelYear                        0.570359
Registration Year                0.567702
Turning Radius                   0.560687
Length                           0.558129
Interior                         0.540999
bt                               0.531999
Displacement                     0.530066
Turbo Charger                    0.410366
centralVariantId                 0.369980
Cargo Volumn                     0.355768
Height                           0.353159
Top Speed                        0

Capping the outliers using Z-score

In [None]:
def cap_outliers_zscore(df, columns, threshold=3):
    """Return a copy of ``df`` with Z-score outliers clipped to the bounds.

    For every column the limits ``mean - threshold * std`` and
    ``mean + threshold * std`` are computed and values outside them are
    replaced by the nearest limit. Missing values are ignored when computing
    the statistics and are left untouched.

    The standard deviation uses ``ddof=0`` (population std), the same
    definition ``scipy.stats.zscore`` uses by default, so the limits match the
    ``abs(z) > threshold`` rule of the detection and trimming helpers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of str
        Names of the numeric columns to cap.
    threshold : float, default 3
        Number of standard deviations from the mean at which values are capped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the same shape and index. Integer columns may be
        upcast to float when a value is replaced by a non-integer limit.
    """
    capped_df = df.copy()
    for col in columns:
        # Calculate bounds based on Z-score threshold
        mean = capped_df[col].mean()
        std = capped_df[col].std(ddof=0)

        # Empty or all-NaN columns have no defined bounds; leave them as is.
        if not (np.isfinite(mean) and np.isfinite(std)):
            continue

        lower_bound = mean - threshold * std
        upper_bound = mean + threshold * std

        # Cap values outside the bounds (vectorized; NaN stays NaN)
        capped_df[col] = capped_df[col].clip(lower=lower_bound, upper=upper_bound)

    return capped_df

# Clip the Z-score outliers of all numerical columns instead of dropping rows
capped_data_zscore = cap_outliers_zscore(df_HandlingOutliers, numerical_columns)

# Capping must leave the shape unchanged; print both shapes to confirm
print(
    f"Original data shape: {df_HandlingOutliers.shape}",
    f"Capped data shape (Z-Score): {capped_data_zscore.shape}",
    sep="\n",
)


Original data shape: (8277, 52)
Capped data shape (Z-Score): (8277, 52)


In [23]:
encoder = LabelEncoder()

# Replace every object-dtype column of the capped frame with integer labels.
# The single encoder is re-fitted for each column, so afterwards it only holds
# the classes of the last column processed.
for column_name in capped_data_zscore.select_dtypes(include="object").columns:
    encoded_labels = encoder.fit_transform(capped_data_zscore[column_name])
    capped_data_zscore[column_name] = encoded_labels

In [24]:
# Preview the first 10 rows of the capped, label-encoded frame
capped_data_zscore.head(10)

Unnamed: 0,city,bt,km,ownerNo,oem,model,modelYear,centralVariantId,variantName,price,...,Seating Capacity,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,Cargo Volumn,Wheel Size
0,0,2,120000.0,3.0,17,160,2015.0,3979,1806,400000.0,...,5.0,3,4.7,12,4,150.0,15.05,10,235.0,16.0
1,0,8,32706.0,2.0,6,50,2018.0,6087,288,811000.0,...,5.0,3,5.3,12,4,171.43,12.51,10,352.0,16.0
2,0,2,11949.0,1.0,27,273,2018.0,2983,129,585000.0,...,5.0,3,4.9,2,4,150.0,14.3,6,242.0,14.0
3,0,7,17794.0,1.0,9,87,2014.0,1867,101,462000.0,...,5.0,3,4.7,2,4,172.0,14.2,10,407.0,14.0
4,0,8,60000.0,1.0,17,178,2015.0,4277,987,790000.0,...,5.0,3,5.2,12,2,190.0,12.0,10,353.0,16.0
5,0,8,20000.0,1.0,12,101,2020.0,5931,485,1900000.0,...,5.0,3,5.3,2,3,165.0,10.03,2,408.0,17.0
6,0,2,37772.0,1.0,4,37,2017.0,5223,1684,345000.0,...,5.0,3,4.6,2,4,150.0,13.3,6,265.0,16.0
7,0,8,30000.0,1.0,9,85,2021.0,6946,1581,1200000.0,...,5.0,3,5.3,2,4,165.0,12.36,0,350.0,16.0
8,0,7,37000.0,1.0,17,163,2018.0,6555,834,960000.0,...,5.0,3,5.4,12,4,190.0,14.0,10,510.0,16.0
9,0,2,11949.0,1.0,27,273,2017.0,2985,133,585000.0,...,5.0,3,4.9,2,4,150.0,14.3,6,242.0,14.0


In [25]:
# Compute the pairwise correlation matrix of the capped frame
correlation_matrix_cappeddata = capped_data_zscore.corr()

# Extract the correlations with the 'price' column, sorted by signed value from
# most positive to most negative (not by absolute magnitude)
price_correlation_cappeddata = correlation_matrix_cappeddata['price'].sort_values(ascending=False)

# Display the correlations with 'price'
price_correlation_cappeddata

price                            1.000000
ownerNo                          0.130947
Value Configuration              0.126671
Acceleration                     0.099960
km                               0.084938
Fuel Suppy System                0.078664
Transmission                     0.066893
Insurance Validity               0.057607
Fuel Type                        0.053484
Cargo Volumn                     0.045393
city                             0.039916
Steering Type                    0.035525
Front Brake Type                 0.034014
Rear Brake Type                  0.017987
Engine Type                      0.006487
model                            0.003637
Color                            0.002565
oem                             -0.000327
Tyre Type                       -0.003119
Super Charger                   -0.004903
variantName                     -0.006630
Drive Type                      -0.010704
Turning Radius                  -0.018617
RTO                             -0

In [26]:
# Put the capped and trimmed correlations with 'price' side by side.
# The capped series fixes the row order; the trimmed series is reindexed to it
# so both value columns refer to the same feature on every row.
correlation_comparison = pd.DataFrame(
    {
        'Feature': price_correlation_cappeddata.index,
        'Capped Data Correlation': price_correlation_cappeddata.to_numpy(),
        'Trimmed Data Correlation': price_correlation_trimdata.reindex(
            price_correlation_cappeddata.index
        ).to_numpy(),
    }
)


# Comparing Correlations Between Capped and Trimmed Data
- After analyzing the correlations of features with the target variable ('price') in both capped and trimmed datasets:
  - The **trimmed dataset** shows stronger (larger-magnitude) correlations with the target variable than the capped dataset, e.g. `km`: -0.31 (trimmed) vs. 0.08 (capped).
  - This suggests that trimming outliers yields data better suited to modeling than capping.

# Final Decision
- **Use the Trimmed Dataset**:
  - The trimmed data will be used as the final dataset for training the regression model.
  - This decision ensures the model is trained on cleaner data with stronger feature-target relationships.


In [27]:
# Display the capped vs. trimmed correlation table
correlation_comparison

Unnamed: 0,Feature,Capped Data Correlation,Trimmed Data Correlation
0,price,1.0,1.0
1,ownerNo,0.130947,-0.214791
2,Value Configuration,0.126671,-0.131602
3,Acceleration,0.09996,-0.432388
4,km,0.084938,-0.312453
5,Fuel Suppy System,0.078664,-0.094378
6,Transmission,0.066893,-0.370264
7,Insurance Validity,0.057607,-0.032358
8,Fuel Type,0.053484,-0.150637
9,Cargo Volumn,0.045393,0.355768


In [28]:
# Create the Final_Data3 DataFrame by selecting the modelling columns
# (features plus the 'price' target) from the trimmed, label-encoded frame
Final_Data3 = trimmed_data_zscore[['oem', 'model', 'modelYear', 'Registration Year', 'Mileage', 
                  'Fuel Type', 'Transmission', 'ownerNo', 'price', 'Gear Box', 
                  'city', 'km', 'Safety', 'Interior', 'Exterior', 
                  'Insurance Validity', 'bt']]

In [29]:
# Save the Final_Data3 DataFrame to a CSV file (without the index column).
# NOTE(review): the absolute Windows path ties the notebook to one machine.
Final_Data3.to_csv(r"D:\DataScience\GUVI\DataScience_GUVI_Projects\CarDheko_Project\DataSets\New_Structured_Data\Model_TrainTest_data.csv",index=False)

*The code below belongs to the previously used method for handling outliers and is kept commented out for reference.*

In [39]:
# # Create the Final_Data2 DataFrame by selecting specific columns
# Final_Data2 = capped_data[['oem', 'model', 'modelYear', 'Registration Year', 'Mileage', 
#                   'Fuel Type', 'Transmission', 'ownerNo', 'price', 'Gear Box', 
#                   'city', 'km', 'Safety', 'Interior', 'Exterior', 
#                   'Insurance Validity', 'bt']]

In [40]:
# # Save the Final_Data DataFrame to a CSV file in the specified directory.
# Final_Data2.to_csv(r"D:\DataScience\GUVI\DataScience_GUVI_Projects\CarDheko_Project\DataSets\New_Structured_Data\TechnicalBased_Handled_data.csv",index=False)