In [1]:
import numpy as np 
import pandas as pd     

from scipy.stats import zscore
from statsmodels.stats.weightstats import ztest
from scipy.stats import ttest_ind
from sklearn.decomposition import PCA


import warnings         
warnings.filterwarnings("ignore")


In [2]:
dfe = pd.read_csv("dataset/encoded-train.csv")

In [3]:
# EDA - Statistical Analysis
# Stat Summary
print("Dataset Describe:")
dfe.describe()

Dataset Describe:


Unnamed: 0,ODO,Year,Age,Mileage,Engine,Power,Seats,CP,SP,Fuel_Diesel,Fuel_Petrol,Transmission_Automatic,Transmission_Manual,Owner_First,Owner_Fourth & Above,Owner_Second,Owner_Third
count,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0
mean,58349.02,2013.475805,9.524195,18.206192,1631.839332,113.827634,5.286551,3.263546,9.673429,0.542793,0.457207,0.293095,0.706905,0.824178,0.001205,0.157224,0.017393
std,92655.56,3.170718,3.170718,4.288879,601.822651,53.903495,0.80679,12.728419,11.292012,0.498208,0.498208,0.455221,0.455221,0.380702,0.034702,0.364043,0.130741
min,171.0,1998.0,4.0,0.0,624.0,34.2,2.0,0.0,0.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,33288.0,2012.0,7.0,15.26,1198.0,78.0,5.0,0.0,3.59,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,52400.0,2014.0,9.0,18.2,1497.0,98.6,5.0,0.0,5.75,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,72457.5,2016.0,11.0,21.1,1991.0,139.04,5.0,0.0,10.25,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
max,6500000.0,2019.0,25.0,28.4,5998.0,560.0,10.0,230.0,160.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Correlation for numerical values
correlation_matrix = dfe.iloc[:,5:].corr()

# Convert correlation matrix to text and print
correlation_text = correlation_matrix.to_string()

print("Correlation matrix")
print(correlation_text)

Correlation matrix
                         Mileage    Engine     Power     Seats        CP        SP  Fuel_Diesel  Fuel_Petrol  Transmission_Automatic  Transmission_Manual  Owner_First  Owner_Fourth & Above  Owner_Second  Owner_Third
Mileage                 1.000000 -0.637154 -0.536648 -0.330440 -0.038795 -0.340675     0.118617    -0.118617               -0.367234             0.367234     0.144298             -0.037777     -0.120769    -0.073874
Engine                 -0.637154  1.000000  0.865013  0.400124  0.191302  0.656887     0.424221    -0.424221                0.499214            -0.499214    -0.057533              0.014871      0.049570     0.025557
Power                  -0.536648  0.865013  1.000000  0.098152  0.293179  0.772422     0.284726    -0.284726                0.642933            -0.642933    -0.030448             -0.005644      0.031570     0.002254
Seats                  -0.330440  0.400124  0.098152  1.000000  0.003365  0.053251     0.308860    -0.308860         

In [5]:

threshold = dfe['SP'].median()
high_sp = dfe[dfe['SP'] > threshold]  
low_sp = dfe[dfe['SP'] <= threshold]  

zscore_high = zscore(high_sp['ODO'])
zscore_low = zscore(low_sp['ODO'])

# Perform Z-test
z_statistic, p_value = ztest(zscore_high, zscore_low)
print(f"Z-test for ODO: Z-statistic = {z_statistic}, p-value = {p_value}")

# Perform T-test
t_statistic, p_value = ttest_ind(high_sp['ODO'], low_sp['ODO'])
print(f"T-test for ODO: T-statistic = {t_statistic}, p-value = {p_value}")


Z-test for ODO: Z-statistic = -1.8086845396966537e-16, p-value = 0.9999999999999999
T-test for ODO: T-statistic = -2.744782339504185, p-value = 0.0060737289787887545


In [6]:
numerical_cols = dfe.iloc[:, 2:]
print(numerical_cols.columns)
# Perform PCA
pca = PCA(n_components=16)  # Choose the number of components

X_pca = pca.fit_transform(numerical_cols.drop('SP', axis=1))  # Considering all features except 'SP'

# Calculate correlation between components and 'SP'
pca_df = pd.DataFrame(X_pca, columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)])
pca_df['SP'] = dfe['SP']
pca_corr = pca_df.corr()

# Analyze correlation of components with 'SP'
print("PCA")
print(pca_corr['SP'])

Index(['ODO', 'Year', 'Age', 'Mileage', 'Engine', 'Power', 'Seats', 'CP', 'SP',
       'Fuel_Diesel', 'Fuel_Petrol', 'Transmission_Automatic',
       'Transmission_Manual', 'Owner_First', 'Owner_Fourth & Above',
       'Owner_Second', 'Owner_Third'],
      dtype='object')
PCA
PC1    -0.008328
PC2     0.661960
PC3     0.413524
PC4     0.185586
PC5     0.234239
PC6     0.087454
PC7     0.049281
PC8    -0.118328
PC9    -0.030860
PC10   -0.011448
PC11    0.013274
PC12    0.000704
PC13    0.005184
PC14    0.001690
PC15    0.004646
PC16    0.000132
SP      1.000000
Name: SP, dtype: float64
