In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

data = pd.read_csv('CSV_files/car_price.csv')

display(data.head())
print("Dataset Shape:", data.shape)
display(data.info())
display(data.describe())

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


Dataset Shape: (8128, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   seller_type         8128 non-null   object 
 6   transmission        8128 non-null   object 
 7   owner               8128 non-null   object 
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   object 
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 762.1+ KB


None

Unnamed: 0,year,selling_price,km_driven,mileage(km/ltr/kg),engine,seats
count,8128.0,8128.0,8128.0,7907.0,7907.0,7907.0
mean,2013.804011,638271.8,69819.51,19.418783,1458.625016,5.416719
std,4.044249,806253.4,56550.55,4.037145,503.916303,0.959588
min,1983.0,29999.0,1.0,0.0,624.0,2.0
25%,2011.0,254999.0,35000.0,16.78,1197.0,5.0
50%,2015.0,450000.0,60000.0,19.3,1248.0,5.0
75%,2017.0,675000.0,98000.0,22.32,1582.0,5.0
max,2020.0,10000000.0,2360457.0,42.0,3604.0,14.0


In [None]:
g = sns.pairplot(data[['selling_price', 'year', 'mileage(km/ltr/kg)', 'engine']], 
                plot_kws={'alpha':0.6, 'edgecolor':'white', 'linewidth':0.5},
                diag_kws={'color':'#4ECDC4', 'edgecolor':'white'})
g.fig.suptitle('Feature Relationships', y=1.02, fontsize=14, fontweight='bold')
plt.savefig('feature_relationships.png', dpi=300, bbox_inches='tight')
plt.show()

plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='fuel', y='selling_price', data=data, 
                palette='Set2', width=0.6,
                flierprops={'marker':'o', 'markersize':4, 'markerfacecolor':'none', 'markeredgecolor':'gray'})
plt.title('Price Distribution by Fuel Type', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Fuel Type', fontsize=12)
plt.ylabel('Selling Price', fontsize=12)
plt.grid(axis='y', alpha=0.3)
sns.despine()
plt.savefig('price_distribution_by_fuel.png', dpi=300, bbox_inches='tight')
plt.show();

In [None]:
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()


In [3]:
X = data.drop(columns=['name', 'selling_price'])  # Features
y = data['selling_price']  # Target

num_features = ['year', 'km_driven', 'mileage(km/ltr/kg)', 'engine', 'max_power', 'seats']
cat_features = ['fuel', 'seller_type', 'transmission', 'owner']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])


In [4]:
print(data.isnull().sum())

name                    0
year                    0
selling_price           0
km_driven               0
fuel                    0
seller_type             0
transmission            0
owner                   0
mileage(km/ltr/kg)    221
engine                221
max_power             215
seats                 221
dtype: int64


In [5]:
data = data.drop('engine', axis=1)
data = data.drop('mileage(km/ltr/kg)', axis=1)
data = data.drop('max_power', axis=1)
data = data.drop('seats', axis=1)

In [6]:
print(data.isnull().sum())

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)


ValueError: could not convert string to float: ' '

In [None]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Low Price', 'High Price'], yticklabels=['Low Price', 'High Price'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [8]:
cleaned_file = 'car_price_cleaned.csv'
data.to_csv(cleaned_file, index=False)
print(f"\nCleaned dataset saved as '{cleaned_file}'")


Cleaned dataset saved as 'car_price_cleaned.csv'
