Getting Started: Load and Inspect the Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')


# Read the raw car dataset from the working directory.
df = pd.read_csv('Car Data.csv')

# Headline facts first: dimensions and column names.
print("Initial Dataset Info:", "=" * 50, sep="\n")
print(f"Shape: {df.shape}", f"Columns: {df.columns.tolist()}", sep="\n")

# Then one titled section per summary view of the frame.
for heading, summary in (
    ("\nData Types:", df.dtypes),
    ("\nMissing Values:", df.isnull().sum()),
    ("\nBasic Statistics:", df.describe()),
):
    print(heading)
    print(summary)

Initial Dataset Info:
Shape: (550, 12)
Columns: ['city_mpg', 'class', 'combination_mpg', 'cylinders', 'displacement', 'drive', 'fuel_type', 'highway_mpg', 'make', 'model', 'transmission', 'year']

Data Types:
city_mpg             int64
class               object
combination_mpg      int64
cylinders          float64
displacement       float64
drive               object
fuel_type           object
highway_mpg          int64
make                object
model               object
transmission        object
year                 int64
dtype: object

Missing Values:
city_mpg           0
class              0
combination_mpg    0
cylinders          2
displacement       2
drive              0
fuel_type          0
highway_mpg        0
make               0
model              0
transmission       0
year               0
dtype: int64

Basic Statistics:
         city_mpg  combination_mpg   cylinders  displacement  highway_mpg  \
count  550.000000       550.000000  548.000000    548.000000   550.000000  

Data Cleaning & Initial Analysis

In [2]:

# Count rows that are exact copies of an earlier row.
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")


# Report the cardinality of each categorical column; list the actual values
# only when there are few enough (< 20) to read comfortably.
categorical_cols = ['class', 'drive', 'fuel_type', 'make', 'transmission']
for col in categorical_cols:
    n_unique = df[col].nunique()
    print(f"\n{col}: {n_unique} unique values")
    if n_unique < 20:
        print(f"Values: {df[col].unique()}")


# Show the rows lacking engine information (displacement or cylinder count).
print("\nRows with missing displacement/cylinders (electric cars):")
missing_engine_mask = df[['displacement', 'cylinders']].isna().any(axis=1)
electric_cars = df[missing_engine_mask]
print(electric_cars[['make', 'model', 'fuel_type', 'displacement', 'cylinders']])

Duplicate rows: 2

class: 13 unique values
Values: ['midsize car' 'small sport utility vehicle' 'subcompact car' 'large car'
 'two seater' 'minicompact car' 'standard sport utility vehicle'
 'compact car' 'small station wagon' 'standard pickup truck' 'minivan'
 'small pickup truck' 'midsize station wagon']

drive: 4 unique values
Values: ['fwd' '4wd' 'rwd' 'awd']

fuel_type: 3 unique values
Values: ['gas' 'diesel' 'electricity']

make: 31 unique values

transmission: 2 unique values
Values: ['m' 'a']

Rows with missing displacement/cylinders (electric cars):
           make   model    fuel_type  displacement  cylinders
128  mitsubishi  i-miev  electricity           NaN        NaN
175  mitsubishi  i-miev  electricity           NaN        NaN


Handle Missing Values and Clean Data

In [3]:

# Trim stray whitespace from model names first, so rows that differ only by
# padding are recognised as duplicates in the next step.
df['model'] = df['model'].str.strip()

# Remove exact duplicate rows (first occurrence is kept). The index is rebuilt
# so it stays a contiguous 0..n-1 range for later index-aligned operations.
df = df.drop_duplicates().reset_index(drop=True)

# Electric vehicles have no combustion engine, so a missing displacement or
# cylinder count is encoded as 0 for them. The fill is restricted to electric
# rows: a missing value on a gas/diesel car is genuinely unknown and is left
# as NaN so it shows up in the report below instead of becoming a fake 0.
electric_rows = df['fuel_type'] == 'electricity'
for engine_col in ('displacement', 'cylinders'):
    df.loc[electric_rows, engine_col] = df.loc[electric_rows, engine_col].fillna(0)


print("\nMissing values after cleaning:")
print(df.isnull().sum())


print(f"\nUnique fuel types: {df['fuel_type'].unique()}")
print(f"Unique transmission types: {df['transmission'].unique()}")


Missing values after cleaning:
city_mpg           0
class              0
combination_mpg    0
cylinders          0
displacement       0
drive              0
fuel_type          0
highway_mpg        0
make               0
model              0
transmission       0
year               0
dtype: int64

Unique fuel types: ['gas' 'diesel' 'electricity']
Unique transmission types: ['m' 'a']


Feature Engineering

In [4]:

# Gap between highway and city fuel economy.
df['mpg_diff'] = df['highway_mpg'] - df['city_mpg']

# Ratio features. A zero denominator (displacement/cylinders filled with 0 for
# vehicles without a combustion engine) makes the ratio undefined; those rows
# get a 0.0 sentinel. Adding a small epsilon to the denominator instead would
# bias every row and turn zero-displacement rows into extreme outliers
# (mpg * 100) that dominate any later scaling. NaN denominators stay NaN.
has_displacement = df['displacement'] != 0
has_cylinders = df['cylinders'] != 0

# Combined mpg per unit of engine displacement.
df['engine_efficiency'] = (
    df['combination_mpg'] / df['displacement'].where(has_displacement)
).where(has_displacement, 0.0)

# Displacement per cylinder.
df['engine_power_density'] = (
    df['displacement'] / df['cylinders'].where(has_cylinders)
).where(has_cylinders, 0.0)


# Vehicle classes grouped by the coarse size bucket they collapse into.
_classes_by_size = {
    'small': ['minicompact car', 'subcompact car', 'compact car'],
    'medium': ['midsize car', 'small station wagon', 'midsize station wagon'],
    'large': ['large car'],
    'sports': ['two seater'],
    'suv_small': ['small sport utility vehicle'],
    'suv_large': ['standard sport utility vehicle'],
    'truck_small': ['small pickup truck'],
    'truck_large': ['standard pickup truck'],
    'minivan': ['minivan'],
}
# Invert the grouping into a flat class -> size lookup.
size_mapping = {
    vehicle_class: size
    for size, vehicle_classes in _classes_by_size.items()
    for vehicle_class in vehicle_classes
}
# Classes absent from the lookup become NaN.
df['size_category'] = df['class'].map(size_mapping)

# Drive code -> descriptive drivetrain name.
drivetrain_mapping = dict([
    ('fwd', 'front_wheel'),
    ('rwd', 'rear_wheel'),
    ('awd', 'all_wheel'),
    ('4wd', 'four_wheel'),
    ('4matic', 'all_wheel'),
])
# Drive codes absent from the lookup become NaN.
df['drivetrain_type'] = df['drive'].map(drivetrain_mapping)


# Fixed reference year so vehicle_age is reproducible regardless of run date.
current_year = 2024
df['vehicle_age'] = df['year'].rsub(current_year)


# Bucket combined mpg into four ordered bands; intervals are right-inclusive:
# (0, 20], (20, 30], (30, 40], (40, inf).
mpg_bin_edges = [0, 20, 30, 40, float('inf')]
mpg_bin_labels = ['low', 'medium', 'high', 'very_high']
df['fuel_efficiency_category'] = pd.cut(
    df['combination_mpg'], bins=mpg_bin_edges, labels=mpg_bin_labels
)


# Flag (1/0) vehicles with at least six cylinders and combined mpg under 25.
has_six_plus_cylinders = df['cylinders'].ge(6)
below_25_mpg = df['combination_mpg'].lt(25)
df['is_high_performance'] = (has_six_plus_cylinders & below_25_mpg).astype(int)


# Makes treated as luxury brands (matched case-insensitively).
luxury_brands = ['audi', 'bmw', 'mercedes-benz', 'lexus', 'acura', 'infiniti', 'cadillac',
                 'jaguar', 'land rover', 'porsche', 'genesis', 'bentley', 'aston martin', 'roush performance']
df['is_luxury'] = df['make'].str.lower().isin(luxury_brands).astype(int)

# Fuel-type indicator flags (1/0).
df['is_electric'] = (df['fuel_type'] == 'electricity').astype(int)
df['is_diesel'] = (df['fuel_type'] == 'diesel').astype(int)
# na=False: a missing model name counts as "not hybrid" instead of producing
# NaN, which would make astype(int) raise.
df['is_hybrid'] = df['model'].str.lower().str.contains('hybrid', na=False).astype(int)


# Transmission flags. Match on the leading letter rather than a substring:
# "manual" contains an "a" and "automatic" contains an "m", so a substring
# test would set both flags if the column ever held spelled-out values.
# For the single-letter codes 'a' / 'm' the result is unchanged.
transmission_code = df['transmission'].str.strip().str.lower()
df['is_automatic'] = transmission_code.str.startswith('a', na=False).astype(int)
df['is_manual'] = transmission_code.str.startswith('m', na=False).astype(int)

Encode Categorical Variables


In [5]:

# One LabelEncoder per categorical column; each is kept under its own name so
# it can be reused later (e.g. for inverse_transform).
le_make = LabelEncoder()
le_class = LabelEncoder()
le_drive = LabelEncoder()
le_fuel = LabelEncoder()
le_transmission = LabelEncoder()
le_size = LabelEncoder()
le_drivetrain = LabelEncoder()

# Source column -> encoder that handles it. The encoded output is written to
# '<column>_encoded'.
column_encoders = {
    'make': le_make,
    'class': le_class,
    'drive': le_drive,
    'fuel_type': le_fuel,
    'transmission': le_transmission,
    'size_category': le_size,
    'drivetrain_type': le_drivetrain,
}
# Derived columns come from .map() lookups and may contain NaN for unmapped
# values; those are encoded under an explicit 'unknown' label.
columns_with_unknown = ('size_category', 'drivetrain_type')

# encoder_dict records the label -> integer code mapping of every encoded
# column, including the two derived columns.
encoder_dict = {}
for column, encoder in column_encoders.items():
    values = df[column]
    if column in columns_with_unknown:
        values = values.fillna('unknown')
    df[f'{column}_encoded'] = encoder.fit_transform(values)
    encoder_dict[column] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

print("\nEncoded categorical variables:")
print(f"Make: {len(encoder_dict['make'])} unique values")
print(f"Class: {len(encoder_dict['class'])} unique values")
print(f"Drive: {encoder_dict['drive']}")


Encoded categorical variables:
Make: 31 unique values
Class: 13 unique values
Drive: {'4wd': np.int64(0), 'awd': np.int64(1), 'fwd': np.int64(2), 'rwd': np.int64(3)}


Create Scaled Numerical Features

In [6]:

# Numeric columns to standardise (zero mean, unit variance).
numerical_features = [
    'city_mpg', 'combination_mpg', 'cylinders', 'displacement',
    'highway_mpg', 'year', 'vehicle_age', 'engine_efficiency'
]


scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
# Reuse df's index so the scaled columns line up row-for-row with df even when
# its index is not a plain 0..n-1 range (e.g. after rows were filtered out).
scaled_df = pd.DataFrame(
    scaled_values,
    columns=[f'{col}_scaled' for col in numerical_features],
    index=df.index,
)


# Drop any '*_scaled' columns left over from a previous execution of this cell
# before attaching the fresh ones, so re-running does not duplicate columns.
df = pd.concat(
    [df.drop(columns=scaled_df.columns, errors='ignore'), scaled_df],
    axis=1,
)

print(f"\nCreated {len(scaled_df.columns)} scaled numerical features")


Created 8 scaled numerical features


Saving Data

In [7]:
# Write the processed frame to disk; the row index is not written.
df.to_csv(path_or_buf="Car Data P.csv", index=False)