In [5]:
# Importing the libraries
import pandas as pd
import kagglehub
import os
import numpy as np

In [2]:
path = kagglehub.dataset_download("chavindudulaj/vehicle-maintenance-data")
print(path)
# Load the dataset
csv_path = os.path.join(path, "vehicle_maintenance_data.csv")
data = pd.read_csv(csv_path)
print(data.head())

Using Colab cache for faster access to the 'vehicle-maintenance-data' dataset.
/kaggle/input/vehicle-maintenance-data
  Vehicle_Model  Mileage Maintenance_History  Reported_Issues  Vehicle_Age  \
0         Truck    58765                Good                0            4   
1           Van    60353             Average                1            7   
2           Bus    68072                Poor                0            2   
3           Bus    60849             Average                4            5   
4           Bus    45742                Poor                5            1   

  Fuel_Type Transmission_Type  Engine_Size  Odometer_Reading  \
0  Electric         Automatic         2000             28524   
1  Electric         Automatic         2500            133630   
2  Electric         Automatic         1500             34022   
3    Petrol         Automatic         2500             81636   
4    Petrol            Manual         2000             97162   

  Last_Service_Date Warranty

In [None]:
# Data Validation 
#imbalance check
imbalance = data['Need_Maintenance'].value_counts(normalize=True)
imbalance

Unnamed: 0_level_0,proportion
Need_Maintenance,Unnamed: 1_level_1
1,0.80996
0,0.19004


In [None]:
# correlation matrix
corr_matrix = data.select_dtypes(include=[np.number]).corr()
corr_matrix

Unnamed: 0,Mileage,Reported_Issues,Vehicle_Age,Engine_Size,Odometer_Reading,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Need_Maintenance
Mileage,1.0,0.004311,0.000341,0.006994,-0.002929,-0.004834,-0.001166,0.002185,0.012465,0.000704
Reported_Issues,0.004311,1.0,-0.004607,-0.000349,0.000119,0.000248,-0.002567,0.007023,0.003954,0.389273
Vehicle_Age,0.000341,-0.004607,1.0,-0.006184,-0.000556,0.003263,0.001983,-0.005237,-0.004572,0.000806
Engine_Size,0.006994,-0.000349,-0.006184,1.0,-0.000369,-0.009805,0.009459,0.000589,-0.000645,-0.000326
Odometer_Reading,-0.002929,0.000119,-0.000556,-0.000369,1.0,-0.009439,0.000107,0.001599,0.004281,0.007876
Insurance_Premium,-0.004834,0.000248,0.003263,-0.009805,-0.009439,1.0,-0.007262,0.00012,-0.003988,0.004775
Service_History,-0.001166,-0.002567,0.001983,0.009459,0.000107,-0.007262,1.0,0.003199,0.004532,0.104155
Accident_History,0.002185,0.007023,-0.005237,0.000589,0.001599,0.00012,0.003199,1.0,0.002825,0.08059
Fuel_Efficiency,0.012465,0.003954,-0.004572,-0.000645,0.004281,-0.003988,0.004532,0.002825,1.0,0.001874
Need_Maintenance,0.000704,0.389273,0.000806,-0.000326,0.007876,0.004775,0.104155,0.08059,0.001874,1.0


In [10]:
# Data Cleaning
# Remove duplicates
data = data.drop_duplicates()

In [11]:
# Romove outliers using IQR method
for col in ['Mileage', 'Engine_Size', 'Odometer_Reading']:
        if col in data.columns:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

In [13]:
# Data Standardization
# stripping white spaces
cat_cols = data.select_dtypes(include=['object']).columns
for col in cat_cols:
    data[col] = data[col].str.strip().str.title()
data

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,Good,0,4,Electric,Automatic,2000,28524,2023-11-23,2025-06-24,Second,20782,6,3,13.622204,New,New,Weak,1
1,Van,60353,Average,1,7,Electric,Automatic,2500,133630,2023-09-21,2025-06-04,Second,23489,7,0,13.625307,New,New,Weak,1
2,Bus,68072,Poor,0,2,Electric,Automatic,1500,34022,2023-06-27,2025-04-27,First,17979,7,0,14.306302,New,Good,Weak,1
3,Bus,60849,Average,4,5,Petrol,Automatic,2500,81636,2023-08-24,2025-11-05,Second,6220,7,3,18.709467,New,Worn Out,New,1
4,Bus,45742,Poor,5,1,Petrol,Manual,2000,97162,2023-05-25,2025-09-14,Third,16446,6,2,16.977482,Good,Good,Weak,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Car,77229,Good,3,6,Diesel,Automatic,2500,97404,2023-11-24,2025-09-15,Third,5217,4,1,10.357298,New,Good,Weak,1
49996,Truck,54742,Good,4,4,Diesel,Manual,800,13421,2023-07-14,2025-01-21,Second,15856,4,1,14.924022,Worn Out,New,Good,1
49997,Van,75601,Poor,0,4,Electric,Automatic,1500,124851,2023-09-09,2024-11-09,First,6019,10,0,19.859243,Worn Out,Worn Out,Weak,1
49998,Car,66181,Good,1,7,Electric,Manual,2500,103182,2023-08-26,2025-05-22,Third,5030,4,3,10.720555,New,Good,Weak,1


In [24]:
# feature engineering
# Aging Factor
data['aging_factor'] = data['Vehicle_Age'] ** 1.5  # Exponential relationship
data['aging_factor']

Unnamed: 0,aging_factor
0,8.000000
1,18.520259
2,2.828427
3,11.180340
4,1.000000
...,...
49995,14.696938
49996,8.000000
49997,8.000000
49998,18.520259


In [None]:
data['mileage_age_interaction'] = data['Mileage'] * data['Vehicle_Age']
data['issues_mileage_interaction'] = data['Reported_Issues'] * data['Mileage']

In [25]:
# maintainance risk score
data['maintenance_risk_score'] = (
        (data['Reported_Issues'] * 12) + 
        (data['Mileage'] * 0.5) + 
        (data['Vehicle_Age'] * 3)
    ).clip(0, 100)
data['maintenance_risk_score']

Unnamed: 0,maintenance_risk_score
0,100.0
1,100.0
2,100.0
3,100.0
4,100.0
...,...
49995,100.0
49996,100.0
49997,100.0
49998,100.0


In [28]:
# since maintenance_risk_score is of no use all values are 100
data.drop(columns=['maintenance_risk_score'], inplace=True)

In [29]:
# date formating
data['Last_Service_Date'] = pd.to_datetime(data['Last_Service_Date'])
data['Warranty_Expiry_Date'] = pd.to_datetime(data['Warranty_Expiry_Date'])

In [None]:

data['Days_Since_Last_Service'] = (pd.Timestamp.today() - data['Last_Service_Date']).dt.days
data['Warranty_Remaining_Days'] = (data['Warranty_Expiry_Date'] - pd.Timestamp.today()).dt.days

In [None]:
#remove original date columns
data.drop(columns=['Last_Service_Date','Warranty_Expiry_Date'], inplace=True)

In [32]:
# One-hot encoding
data = pd.get_dummies(data, drop_first=True)
data

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Owner_Type,...,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance,aging_factor,Days_Since_Last_Service,Warranty_Remaining_Days
0,4,58765,1,0,4,1,0,2000,28524,1,...,6,3,13.622204,1,1,2,1,8.000000,20506,-20507
1,5,60353,0,1,7,1,0,2500,133630,1,...,7,0,13.625307,1,1,2,1,18.520259,20506,-20507
2,0,68072,2,0,2,1,0,1500,34022,0,...,7,0,14.306302,1,0,2,1,2.828427,20506,-20507
3,0,60849,0,4,5,2,0,2500,81636,1,...,7,3,18.709467,1,2,1,1,11.180340,20506,-20507
4,0,45742,2,5,1,2,1,2000,97162,2,...,6,2,16.977482,0,0,2,1,1.000000,20506,-20507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1,77229,1,3,6,0,0,2500,97404,2,...,4,1,10.357298,1,0,2,1,14.696938,20506,-20507
49996,4,54742,1,4,4,0,1,800,13421,1,...,4,1,14.924022,2,1,0,1,8.000000,20506,-20507
49997,5,75601,2,0,4,1,0,1500,124851,0,...,10,0,19.859243,2,2,2,1,8.000000,20506,-20507
49998,1,66181,1,1,7,1,1,2500,103182,2,...,4,3,10.720555,1,0,2,1,18.520259,20506,-20507
