In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [5]:
df = pd.read_csv('vehicle_maintenance_data.csv')
df.head()

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,Good,0,4,Electric,Automatic,2000,28524,2023-11-23,2025-06-24,Second,20782,6,3,13.622204,New,New,Weak,1
1,Van,60353,Average,1,7,Electric,Automatic,2500,133630,2023-09-21,2025-06-04,Second,23489,7,0,13.625307,New,New,Weak,1
2,Bus,68072,Poor,0,2,Electric,Automatic,1500,34022,2023-06-27,2025-04-27,First,17979,7,0,14.306302,New,Good,Weak,1
3,Bus,60849,Average,4,5,Petrol,Automatic,2500,81636,2023-08-24,2025-11-05,Second,6220,7,3,18.709467,New,Worn Out,New,1
4,Bus,45742,Poor,5,1,Petrol,Manual,2000,97162,2023-05-25,2025-09-14,Third,16446,6,2,16.977482,Good,Good,Weak,1


In [6]:
df.isna().sum()

Vehicle_Model           0
Mileage                 0
Maintenance_History     0
Reported_Issues         0
Vehicle_Age             0
Fuel_Type               0
Transmission_Type       0
Engine_Size             0
Odometer_Reading        0
Last_Service_Date       0
Warranty_Expiry_Date    0
Owner_Type              0
Insurance_Premium       0
Service_History         0
Accident_History        0
Fuel_Efficiency         0
Tire_Condition          0
Brake_Condition         0
Battery_Status          0
Need_Maintenance        0
dtype: int64

In [7]:
df.dtypes

Vehicle_Model            object
Mileage                   int64
Maintenance_History      object
Reported_Issues           int64
Vehicle_Age               int64
Fuel_Type                object
Transmission_Type        object
Engine_Size               int64
Odometer_Reading          int64
Last_Service_Date        object
Warranty_Expiry_Date     object
Owner_Type               object
Insurance_Premium         int64
Service_History           int64
Accident_History          int64
Fuel_Efficiency         float64
Tire_Condition           object
Brake_Condition          object
Battery_Status           object
Need_Maintenance          int64
dtype: object

In [9]:
non_num_cols = df.select_dtypes(include=['object']).columns
non_num_cols

Index(['Vehicle_Model', 'Maintenance_History', 'Fuel_Type',
       'Transmission_Type', 'Last_Service_Date', 'Warranty_Expiry_Date',
       'Owner_Type', 'Tire_Condition', 'Brake_Condition', 'Battery_Status'],
      dtype='object')

In [10]:
for col in non_num_cols:
  print(col , df[col].unique())
  print()

Vehicle_Model ['Truck' 'Van' 'Bus' 'Motorcycle' 'SUV' 'Car']

Maintenance_History ['Good' 'Average' 'Poor']

Fuel_Type ['Electric' 'Petrol' 'Diesel']

Transmission_Type ['Automatic' 'Manual']

Last_Service_Date ['2023-11-23' '2023-09-21' '2023-06-27' '2023-08-24' '2023-05-25'
 '2023-08-12' '2024-01-13' '2023-05-12' '2023-04-07' '2023-08-05'
 '2023-11-08' '2023-07-03' '2024-02-19' '2023-04-11' '2023-05-07'
 '2023-09-17' '2023-11-24' '2023-09-23' '2024-01-23' '2023-06-15'
 '2023-07-14' '2023-04-30' '2023-06-12' '2023-12-10' '2023-10-25'
 '2023-10-27' '2023-05-22' '2023-12-22' '2023-09-04' '2024-01-09'
 '2023-06-29' '2023-11-19' '2023-08-19' '2023-06-07' '2023-12-31'
 '2023-08-31' '2024-02-11' '2023-10-17' '2024-02-16' '2023-10-15'
 '2023-11-01' '2023-08-20' '2023-06-26' '2023-09-27' '2023-11-06'
 '2023-10-26' '2023-05-30' '2023-04-14' '2023-11-17' '2023-05-15'
 '2023-12-09' '2023-09-08' '2024-02-23' '2023-08-23' '2023-09-26'
 '2023-10-11' '2024-02-22' '2023-05-31' '2023-09-05' '2024-01-0

In [20]:
#Label encoding - Maintenance_History, Transmission_Type, Owner_Type, Tire_Condition, Brake_Condition, Battery_Status
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['Maintenance_History']=le.fit_transform(df['Maintenance_History'])
df['Transmission_Type']=le.fit_transform(df['Transmission_Type'])
df['Owner_Type']=le.fit_transform(df['Owner_Type'])
df['Tire_Condition']=le.fit_transform(df['Tire_Condition'])
df['Brake_Condition']=le.fit_transform(df['Brake_Condition'])
df['Battery_Status']=le.fit_transform(df['Battery_Status'])
df.head(10)

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,1,0,4,Electric,2,2000,28524,2023-11-23,2025-06-24,1,20782,6,3,13.622204,1,1,2,1
1,Van,60353,0,1,7,Electric,2,2500,133630,2023-09-21,2025-06-04,1,23489,7,0,13.625307,1,1,2,1
2,Bus,68072,2,0,2,Electric,2,1500,34022,2023-06-27,2025-04-27,0,17979,7,0,14.306302,1,0,2,1
3,Bus,60849,0,4,5,Petrol,1,2500,81636,2023-08-24,2025-11-05,1,6220,7,3,18.709467,1,2,1,1
4,Bus,45742,2,5,1,Petrol,2,2000,97162,2023-05-25,2025-09-14,2,16446,6,2,16.977482,0,0,2,1
5,Truck,31653,0,2,1,Diesel,1,800,70954,2023-08-12,2024-09-05,2,16813,5,3,15.954422,2,0,1,0
6,Motorcycle,51211,0,2,8,Diesel,1,2500,145563,2024-01-13,2025-07-20,1,21057,10,0,16.455703,1,0,1,0
7,Van,79093,2,2,2,Petrol,1,2000,132354,2023-05-12,2026-02-13,0,6498,3,1,12.128404,0,1,1,1
8,SUV,59673,1,2,6,Electric,2,800,85733,2023-04-07,2025-04-21,0,12787,9,1,11.558027,2,0,2,1
9,Bus,37001,1,2,9,Diesel,1,1500,8554,2023-08-05,2025-05-14,2,20860,9,1,12.787248,2,1,1,0


In [12]:
df['Last_Service_Date'] = pd.to_datetime(df['Last_Service_Date'])
df['Warranty_Expiry_Date'] = pd.to_datetime(df['Warranty_Expiry_Date'])

In [13]:
df.dtypes

Vehicle_Model                   object
Mileage                          int64
Maintenance_History             object
Reported_Issues                  int64
Vehicle_Age                      int64
Fuel_Type                       object
Transmission_Type               object
Engine_Size                      int64
Odometer_Reading                 int64
Last_Service_Date       datetime64[ns]
Warranty_Expiry_Date    datetime64[ns]
Owner_Type                      object
Insurance_Premium                int64
Service_History                  int64
Accident_History                 int64
Fuel_Efficiency                float64
Tire_Condition                  object
Brake_Condition                 object
Battery_Status                  object
Need_Maintenance                 int64
dtype: object

In [42]:
#One hot encoding - Vehicle_Model , Fuel_Type
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
encoded=encoder.fit_transform(df[['Fuel_Type','Vehicle_Model']])

type(encoded)

scipy.sparse._csr.csr_matrix

In [None]:
#Features(X) - all except Need_Maintenance
#Target(y) - Need_Maintenance
X=df.drop(['Need_Maintenance'],axis=1)
y=df['Need_Maintenance']

In [46]:
# encode categorical columns
encoder=OneHotEncoder()
encoded=encoder.fit_transform(X[['Fuel_Type','Vehicle_Model']])
encoded_df=pd.DataFrame(
  encoded,
  columns=encoder.get_feature_names_out(['Fuel_Type', 'Vehicle_Model'])
)

X = X.drop(columns=['Fuel_Type'])
X = pd.concat([X.reset_index(drop=True), encoded_df], axis=1)

ValueError: Shape of passed values is (50000, 1), indices imply (50000, 9)

In [34]:
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [36]:
df

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,1,0,4,Electric,2,2000,28524,2023-11-23,2025-06-24,1,20782,6,3,13.622204,1,1,2,1
1,Van,60353,0,1,7,Electric,2,2500,133630,2023-09-21,2025-06-04,1,23489,7,0,13.625307,1,1,2,1
2,Bus,68072,2,0,2,Electric,2,1500,34022,2023-06-27,2025-04-27,0,17979,7,0,14.306302,1,0,2,1
3,Bus,60849,0,4,5,Petrol,1,2500,81636,2023-08-24,2025-11-05,1,6220,7,3,18.709467,1,2,1,1
4,Bus,45742,2,5,1,Petrol,2,2000,97162,2023-05-25,2025-09-14,2,16446,6,2,16.977482,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Car,77229,1,3,6,Diesel,2,2500,97404,2023-11-24,2025-09-15,2,5217,4,1,10.357298,1,0,2,1
49996,Truck,54742,1,4,4,Diesel,0,800,13421,2023-07-14,2025-01-21,1,15856,4,1,14.924022,2,1,0,1
49997,Van,75601,2,0,4,Electric,2,1500,124851,2023-09-09,2024-11-09,0,6019,10,0,19.859243,2,2,2,1
49998,Car,66181,1,1,7,Electric,2,2500,103182,2023-08-26,2025-05-22,2,5030,4,3,10.720555,1,0,2,1


In [35]:
#import algo - logistic regression
#Train model (.fit())
#Predict on test set
#evaluate - classification report, confusion matrix, accuracy score

model=LogisticRegression()
model.fit(X_train,y_train)

ValueError: could not convert string to float: 'Car'

In [None]:
#import algo - decision tree

#Train model (.fit())

#Predict on test set

#evaluate - classification report, confusion matrix, accuracy score

In [None]:
#Save the model using joblib or pickle
#joblib.dump(logistic_model, 'logistic_model.pkl')