In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
directory = '/content/gdrive/My Drive/mot_data/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
code_df = pd.read_pickle('/content/gdrive/My Drive/17jul11.pkl') # Load

In [None]:
# CONVERTING KM TO MILES
# This condition finds rows where 'odometerUnit' is 'km'
condition = (code_df['odometerUnit'] == 'km')

# This line does the multiplication where the condition is True
code_df.loc[condition, 'odometerValue'] = (code_df.loc[condition, 'odometerValue'].astype(int) * 1.6).astype(int)

In [None]:
code_df.drop(columns=['vehicleId','primaryColour','registration','odometerResultType','motTestNumber','odometerUnit'], inplace=True)

In [None]:
code_df['completedDate'] = pd.to_datetime(code_df['completedDate'], errors='coerce')
code_df = code_df.dropna(subset=['completedDate'])
code_df['completedDate'] = code_df['completedDate'].fillna(pd.Timestamp.min)

In [None]:
# Convert registrationDate and completedDate to datetime objects if they aren't already
code_df['manufactureDate'] = pd.to_datetime(code_df['manufactureDate'])
code_df['completedDate'] = pd.to_datetime(code_df['completedDate'])


# Calculate vehicle age at the time of test
code_df['vehicle_age'] = (code_df['completedDate'] - code_df['manufactureDate']).dt.days / 365

In [None]:
cols = ['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age']
code_df[cols] = code_df[cols].replace({'':np.nan, ' ':np.nan})

# Now, you can drop the NaN values
code_df.dropna(subset=['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age'], inplace=True)

In [None]:
print(len(code_df))

3722650


In [None]:
code_df.drop(columns=['registrationDate','completedDate', 'manufactureDate','firstUsedDate','expiryDate'], inplace=True)

In [None]:
code_df.to_pickle('/content/gdrive/My Drive/17jul22.pkl') # Save

In [None]:
code_df = pd.read_pickle('/content/gdrive/My Drive/17jul22.pkl') # Load

In [None]:
code_df = code_df[code_df['rfrAndComments'].str.len() != 0]


In [None]:
code_df.drop(columns=['testResult'], inplace=True)

In [None]:
len(code_df)

2131546

In [None]:
# Reduce the dataframe to 80% of its original size
code_df = code_df.sample(frac=0.8, random_state=42)

In [None]:
len(code_df)

1705237

In [None]:
code_df.to_pickle('/content/gdrive/My Drive/18jul1.pkl') # Save

In [None]:
code_df = pd.read_pickle('/content/gdrive/My Drive/18jul1.pkl') # Load

In [None]:
code_df.head()

Unnamed: 0,odometerValue,rfrAndComments,make,model,fuelType,engineSize,vehicle_age
2329843,135147,[{'text': 'Offside Front Track rod end ball jo...,MERCEDES-BENZ,E,Petrol,2398,11.890411
110934,9316,[{'text': 'Parking brake: parking brake effici...,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603
2662876,30481,[{'text': 'Offside Front Tyre worn close to th...,HYUNDAI,MATRIX,Petrol,1599,8.016438
1174241,105032,"[{'text': 'Oil leak, but not excessive (8.4.1 ...",MERCEDES-BENZ,CLK,Petrol,3199,17.80274
2255716,73052,[{'text': 'Rear registration plate deteriorate...,FORD,GALAXY,Diesel,1896,7.772603


In [None]:
tqdm.pandas()

def expand_r(row):
    # If 'rfrAndComments' is empty, create a DataFrame with NaN values
    if not row['rfrAndComments']:
        rfr = pd.DataFrame([{"text": np.nan, "type": np.nan, "dangerous": np.nan}])
    else:
        rfr = pd.json_normalize(row['rfrAndComments'])

    # Add other columns from the row to the DataFrame
    for col in row.index:
        if col != 'rfrAndComments':
            rfr[col] = row[col]
    return rfr

frames = code_df.progress_apply(expand_r, axis=1)

100%|██████████| 1705237/1705237 [47:20<00:00, 600.25it/s]


In [None]:
final_code_df = pd.concat(frames.values, ignore_index=True)

In [None]:
frames = "ok"

In [None]:
print("ok")

ok


In [None]:
chunk_size = 10000  # Adjust this value as necessary based on your system's memory

# Create an empty DataFrame to store the results
final_code_df = pd.DataFrame()

for i in range(0, code_df.shape[0], chunk_size):
    chunk = code_df.iloc[i:i+chunk_size, :]
    frames = chunk.apply(expand_r, axis=1)
    final_code_df = pd.concat([final_code_df, frames])

# Reset the index of the result DataFrame
final_code_df.reset_index(drop=True, inplace=True)

In [None]:
import pickle

In [None]:
with open('/content/gdrive/My Drive/frames.pkl', 'wb') as f:
    pickle.dump(frames, f)

In [None]:
final_code_df.head()

Unnamed: 0,text,type,dangerous,odometerValue,make,model,fuelType,engineSize,vehicle_age
0,Offside Front Track rod end ball joint has sli...,ADVISORY,False,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411
1,Oil leak,USER ENTERED,False,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411
2,Parking brake: parking brake efficiency only j...,ADVISORY,False,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603
3,Offside Front Tyre worn close to the legal lim...,ADVISORY,False,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438
4,"Oil leak, but not excessive (8.4.1 (a) (i))",ADVISORY,False,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274


In [None]:
final_code_df.to_pickle('/content/gdrive/My Drive/18jul2.pkl') # Save

In [None]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/18jul2.pkl') # Load

In [None]:
print(len(final_code_df))

6251453


In [None]:
final_code_df.drop(columns=['type','dangerous'], inplace=True)

In [None]:
# Extract fault codes into a separate series
fault_code_series = final_code_df['text'].str.extract(r'(\d+\.\d+(\.\d+)?(\s\(\w+\)\s\(\w+\))?)')[0]

In [None]:
# Add the series to the dataframe
final_code_df['fault_code'] = fault_code_series

# Remove the text column from the dataframe
final_code_df.drop(columns=['text'], inplace=True)

In [None]:
final_code_df.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
1,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,
2,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603,3.7
3,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438,4.1
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4.1 (a) (i)


In [None]:
final_code_df.to_pickle('/content/gdrive/My Drive/18jul3.pkl') # Save

In [None]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/18jul3.pkl') # Load

In [None]:
final_code_df.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
1,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,
2,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603,3.7
3,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438,4.1
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4.1 (a) (i)


In [None]:
final_code_df['fault_code'] = final_code_df['fault_code'].str.split(' ', 1).str[0]

  final_code_df['fault_code'] = final_code_df['fault_code'].str.split(' ', 1).str[0]


TypeError: ignored

In [None]:
final_code_df.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
1,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,
2,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603,3.7
3,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438,4.1
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4.1


In [None]:
len(final_code_df)

6251453

In [None]:
# First, get the counts for each value in 'fault_code'
counts = final_code_df['fault_code'].value_counts()

# Create a mask for the values that occur three or more times
mask = final_code_df['fault_code'].isin(counts[counts >= 100].index)

# Use this mask to select only the rows that meet this condition
#final_code_df_filtered = final_code_df[mask]
final_code_df = final_code_df[mask]

In [None]:
# Create the conditions
condition1 = final_code_df['fault_code'].str.startswith('1.6')
condition2 = final_code_df['fault_code'].str.startswith('3')
condition3 = final_code_df['fault_code'].str.startswith('4') & ~final_code_df['fault_code'].isin(['4.4.3', '4.7', '4.7.1', '4.7.2'])
condition4 = final_code_df['fault_code'].str.startswith('5.2')
condition5 = final_code_df['fault_code'].str.startswith('8') & ~final_code_df['fault_code'].isin(['8.4', '8.4.1'])
condition6 = final_code_df['fault_code'].str.startswith('0')
condition7 = final_code_df['fault_code'].isin(['1.1.3', '1.1.4', '1.1.6', '1.1.9', '1.1.10', '1.1.19'])

# Combine the conditions
mask = condition1 | condition2 | condition3 | condition4 | condition5 | condition6 | condition7

# Drop the rows that satisfy any of the conditions
final_code_df_filtered = final_code_df[~mask]

In [None]:
final_code_df = "ok"

In [None]:
final_code_df_filtered.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4.1
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1.1.11
6,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1.1.11
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,5.3.1


In [None]:
final_code_df_filtered = final_code_df

In [None]:
len(final_code_df_filtered)

2730176

In [None]:
len(final_code_df_filtered)

2730176

In [None]:
final_code_df_filtered.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
2,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603,3.7
3,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438,4.1
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4.1
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1.1.11


In [None]:
# V4

import re

def remove_second_decimal(s):
    # Remove the second decimal point and everything that follows
    return re.sub(r'(\d+\.\d+)\..*', r'\1', s)

final_code_df_filtered['fault_code'] = final_code_df_filtered['fault_code'].apply(remove_second_decimal)

In [None]:
final_code_df_filtered.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,2.2
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,8.4
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1.1
6,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1.1
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,5.3


In [None]:
code_mapping = {code: i for i, code in enumerate(final_code_df_filtered['fault_code'].unique())}
code_mapping

{'2.2': 0,
 '8.4': 1,
 '1.1': 2,
 '5.3': 3,
 '6.3': 4,
 '7.1': 5,
 '2.3': 6,
 '1.8': 7,
 '2.5': 8,
 '2.1': 9,
 '2.4': 10,
 '4.4': 11,
 '1.5': 12,
 '4.7': 13,
 '1.7': 14,
 '2.6': 15,
 '1.4': 16,
 '2.7': 17,
 '6.1': 18,
 '5.1': 19,
 '7.2': 20,
 '1.2': 21,
 '1.3': 22,
 '7.7': 23,
 '6.2': 24,
 '5.4': 25,
 '7.3': 26,
 '1.9': 27,
 '1.0': 28,
 '7.4': 29,
 '2.0': 30,
 '6.6': 31,
 '7.8': 32,
 '7.12': 33,
 '7.0': 34,
 '6.4': 35,
 '2.8': 36,
 '2.9': 37,
 '6.7': 38}

In [None]:
code_mapping = {code: i for i, code in enumerate(final_code_df_filtered['fault_code'].unique())}
code_mapping

{'2.2': 0,
 '8.4.1': 1,
 '1.1.11': 2,
 '5.3.1': 3,
 '1.1.12': 4,
 '5.3.3': 5,
 '5.3.6': 6,
 '6.3.1': 7,
 '7.1.1': 8,
 '2.3.3': 9,
 '1.1': 10,
 '1.8': 11,
 '2.5': 12,
 '2.1.2': 13,
 '5.3.4': 14,
 '1.1.14': 15,
 '7.1.2': 16,
 '2.4': 17,
 '4.4.3': 18,
 '1.5': 19,
 '4.7.1': 20,
 '1.7.5': 21,
 '2.6.2': 22,
 '1.4': 23,
 '1.4.1': 24,
 '1.4.2': 25,
 '2.1.3': 26,
 '2.7.3': 27,
 '1.7.2': 28,
 '1.1.13': 29,
 '6.1.2': 30,
 '1.5.1': 31,
 '5.1.3': 32,
 '7.2.3': 33,
 '1.2.1': 34,
 '1.3.2': 35,
 '7.1.6': 36,
 '6.1.3': 37,
 '7.7': 38,
 '5.3.2': 39,
 '6.2.2': 40,
 '6.1.1': 41,
 '2.7.2': 42,
 '2.7.4': 43,
 '2.2.2': 44,
 '6.1': 45,
 '5.1.2': 46,
 '6.1.8': 47,
 '6.2': 48,
 '7.2.4': 49,
 '2.3.2': 50,
 '1.2.4': 51,
 '6.2.3': 52,
 '1.1.2': 53,
 '6.3.2': 54,
 '1.1.5': 55,
 '1.3.1': 56,
 '1.7.3': 57,
 '6.1.7': 58,
 '5.4.2': 59,
 '1.2.2': 60,
 '7.3': 61,
 '2.3': 62,
 '1.9.1': 63,
 '4.7.2': 64,
 '1.0': 65,
 '6.2.1': 66,
 '2.3.6': 67,
 '7.4': 68,
 '7.1.4': 69,
 '6.3.4': 70,
 '2.0': 71,
 '2.7.5': 72,
 '6.6.1': 73,


In [None]:
final_code_df_filtered['fault_code'] = final_code_df_filtered['fault_code'].map(code_mapping)

In [None]:
final_code_df_filtered.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,0
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,2
6,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,2
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,3


In [None]:
final_code_df_filtered = final_code_df_filtered.drop_duplicates()

In [None]:
print(len(final_code_df_filtered))

1784053


In [None]:
final_code_df_filtered = "ok"

In [None]:
final_code_df_filtered.to_pickle('/content/gdrive/My Drive/21jul1.pkl') # Save

In [4]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/21jul1.pkl') # Load

In [None]:
final_code_df_filtered.to_pickle('/content/gdrive/My Drive/20jul2.pkl') # Save

In [None]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/20jul2.pkl') # Load

In [None]:
final_code_df_filtered.to_pickle('/content/gdrive/My Drive/20jul1.pkl') # Save

In [None]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/20jul1.pkl') # Load

In [None]:
final_code_df_filtered.to_pickle('/content/gdrive/My Drive/18jul4.pkl') # Save

In [None]:
final_code_df = pd.read_pickle('/content/gdrive/My Drive/18jul4.pkl') # Load

In [5]:
#9th Aug
final_code_df.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,0
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,2
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,3
13,73052,FORD,GALAXY,Diesel,1896,7.772603,4


In [None]:
final_code_df.head()

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,0
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,1
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,2
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,3
13,73052,FORD,GALAXY,Diesel,1896,7.772603,4


In [None]:
len(final_code_df)

1784053

In [5]:
final_code_df.drop(columns=['make','fuelType','engineSize'], inplace=True)

In [6]:
# TEMP
model_counts = final_code_df['model'].value_counts()

# Get the makes with frequency less than 3
model_less_than_three = model_counts[model_counts < 50]

print(model_less_than_three)

SMART PASSION SOFTOUCH(RHD)A    49
SENTRA                          49
412                             49
PONTIAC                         49
RX400 H SE CVT                  49
                                ..
MG BT                            1
SLIVER CLOUD                     1
23A                              1
YN50                             1
OPEN TOURER                      1
Name: model, Length: 6372, dtype: int64


In [7]:
# Get a boolean mask of rows with makes that have a count less than 3
mask2 = final_code_df['model'].isin(model_less_than_three.index)

# Use ~ to negate the mask, selecting only rows with makes that have a count of 3 or more
result_df2 = final_code_df[~mask2]

In [None]:
final_code_df.head()

Unnamed: 0,odometerValue,model,vehicle_age,fault_code
0,135147,E,11.890411,0
4,105032,CLK,17.80274,1
5,105032,CLK,17.80274,2
7,105032,CLK,17.80274,3
13,73052,GALAXY,7.772603,4


In [None]:
# Reduce the dataframe to 80% of its original size
final_code_df = final_code_df.sample(frac=0.1, random_state=42)

In [None]:
len(final_code_df)

188287

In [13]:
#9th aug
final_code_df.head(10)

Unnamed: 0,odometerValue,model,vehicle_age,fault_code
0,135147,E,11.890411,0
4,105032,CLK,17.80274,1
5,105032,CLK,17.80274,2
7,105032,CLK,17.80274,3
13,73052,GALAXY,7.772603,4
14,73052,GALAXY,7.772603,5
15,112532,CIVIC,18.69589,6
20,161820,E,13.808219,2
21,161820,E,13.808219,7
22,161820,E,13.808219,0


In [None]:
final_code_df.head(10)

Unnamed: 0,odometerValue,model,vehicle_age,fault_code
2099756,59044,SHARAN,12.630137,39
6207435,76905,DISCOVERY,15.717808,2
5365797,140910,TRAJET,12.29589,17
2387174,131296,RANGE ROVER,10.252055,10
1138468,55445,C,7.189041,21
4827970,90946,PASSAT,11.367123,12
550510,121808,MINI,17.873973,30
1585327,156037,E,18.547945,3
5443268,58232,316,17.950685,4
126211,71176,CHEROKEE,8.991781,17


In [8]:
!pip install category_encoders



In [9]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.pipeline import Pipeline

In [10]:
# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'engineSize', 'vehicle_age']
cat_attribs_high_cardinality = ['make', 'model']
cat_attribs_low_cardinality = ['fuelType']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality),
    ("cat_low_card", cat_low_card_pipeline, cat_attribs_low_cardinality)
])

In [11]:
#V2

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'vehicle_age']
cat_attribs_high_cardinality = ['model']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality)
])

In [13]:
def cross_val_score_with_progress(pipeline, X, y, cv=5, scoring='accuracy'):
    kf = KFold(n_splits=cv)
    scores = []

    for i, (train_index, val_index) in enumerate(tqdm(kf.split(X), total=cv, desc='Cross-validation')):
        print(f"Starting fold {i+1}")
        train_X, val_X = X.iloc[train_index], X.iloc[val_index]
        train_y, val_y = y.iloc[train_index], y.iloc[val_index]

        pipeline.fit(train_X, train_y)

        if scoring == 'accuracy':
            pred_y = pipeline.predict(val_X)
            score = accuracy_score(val_y, pred_y)
        else:
            # Other scoring methods can be added here
            pass

        print(f"Finished fold {i+1}, score: {score}")
        scores.append(score)

    return np.array(scores)

In [None]:
final_code_df.head(20)

Unnamed: 0,odometerValue,make,model,fuelType,engineSize,vehicle_age,fault_code
0,135147,MERCEDES-BENZ,E,Petrol,2398,11.890411,0
2,9316,MERCEDES-BENZ,S-Class,Petrol,5461,3.972603,1
3,30481,HYUNDAI,MATRIX,Petrol,1599,8.016438,2
4,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,3
5,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,4
6,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,4
7,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,5
8,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,6
9,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,6
10,105032,MERCEDES-BENZ,CLK,Petrol,3199,17.80274,7


In [14]:
#9th aug

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', RandomForestClassifier())])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:   0%|          | 0/5 [02:09<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
#V5

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [05:08<20:33, 308.34s/it]

Finished fold 1, score: 0.16277863996720957
Starting fold 2


Cross-validation:  40%|████      | 2/5 [10:29<15:47, 315.80s/it]

Finished fold 2, score: 0.16314648150808025
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [15:41<10:28, 314.13s/it]

Finished fold 3, score: 0.16498626720103135
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [20:54<05:13, 313.58s/it]

Finished fold 4, score: 0.16444676438440628
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [26:09<00:00, 313.92s/it]

Finished fold 5, score: 0.16506684229702082
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.1641





In [None]:
import pickle

# Save the model
with open('/content/gdrive/My Drive/code_model5.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
#V4

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [1:05:39<4:22:36, 3939.10s/it]

Finished fold 1, score: 0.16223563578782899
Starting fold 2


Cross-validation:  40%|████      | 2/5 [2:07:02<3:09:26, 3788.83s/it]

Finished fold 2, score: 0.16312546199145908
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [3:16:29<2:12:02, 3961.34s/it]

Finished fold 3, score: 0.16494422802051512
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [4:25:24<1:07:10, 4030.03s/it]

Finished fold 4, score: 0.16469199293741768
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [5:33:11<00:00, 3998.31s/it]

Finished fold 5, score: 0.16477957456349318
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.1640





In [None]:
import pickle

# Save the model
with open('/content/gdrive/My Drive/code_model4.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

In [None]:
import pickle
# Load the pickled model
with open('/content/gdrive/My Drive/code_model4.pkl', 'rb') as f:
    pipeline_result = pickle.load(f)

In [None]:
#V3

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [3:02:32<12:10:10, 10952.64s/it]

Finished fold 1, score: 0.14109453991416024
Starting fold 2


Cross-validation:  40%|████      | 2/5 [4:44:21<6:45:09, 8103.09s/it]  

Finished fold 2, score: 0.14076923842939132
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [6:38:04<4:10:37, 7518.97s/it]

Finished fold 3, score: 0.14015846829472314
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [8:42:13<2:04:51, 7491.01s/it]

Finished fold 4, score: 0.14117088618099377
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [10:44:03<00:00, 7728.73s/it]

Finished fold 5, score: 0.14156636504258807
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.1410





In [None]:
import pickle

# Save the model
with open('/content/gdrive/My Drive/code_model3.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
#V2

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [20:15<1:21:03, 1215.92s/it]

Finished fold 1, score: 0.14080860386377217
Starting fold 2


Cross-validation:  40%|████      | 2/5 [31:53<45:32, 910.86s/it]   

Finished fold 2, score: 0.1398791741352984
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [43:24<27:01, 810.60s/it]

Finished fold 3, score: 0.1416384518356237
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [58:25<14:06, 846.28s/it]

Finished fold 4, score: 0.1386509991369581
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [1:10:30<00:00, 846.16s/it]

Finished fold 5, score: 0.13775933609958507
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.1397





In [None]:
import pickle

# Save the model
with open('/content/gdrive/My Drive/code_model2.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [2:11:00<8:44:01, 7860.34s/it]

Finished fold 1, score: 0.1220686872762116
Starting fold 2


Cross-validation:  40%|████      | 2/5 [4:21:01<6:31:16, 7825.63s/it]

Finished fold 2, score: 0.12403550355539866
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [6:35:34<4:24:36, 7938.35s/it]

Finished fold 3, score: 0.1222704120227949
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [8:44:10<2:10:50, 7850.74s/it]

Finished fold 4, score: 0.12280303603399148
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [10:49:38<00:00, 7795.72s/it]

Finished fold 5, score: 0.12212219784653403
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.1227





In [None]:
import pickle

# Save the model
with open('/content/gdrive/My Drive/code_model.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
import pickle
# Load the pickled model
with open('/content/gdrive/My Drive/code_model.pkl', 'rb') as f:
    pipeline_result = pickle.load(f)

In [None]:
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

In [None]:
from tqdm import tqdm

class CustomAccuracyOld:
    def __init__(self, X, y):
        self.X = X.copy()
        self.y = y.copy()
        self.X['fault_code'] = self.y

    def score(self, y_true, y_pred):
        correct = 0
        for i, val in tqdm(enumerate(y_pred), total=len(y_pred), desc='Calculating Accuracy'):
            matching_vehicles = self.X[
                (self.X['make'] == self.X.iloc[i]['make']) &
                (self.X['model'] == self.X.iloc[i]['model']) &
                (self.X['odometerValue'] == self.X.iloc[i]['odometerValue']) &
                (self.X['fuelType'] == self.X.iloc[i]['fuelType']) &
                (self.X['engineSize'] == self.X.iloc[i]['engineSize']) &
                (self.X['vehicle_age'] == self.X.iloc[i]['vehicle_age'])
            ]
            if val in matching_vehicles['fault_code'].values:
                correct += 1
        return correct / len(y_pred)

In [None]:
from tqdm import tqdm

class CustomAccuracy:
    def __init__(self, X, y):
        self.X = X.copy()
        self.y = y.copy()
        self.X['fault_code'] = self.y

        # Create a dictionary for fast lookup
        self.lookup_dict = self.X.groupby(['make', 'model', 'odometerValue', 'fuelType', 'engineSize', 'vehicle_age'])['fault_code'].apply(set).to_dict()

    def score(self, y_true, y_pred):
        correct = 0
        total = 0
        for i, val in tqdm(enumerate(y_pred), total=len(y_pred), desc='Calculating Accuracy'):
            key = tuple(self.X.iloc[i][['make', 'model', 'odometerValue', 'fuelType', 'engineSize', 'vehicle_age']].values)
            if key in self.lookup_dict and val in self.lookup_dict[key]:
                correct += 1
            total += 1
        return correct / total

In [None]:
#V3
from tqdm import tqdm

class CustomAccuracy:
    def __init__(self, X, y):
        self.X = X.copy()
        self.y = y.copy()
        self.X['fault_code'] = self.y

        # Create a dictionary for fast lookup
        self.lookup_dict = self.X.groupby(['model', 'odometerValue', 'vehicle_age'])['fault_code'].apply(set).to_dict()

    def score(self, y_true, y_pred):
        correct = 0
        total = 0
        for i, val in tqdm(enumerate(y_pred), total=len(y_pred), desc='Calculating Accuracy'):
            key = tuple(self.X.iloc[i][['model', 'odometerValue', 'vehicle_age']].values)
            if key in self.lookup_dict and val in self.lookup_dict[key]:
                correct += 1
            total += 1
        return correct / total

In [None]:
val_pred = pipeline_result.predict(val_set)
val_accuracy = accuracy_score(target_val_set, val_pred)
print(f"Validation set accuracy: {val_accuracy:.4f}")

Validation set accuracy: 0.1230


In [None]:
# V4
val_pred = pipeline_result.predict(val_set)
val_accuracy = accuracy_score(target_val_set, val_pred)
print(f"Validation set accuracy: {val_accuracy:.4f}")

Validation set accuracy: 0.1642


In [None]:
# V5
val_pred = pipeline_result.predict(val_set)
val_accuracy = accuracy_score(target_val_set, val_pred)
print(f"Validation set accuracy: {val_accuracy:.4f}")

Validation set accuracy: 0.1642


In [None]:
# V4
val_pred = pipeline_result.predict(val_set)
custom_accuracy = CustomAccuracy(val_set, target_val_set)  # Initialize the CustomAccuracy object
val_accuracy = custom_accuracy.score(target_val_set, val_pred)  # Calculate the accuracy
print(f"Validation set accuracy: {val_accuracy:.4f}")

Calculating Accuracy: 100%|██████████| 356811/356811 [03:44<00:00, 1587.29it/s]

Validation set accuracy: 0.2027





In [None]:
# V5
val_pred = pipeline_result.predict(val_set)
custom_accuracy = CustomAccuracy(val_set, target_val_set)  # Initialize the CustomAccuracy object
val_accuracy = custom_accuracy.score(target_val_set, val_pred)  # Calculate the accuracy
print(f"Validation set accuracy: {val_accuracy:.4f}")

Calculating Accuracy: 100%|██████████| 356811/356811 [03:46<00:00, 1572.90it/s]

Validation set accuracy: 0.2027





In [None]:
# V3
val_pred = pipeline_result.predict(val_set)
custom_accuracy = CustomAccuracy(val_set, target_val_set)  # Initialize the CustomAccuracy object
val_accuracy = custom_accuracy.score(target_val_set, val_pred)  # Calculate the accuracy
print(f"Validation set accuracy: {val_accuracy:.4f}")

Calculating Accuracy: 100%|██████████| 376574/376574 [03:58<00:00, 1577.28it/s]

Validation set accuracy: 0.1706





In [None]:
val_pred = pipeline_result.predict(val_set)
custom_accuracy = CustomAccuracy(val_set, target_val_set)  # Initialize the CustomAccuracy object
val_accuracy = custom_accuracy.score(target_val_set, val_pred)  # Calculate the accuracy
print(f"Validation set accuracy: {val_accuracy:.4f}")

Calculating Accuracy: 100%|██████████| 991444/991444 [10:30<00:00, 1573.54it/s]

Validation set accuracy: 0.1851





In [None]:
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(final_code_df.drop('fault_code', axis=1), final_code_df['fault_code'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/2 [00:00<?, ?it/s]

Starting fold 1


In [None]:
print(len(train_set))
print(len(target_train_set))

198288
198288


In [None]:
from joblib import dump

# Save the model
dump(pipeline_result, 'code_model2.joblib')

In [None]:
from google.colab import files

# Download the file to your local machine
files.download('code_model2.joblib')