#Import

In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
directory = '/content/gdrive/My Drive/mot_data/'

Mounted at /content/gdrive


#Dataframe

In [None]:
# Get the list of all files
files = [f for f in os.listdir(directory) if f.endswith(".json")]

In [None]:
# List to store DataFrames
dfs = []

# Iterate over each file and save each file to a dataframe
for filename in tqdm(files):
    try:
        with open(os.path.join(directory, filename)) as f:
            # Load the JSON data
            data = json.load(f)
            # For each vehicle, normalize its 'motTests' and drop the 'motTests' column
            for vehicle in data:
                try:
                    df = pd.json_normalize(vehicle, record_path='motTests', meta=[x for x in vehicle.keys() if x != 'motTests'])
                    dfs.append(df)
                except AttributeError:
                    print(f'Unexpected item format in file: {filename}')
    except json.JSONDecodeError:
        print(f'Error decoding JSON from file: {filename}')

100%|██████████| 1432/1432 [21:47<00:00,  1.10it/s]


In [None]:
# Concatenate all the DataFrames
final_df = pd.concat(dfs, ignore_index=True)

In [None]:
final_df.head()

Unnamed: 0,completedDate,testResult,expiryDate,odometerValue,odometerUnit,motTestNumber,odometerResultType,rfrAndComments,registration,make,model,firstUsedDate,fuelType,primaryColour,vehicleId,registrationDate,manufactureDate,engineSize
0,2011.11.04 16:28:40,PASSED,2012.11.15,85979,mi,995768201348,READ,[],DK52PJV,CITROEN,C15,2002.09.27,Diesel,White,iG510ZODBCm9eo-espVfmA==,2002.09.27,2002.09.27,1769
1,2011.11.04 09:56:39,FAILED,,85978,mi,286738801308,READ,[{'text': 'Front Windscreen wiper does not cle...,DK52PJV,CITROEN,C15,2002.09.27,Diesel,White,iG510ZODBCm9eo-espVfmA==,2002.09.27,2002.09.27,1769
2,2010.11.09 17:13:45,PASSED,2011.11.15,72396,mi,802553910308,READ,[],DK52PJV,CITROEN,C15,2002.09.27,Diesel,White,iG510ZODBCm9eo-espVfmA==,2002.09.27,2002.09.27,1769
3,2010.11.09 10:15:52,FAILED,,72396,mi,100593610353,READ,[{'text': 'Brake pedal anti-slip provision mis...,DK52PJV,CITROEN,C15,2002.09.27,Diesel,White,iG510ZODBCm9eo-espVfmA==,2002.09.27,2002.09.27,1769
4,2009.11.16 12:55:19,PASSED,2010.11.15,60546,mi,922800429343,READ,[],DK52PJV,CITROEN,C15,2002.09.27,Diesel,White,iG510ZODBCm9eo-espVfmA==,2002.09.27,2002.09.27,1769


In [None]:
final_df.to_pickle('/content/gdrive/My Drive/17jul11.pkl') # Save

In [None]:
print(len(final_df))

3821154


In [None]:
# CONVERTING KM TO MILES
# This condition finds rows where 'odometerUnit' is 'km'
condition = (final_df['odometerUnit'] == 'km')

# This line does the multiplication where the condition is True
final_df.loc[condition, 'odometerValue'] = (final_df.loc[condition, 'odometerValue'].astype(int) * 1.6).astype(int)

In [None]:
final_df.drop(columns=['vehicleId','primaryColour','registration','odometerResultType','motTestNumber','odometerUnit'], inplace=True)

In [None]:
# Convert registrationDate and completedDate to datetime objects if they aren't already
final_df['manufactureDate'] = pd.to_datetime(final_df['manufactureDate'])
final_df['completedDate'] = pd.to_datetime(final_df['completedDate'])


# Calculate vehicle age at the time of test
final_df['vehicle_age'] = (final_df['completedDate'] - final_df['manufactureDate']).dt.days / 365

In [None]:
cols = ['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age']
final_df[cols] = final_df[cols].replace({'':np.nan, ' ':np.nan})

# Now, you can drop the NaN values
final_df.dropna(subset=['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age'], inplace=True)

In [None]:
print(len(final_df))

1197313


In [None]:
final_df.head()

Unnamed: 0,completedDate,testResult,expiryDate,odometerValue,rfrAndComments,make,model,firstUsedDate,fuelType,registrationDate,manufactureDate,engineSize,vehicle_age
0,2011-11-04 16:28:40,PASSED,2012.11.15,85979,[],CITROEN,C15,2002.09.27,Diesel,2002.09.27,2002-09-27,1769,9.109589
1,2011-11-04 09:56:39,FAILED,,85978,[{'text': 'Front Windscreen wiper does not cle...,CITROEN,C15,2002.09.27,Diesel,2002.09.27,2002-09-27,1769,9.109589
2,2010-11-09 17:13:45,PASSED,2011.11.15,72396,[],CITROEN,C15,2002.09.27,Diesel,2002.09.27,2002-09-27,1769,8.123288
3,2010-11-09 10:15:52,FAILED,,72396,[{'text': 'Brake pedal anti-slip provision mis...,CITROEN,C15,2002.09.27,Diesel,2002.09.27,2002-09-27,1769,8.123288
4,2009-11-16 12:55:19,PASSED,2010.11.15,60546,[],CITROEN,C15,2002.09.27,Diesel,2002.09.27,2002-09-27,1769,7.142466


In [None]:
final_df.drop(columns=['registrationDate','completedDate', 'manufactureDate','firstUsedDate','expiryDate'], inplace=True)

In [None]:
final_df.head()

Unnamed: 0,testResult,odometerValue,rfrAndComments,make,model,fuelType,engineSize,vehicle_age
0,PASSED,85979,[],CITROEN,C15,Diesel,1769,9.109589
1,FAILED,85978,[{'text': 'Front Windscreen wiper does not cle...,CITROEN,C15,Diesel,1769,9.109589
2,PASSED,72396,[],CITROEN,C15,Diesel,1769,8.123288
3,FAILED,72396,[{'text': 'Brake pedal anti-slip provision mis...,CITROEN,C15,Diesel,1769,8.123288
4,PASSED,60546,[],CITROEN,C15,Diesel,1769,7.142466


In [None]:
final_df.to_pickle('/content/gdrive/My Drive/17jul12.pkl') # Save

In [None]:
print(len(final_df))

1197313


In [None]:
final_df.drop(columns=['rfrAndComments'], inplace=True)

In [None]:
final_df.head(10)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
0,PASSED,85979,CITROEN,C15,Diesel,1769,9.109589
1,FAILED,85978,CITROEN,C15,Diesel,1769,9.109589
2,PASSED,72396,CITROEN,C15,Diesel,1769,8.123288
3,FAILED,72396,CITROEN,C15,Diesel,1769,8.123288
4,PASSED,60546,CITROEN,C15,Diesel,1769,7.142466
5,PASSED,45228,CITROEN,C15,Diesel,1769,5.956164
6,PASSED,33248,CITROEN,C15,Diesel,1769,4.961644
7,FAILED,33248,CITROEN,C15,Diesel,1769,4.958904
8,PASSED,93816,TOYOTA,AURIS,Diesel,1998,7.136986
9,PASSED,57687,TOYOTA,AURIS,Diesel,1998,5.882192


In [None]:
final_df.to_pickle('/content/gdrive/My Drive/17jul13.pkl') # Save

#PreProcessor

In [9]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.2


In [10]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.pipeline import Pipeline

In [11]:
# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'engineSize', 'vehicle_age']
cat_attribs_high_cardinality = ['make', 'model']
cat_attribs_low_cardinality = ['fuelType']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality),
    ("cat_low_card", cat_low_card_pipeline, cat_attribs_low_cardinality)
])

In [12]:
def cross_val_score_with_progress(pipeline, X, y, cv=5, scoring='accuracy'):
    kf = KFold(n_splits=cv)
    scores = []

    for i, (train_index, val_index) in enumerate(tqdm(kf.split(X), total=cv, desc='Cross-validation')):
        print(f"Starting fold {i+1}")
        train_X, val_X = X.iloc[train_index], X.iloc[val_index]
        train_y, val_y = y.iloc[train_index], y.iloc[val_index]

        pipeline.fit(train_X, train_y)

        if scoring == 'accuracy':
            pred_y = pipeline.predict(val_X)
            score = accuracy_score(val_y, pred_y)
        else:
            # Other scoring methods can be added here
            pass

        print(f"Finished fold {i+1}, score: {score}")
        scores.append(score)

    return np.array(scores)

#Pass or Fail

In [None]:
result_mapping = {code: i for i, code in enumerate(final_df['testResult'].unique())}
result_mapping

{'PASSED': 0, 'FAILED': 1}

In [None]:
result_df = final_df.copy()

In [None]:
result_df['testResult'] = result_df['testResult'].map(result_mapping)
result_df.head(10)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
0,0,85979,CITROEN,C15,Diesel,1769,9.109589
1,1,85978,CITROEN,C15,Diesel,1769,9.109589
2,0,72396,CITROEN,C15,Diesel,1769,8.123288
3,1,72396,CITROEN,C15,Diesel,1769,8.123288
4,0,60546,CITROEN,C15,Diesel,1769,7.142466
5,0,45228,CITROEN,C15,Diesel,1769,5.956164
6,0,33248,CITROEN,C15,Diesel,1769,4.961644
7,1,33248,CITROEN,C15,Diesel,1769,4.958904
8,0,93816,TOYOTA,AURIS,Diesel,1998,7.136986
9,0,57687,TOYOTA,AURIS,Diesel,1998,5.882192


In [None]:
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df.drop('testResult', axis=1), result_df['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=0.00001))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:22<01:28, 22.19s/it]

Finished fold 1, score: 0.7589497311687634
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:40<00:59, 19.78s/it]

Finished fold 2, score: 0.7585008091037219
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:57<00:37, 18.83s/it]

Finished fold 3, score: 0.7578065459101112
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [01:17<00:18, 18.97s/it]

Finished fold 4, score: 0.7583703085034191
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [01:34<00:00, 18.94s/it]

Finished fold 5, score: 0.759121991961163





L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7585


In [None]:
# Further Testing 9th Aug
result_df2 = pd.read_pickle('/content/gdrive/My Drive/17jul15.pkl') # Load

In [None]:
result_df2.head()

Unnamed: 0,testResult,odometerValue,make,model,fuelType,vehicleId,engineSize,vehicle_age
0,PASSED,63394,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,16.454795
1,PASSED,61969,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,15.438356
2,PASSED,57499,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,14.169863
3,PASSED,52275,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,13.115068
5,FAILED,51445,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,11.758904


In [None]:
result_df2.drop(columns=['vehicleId'], inplace=True)

In [None]:
result_df = result_df2

In [None]:
# Further Testing 9th Aug
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df.drop('testResult', axis=1), result_df['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Output hidden; open in https://colab.research.google.com to view.

In [None]:
result_df2 = pd.read_pickle('/content/gdrive/My Drive/17jul11.pkl') # Load

In [None]:
len(result_df2)

3821152

In [None]:
result_df2.head()

Unnamed: 0,completedDate,testResult,expiryDate,odometerValue,odometerUnit,motTestNumber,odometerResultType,rfrAndComments,registration,make,model,firstUsedDate,fuelType,primaryColour,vehicleId,registrationDate,manufactureDate,engineSize
0,2023.04.06 09:29:02,PASSED,2024.04.05,63394,mi,349231095921,READ,[],G7VSJ,BENTLEY,CONTINENTAL,2006.10.26,Petrol,Silver,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
1,2022.03.31 10:13:22,PASSED,2023.03.30,61969,mi,762778382862,READ,[],G7VSJ,BENTLEY,CONTINENTAL,2006.10.26,Petrol,Silver,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
2,2020.12.23 16:23:11,PASSED,2021.12.22,57499,mi,304831243538,READ,[],G7VSJ,BENTLEY,CONTINENTAL,2006.10.26,Petrol,Silver,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
3,2019.12.04 11:37:37,PASSED,2020.12.03,52275,mi,529772850269,READ,[],G7VSJ,BENTLEY,CONTINENTAL,2006.10.26,Petrol,Silver,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
4,2018.07.27 10:37:00,PASSED,2019.08.08,51445,mi,868646372084,READ,[],G7VSJ,BENTLEY,CONTINENTAL,2006.10.26,Petrol,Silver,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998


In [None]:
result_df2.drop(columns=['primaryColour','registration','odometerResultType','motTestNumber'], inplace=True)

In [None]:
result_df2.drop(columns=['rfrAndComments','expiryDate'], inplace=True)

In [None]:
result_df2['completedDate'] = pd.to_datetime(result_df2['completedDate'], errors='coerce')
result_df2 = result_df2.dropna(subset=['completedDate'])
result_df2['completedDate'] = result_df2['completedDate'].fillna(pd.Timestamp.min)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df2['completedDate'] = result_df2['completedDate'].fillna(pd.Timestamp.min)


In [None]:
len(result_df2)

3821152

In [None]:
result_df2.head()

Unnamed: 0,completedDate,testResult,odometerValue,odometerUnit,make,model,firstUsedDate,fuelType,vehicleId,registrationDate,manufactureDate,engineSize
0,2023-04-06 09:29:02,PASSED,63394,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
1,2022-03-31 10:13:22,PASSED,61969,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
2,2020-12-23 16:23:11,PASSED,57499,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
3,2019-12-04 11:37:37,PASSED,52275,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998
4,2018-07-27 10:37:00,PASSED,51445,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998


In [None]:
result_df2 = result_df2.sort_values(['vehicleId', 'completedDate'])

In [None]:
result_df2['within_14_days'] = result_df2.groupby('vehicleId')['completedDate'].diff().dt.days.lt(28)

In [None]:
result_df2 = result_df2[~result_df2['within_14_days']]

In [None]:
result_df2.head()

Unnamed: 0,completedDate,testResult,odometerValue,odometerUnit,make,model,firstUsedDate,fuelType,vehicleId,registrationDate,manufactureDate,engineSize,within_14_days
1491676,2009-07-08 14:37:08,PASSED,54059,mi,MG,A,1960.07.06,Petrol,---KvQbCM6tnWNeiaE1f1A==,1960.07.06,1960.07.06,1622,False
1491674,2010-07-06 14:35:16,PASSED,54393,mi,MG,A,1960.07.06,Petrol,---KvQbCM6tnWNeiaE1f1A==,1960.07.06,1960.07.06,1622,False
1491673,2011-09-09 13:42:23,PASSED,54553,mi,MG,A,1960.07.06,Petrol,---KvQbCM6tnWNeiaE1f1A==,1960.07.06,1960.07.06,1622,False
1491672,2012-09-07 13:39:39,PASSED,54591,mi,MG,A,1960.07.06,Petrol,---KvQbCM6tnWNeiaE1f1A==,1960.07.06,1960.07.06,1622,False
1491671,2013-09-09 14:10:08,PASSED,54680,mi,MG,A,1960.07.06,Petrol,---KvQbCM6tnWNeiaE1f1A==,1960.07.06,1960.07.06,1622,False


In [None]:
# If you want to restore the original order
result_df2 = result_df2.sort_index()

In [None]:
result_df2.head()

Unnamed: 0,completedDate,testResult,odometerValue,odometerUnit,make,model,firstUsedDate,fuelType,vehicleId,registrationDate,manufactureDate,engineSize,within_14_days
0,2023-04-06 09:29:02,PASSED,63394,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998,False
1,2022-03-31 10:13:22,PASSED,61969,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998,False
2,2020-12-23 16:23:11,PASSED,57499,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998,False
3,2019-12-04 11:37:37,PASSED,52275,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998,False
5,2018-07-27 10:36:59,FAILED,51445,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006.10.26,5998,False


In [None]:
len(result_df2)

2952627

In [None]:
result_df2.to_pickle('/content/gdrive/My Drive/17jul14.pkl') # Save

In [None]:
# CONVERTING KM TO MILES
# This condition finds rows where 'odometerUnit' is 'km'
condition = (result_df2['odometerUnit'] == 'km')

# This line does the multiplication where the condition is True
result_df2.loc[condition, 'odometerValue'] = (result_df2.loc[condition, 'odometerValue'].astype(int) * 1.6).astype(int)

# Convert registrationDate and completedDate to datetime objects if they aren't already
result_df2['manufactureDate'] = pd.to_datetime(result_df2['manufactureDate'])
result_df2['completedDate'] = pd.to_datetime(result_df2['completedDate'])


# Calculate vehicle age at the time of test
result_df2['vehicle_age'] = (result_df2['completedDate'] - result_df2['manufactureDate']).dt.days / 365

cols = ['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age']
result_df2[cols] = result_df2[cols].replace({'':np.nan, ' ':np.nan})

# Now, you can drop the NaN values
result_df2.dropna(subset=['testResult','odometerValue','make','model','fuelType','engineSize','vehicle_age'], inplace=True)

In [None]:
result_df2.head()

Unnamed: 0,completedDate,testResult,odometerValue,odometerUnit,make,model,firstUsedDate,fuelType,vehicleId,registrationDate,manufactureDate,engineSize,within_14_days,vehicle_age
0,2023-04-06 09:29:02,PASSED,63394,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006-10-26,5998,False,16.454795
1,2022-03-31 10:13:22,PASSED,61969,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006-10-26,5998,False,15.438356
2,2020-12-23 16:23:11,PASSED,57499,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006-10-26,5998,False,14.169863
3,2019-12-04 11:37:37,PASSED,52275,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006-10-26,5998,False,13.115068
5,2018-07-27 10:36:59,FAILED,51445,mi,BENTLEY,CONTINENTAL,2006.10.26,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,2006.10.26,2006-10-26,5998,False,11.758904


In [None]:
result_df2.drop(columns=['registrationDate','completedDate', 'manufactureDate','firstUsedDate','odometerUnit'], inplace=True)

In [None]:
result_df2.drop(columns=['within_14_days'], inplace=True)

In [None]:
result_df2.head()

Unnamed: 0,testResult,odometerValue,make,model,fuelType,vehicleId,engineSize,vehicle_age
0,PASSED,63394,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,16.454795
1,PASSED,61969,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,15.438356
2,PASSED,57499,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,14.169863
3,PASSED,52275,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,13.115068
5,FAILED,51445,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,11.758904


In [None]:
result_df2.to_pickle('/content/gdrive/My Drive/17jul15.pkl') # Save

In [3]:
result_df2 = pd.read_pickle('/content/gdrive/My Drive/17jul15.pkl') # Load

In [4]:
result_mapping = {code: i for i, code in enumerate(result_df2['testResult'].unique())}

result_df2['testResult'] = result_df2['testResult'].map(result_mapping)
result_df2.head(10)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,vehicleId,engineSize,vehicle_age
0,0,63394,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,16.454795
1,0,61969,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,15.438356
2,0,57499,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,14.169863
3,0,52275,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,13.115068
5,1,51445,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,11.758904
6,0,50821,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,10.794521
7,0,45596,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,7.654795
8,0,43318,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,6.690411
9,0,41751,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,5.712329
10,0,39200,BENTLEY,CONTINENTAL,Petrol,-sqLf7Z0X3u5qjvB0luMEw==,5998,4.986301


In [5]:
len(result_df2)

2871579

In [None]:
len(result_df2)

920930

In [6]:
result_df2.drop(columns=['vehicleId'], inplace=True)

In [None]:
# TEMP
result_df2.drop(columns=['fuelType'], inplace=True)

In [7]:
result_df2.head(10)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
0,0,63394,BENTLEY,CONTINENTAL,Petrol,5998,16.454795
1,0,61969,BENTLEY,CONTINENTAL,Petrol,5998,15.438356
2,0,57499,BENTLEY,CONTINENTAL,Petrol,5998,14.169863
3,0,52275,BENTLEY,CONTINENTAL,Petrol,5998,13.115068
5,1,51445,BENTLEY,CONTINENTAL,Petrol,5998,11.758904
6,0,50821,BENTLEY,CONTINENTAL,Petrol,5998,10.794521
7,0,45596,BENTLEY,CONTINENTAL,Petrol,5998,7.654795
8,0,43318,BENTLEY,CONTINENTAL,Petrol,5998,6.690411
9,0,41751,BENTLEY,CONTINENTAL,Petrol,5998,5.712329
10,0,39200,BENTLEY,CONTINENTAL,Petrol,5998,4.986301


In [None]:
# TEMP
make_counts = result_df2['make'].value_counts()

# Get the makes with frequency less than 3
makes_less_than_three = make_counts[make_counts < 10]

print(makes_less_than_three)

RAM/LRREPLICA XKSS    9
ARROW                 9
ARMSTRONG SLDDLEY     9
SPECAIL               9
YUGO                  9
                     ..
FRANCES-BARNETT       1
85-D-33               1
DUKKOPP               1
ASTON MARTON          1
ALTON 3;8L            1
Name: make, Length: 738, dtype: int64


In [None]:
len(result_df2)

2871579

In [None]:
# TEMP

# Get a boolean mask of rows with makes that have a count less than 3
mask = result_df2['make'].isin(makes_less_than_three.index)

# Use ~ to negate the mask, selecting only rows with makes that have a count of 3 or more
result_df2 = result_df2[~mask]

In [None]:
len(result_df2)

2868745

In [None]:
# TEMP
model_counts = result_df2['model'].value_counts()

# Get the makes with frequency less than 3
model_less_than_three = model_counts[model_counts < 20]

print(model_less_than_three)

SUPER-ROCKET                 19
MJ                           19
VEGAS                        19
ZRX1200S                     19
R 300                        19
                             ..
SCORPIONE 1300S               1
DROP  HEAD                    1
A65T                          1
DIABLO VT ROADSTER            1
CARAVELLE SE TDI 180 AUTO     1
Name: model, Length: 7429, dtype: int64


In [None]:
# TEMP

# Get a boolean mask of rows with makes that have a count less than 3
mask2 = result_df2['model'].isin(model_less_than_three.index)

# Use ~ to negate the mask, selecting only rows with makes that have a count of 3 or more
result_df2 = result_df2[~mask2]

In [None]:
len(result_df2)

2817582

In [None]:
# TEMP

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'engineSize', 'vehicle_age']
cat_attribs_high_cardinality = ['make', 'model']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor2 = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality)
])

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor2), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:15<01:00, 15.21s/it]

Finished fold 1, score: 0.7284488293688364
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:30<00:45, 15.08s/it]

Finished fold 2, score: 0.7280461766491894
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:45<00:30, 15.22s/it]

Finished fold 3, score: 0.7283639458225325
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [01:00<00:15, 15.18s/it]

Finished fold 4, score: 0.7286201823041363
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [01:16<00:00, 15.22s/it]

Finished fold 5, score: 0.7279215239023881
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7283





In [None]:
# TEMP

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'engineSize', 'vehicle_age']
cat_attribs_high_cardinality = ['make', 'model']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor2 = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality)
])

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor2), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:14<00:58, 14.53s/it]

Finished fold 1, score: 0.7264564242823521
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:29<00:43, 14.51s/it]

Finished fold 2, score: 0.7269000672119038
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:43<00:29, 14.53s/it]

Finished fold 3, score: 0.7259905992063228
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:58<00:14, 14.51s/it]

Finished fold 4, score: 0.7270353783054171
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [01:13<00:00, 14.61s/it]

Finished fold 5, score: 0.7270353783054171
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7267





In [None]:
# TEMP
result_df2.drop(columns=['engineSize'], inplace=True)
result_df2.head()

Unnamed: 0,testResult,odometerValue,make,model,vehicle_age
0,0,63394,BENTLEY,CONTINENTAL,16.454795
1,0,61969,BENTLEY,CONTINENTAL,15.438356
2,0,57499,BENTLEY,CONTINENTAL,14.169863
3,0,52275,BENTLEY,CONTINENTAL,13.115068
5,1,51445,BENTLEY,CONTINENTAL,11.758904


In [None]:
# TEMP

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'vehicle_age']
cat_attribs_high_cardinality = ['make', 'model']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor2 = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality)
])

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor2), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:13<00:52, 13.06s/it]

Finished fold 1, score: 0.7264342421358745
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:25<00:38, 12.98s/it]

Finished fold 2, score: 0.7269089400704949
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:38<00:25, 12.90s/it]

Finished fold 3, score: 0.7259772899184362
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:51<00:12, 12.99s/it]

Finished fold 4, score: 0.7270154143735873
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [01:05<00:00, 13.06s/it]

Finished fold 5, score: 0.7270619968811902
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7267





In [None]:
# TEMP
result_df2.drop(columns=['make'], inplace=True)
result_df2.head()

Unnamed: 0,testResult,odometerValue,model,vehicle_age
0,0,63394,CONTINENTAL,16.454795
1,0,61969,CONTINENTAL,15.438356
2,0,57499,CONTINENTAL,14.169863
3,0,52275,CONTINENTAL,13.115068
5,1,51445,CONTINENTAL,11.758904


In [None]:
# TEMP

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'vehicle_age']
cat_attribs_high_cardinality = ['model']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

cat_high_card_pipeline = Pipeline([
    ('target_encoder', TargetEncoder())
])

cat_low_card_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor2 = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat_high_card", cat_high_card_pipeline, cat_attribs_high_cardinality)
])

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor2), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:07<00:31,  7.82s/it]

Finished fold 1, score: 0.7264985703606596
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:15<00:23,  7.90s/it]

Finished fold 2, score: 0.72682464791388
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:23<00:15,  7.83s/it]

Finished fold 3, score: 0.7260371817139257
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:31<00:07,  7.86s/it]

Finished fold 4, score: 0.7269399950755635
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [00:39<00:00,  7.88s/it]

Finished fold 5, score: 0.7269510861488023
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7267





In [None]:
# TEMP
result_df2.drop(columns=['model'], inplace=True)
result_df2.head()

Unnamed: 0,testResult,odometerValue,vehicle_age
0,0,63394,16.454795
1,0,61969,15.438356
2,0,57499,14.169863
3,0,52275,13.115068
5,1,51445,11.758904


In [None]:
# TEMP

# specify columns to be preprocessed and their corresponding preprocessing methods
num_attribs = ['odometerValue', 'vehicle_age']

num_pipeline = Pipeline([
    ('robust_scaler', RobustScaler())
])

preprocessor2 = ColumnTransformer([
    ("num", num_pipeline, num_attribs)
])

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor2), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:02<00:10,  2.71s/it]

Finished fold 1, score: 0.7258264513223887
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:05<00:08,  2.67s/it]

Finished fold 2, score: 0.7259528895573109
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:08<00:05,  2.66s/it]

Finished fold 3, score: 0.7253561898170638
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:10<00:02,  2.67s/it]

Finished fold 4, score: 0.7263898778429193
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [00:13<00:00,  2.67s/it]

Finished fold 5, score: 0.7261924567392688
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7259





In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', GradientBoostingClassifier())])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [07:56<31:44, 476.01s/it]

Finished fold 1, score: 0.7317766996841897
Starting fold 2


Cross-validation:  40%|████      | 2/5 [16:10<24:21, 487.03s/it]

Finished fold 2, score: 0.7318833482423664
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [24:27<16:22, 491.34s/it]

Finished fold 3, score: 0.732018291315978
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [32:28<08:07, 487.46s/it]

Finished fold 4, score: 0.7318544701078676
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [40:29<00:00, 485.97s/it]

Finished fold 5, score: 0.7312145773660796





L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7317


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [07:14<28:56, 434.22s/it]

Finished fold 1, score: 0.7282638267679175
Starting fold 2


Cross-validation:  40%|████      | 2/5 [14:39<22:01, 440.49s/it]

Finished fold 2, score: 0.7279395280910126
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [22:47<15:25, 462.54s/it]

Finished fold 3, score: 0.7281484722049916
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [28:21<06:51, 411.61s/it]

Finished fold 4, score: 0.7285135335138382
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [35:01<00:00, 420.31s/it]

Finished fold 5, score: 0.7278126986061656





L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7281


In [14]:
import pickle
# Save the model
with open('/content/gdrive/My Drive/result_model2.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
val_pred = pipeline_result.predict(val_set)
val_accuracy = accuracy_score(target_val_set, val_pred)
print(f"Validation set accuracy: {val_accuracy:.4f}")

Validation set accuracy: 0.7313


In [None]:
from tqdm.auto import tqdm
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

def cross_val_score_with_progress2(pipeline, X, y, cv=2, scoring='accuracy'):
    kf = KFold(n_splits=cv)
    scores = []

    pbar = tqdm(total=cv, desc='Cross-validation') # Initialize progress bar

    for i, (train_index, val_index) in enumerate(kf.split(X)):
        print(f"Starting fold {i+1}")
        train_X, val_X = X.iloc[train_index], X.iloc[val_index]
        train_y, val_y = y.iloc[train_index], y.iloc[val_index]

        pipeline.fit(train_X, train_y)

        if scoring == 'accuracy':
            pred_y = pipeline.predict(val_X)
            score = accuracy_score(val_y, pred_y)
        else:
            # Other scoring methods can be added here
            pass

        print(f"Finished fold {i+1}, score: {score}")
        scores.append(score)

        pbar.update(1)  # Update progress bar

    pbar.close()  # Close progress bar
    return np.array(scores)

In [None]:
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result2 = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress2(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/2 [00:00<?, ?it/s]

Starting fold 1
Finished fold 1, score: 0.7315937567471567
Starting fold 2
Finished fold 2, score: 0.7316118057060971
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7316


In [None]:
len(result_df2)

2871579

In [None]:
import pickle
# Save the model
with open('/content/gdrive/My Drive/result_model.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
from joblib import dump

# Save the model
dump(pipeline_result, 'result_model2.joblib')

['result_model2.joblib']

In [None]:
from google.colab import files

# Download the file to your local machine
files.download('result_model2.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:28<01:52, 28.23s/it]

Finished fold 1, score: 0.7284357703617127
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:55<01:23, 27.92s/it]

Finished fold 2, score: 0.7281615312121152
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [01:23<00:55, 27.63s/it]

Finished fold 3, score: 0.7284401233640873
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [01:50<00:27, 27.64s/it]

Finished fold 4, score: 0.7287072425411142
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [02:18<00:00, 27.61s/it]

Finished fold 5, score: 0.7280455847400816





L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7284


In [None]:
#9th Aug
from sklearn.svm import LinearSVC

# Split the dataset for fault type prediction
train_set, val_set, target_train_set, target_val_set = train_test_split(result_df2.drop('testResult', axis=1), result_df2['testResult'], test_size=0.2, random_state=43)

# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=0.1))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:   0%|          | 0/5 [13:49<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=1.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:08<00:32,  8.04s/it]

Finished fold 1, score: 0.7279655783208573
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:15<00:22,  7.42s/it]

Finished fold 2, score: 0.725359520593964
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:23<00:15,  7.73s/it]

Finished fold 3, score: 0.7247758722488785
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:29<00:07,  7.22s/it]

Finished fold 4, score: 0.7270901058032291
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [00:37<00:00,  7.55s/it]

Finished fold 5, score: 0.7255680430002444
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7262





In [None]:
# Fit and score the pipeline on the fault type prediction problem
pipeline_result = Pipeline([('preprocessor', preprocessor), ('classifier', LinearSVC(penalty='l2', dual=False, C=10.0))])
# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:  20%|██        | 1/5 [00:08<00:33,  8.34s/it]

Finished fold 1, score: 0.7277974118816937
Starting fold 2


Cross-validation:  40%|████      | 2/5 [00:16<00:24,  8.06s/it]

Finished fold 2, score: 0.7264200384320787
Starting fold 3


Cross-validation:  60%|██████    | 3/5 [00:24<00:16,  8.14s/it]

Finished fold 3, score: 0.7269355586462679
Starting fold 4


Cross-validation:  80%|████████  | 4/5 [00:32<00:08,  8.25s/it]

Finished fold 4, score: 0.7276208905283102
Starting fold 5


Cross-validation: 100%|██████████| 5/5 [00:40<00:00,  8.04s/it]

Finished fold 5, score: 0.726351010864526
L1 regularisation for fault type prediction: Mean cross-validation accuracy = 0.7270





In [None]:
# Fit and score the pipeline on the fault type prediction problem
pipeline_result_2 = Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(penalty='l2', solver='saga', max_iter=2000))])

# Cross-validation
cv_scores_type = cross_val_score_with_progress(pipeline_result_2, train_set, target_train_set.squeeze())
mean_score_type = np.mean(cv_scores_type)

print(f'L1 regularisation for fault type prediction: Mean cross-validation accuracy = {mean_score_type:.4f}')

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

Starting fold 1


Cross-validation:   0%|          | 0/5 [01:49<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
result_df2['odometerValue'] = pd.to_numeric(result_df2['odometerValue'])

In [None]:
# Create a boolean column where True indicates a 'FAILED' status
result_df2['is_failed'] = result_df2['testResult'] == 'FAILED'

# Convert the 'odometerValue' to absolute values
result_df2['odometerValue'] = result_df2['odometerValue'].abs()

# Sort by 'make', 'model', 'fuelType', 'engineSize', 'odometerValue', 'vehicle_age' and 'is_failed' so that 'FAILED' comes first
result_df2 = result_df2.sort_values(by=['make', 'model', 'fuelType', 'engineSize', 'odometerValue', 'vehicle_age', 'is_failed'], ascending=[True, True, True, True, True, True, False])

# Use groupby to mark the duplicates according to your criteria (we consider rows as duplicates if their odometerValue or vehicle_age are close enough)
result_df2['is_duplicate'] = result_df2.groupby(['make', 'model', 'fuelType', 'engineSize', 'vehicle_age'], group_keys=False)\
                        .apply(lambda group: group['odometerValue'].diff().abs().fillna(0) <= 3) & \
                      result_df2.groupby(['make', 'model', 'fuelType', 'engineSize', 'odometerValue'], group_keys=False)\
                        .apply(lambda group: group['vehicle_age'].diff().abs().fillna(0) <= 0.1)

# Drop the duplicate rows (keep only the rows marked as non-duplicates or rows with 'FAILED' status)
result_df2 = result_df2[~result_df2['is_duplicate'] | result_df2['is_failed']]

# Drop the temporary columns
result_df2 = result_df2.drop(columns=['is_failed', 'is_duplicate'])

KeyboardInterrupt: ignored

In [None]:
result_df2.head(10)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
1079705,FAILED,78344,524 WPL,VAUXHALL,Petrol,1507,44.386301
460696,FAILED,16837,A.C,ACE,Petrol,2553,43.50137
1201532,FAILED,34512,A7,BSA,Petrol,650,48.663014
923844,FAILED,3605,ABARTH,500,Petrol,1368,3.0
601691,PASSED,11286,ABARTH,500,Petrol,1368,2.969863
918988,PASSED,16586,ABARTH,500,Petrol,1368,3.0
851108,FAILED,18153,ABARTH,500,Petrol,1368,5.326027
892634,FAILED,21298,ABARTH,500,Petrol,1368,2.99726
924099,FAILED,21307,ABARTH,500,Petrol,1368,2.989041
1043679,FAILED,23678,ABARTH,500,Petrol,1368,3.065753


In [None]:
# If you want to restore the original order
result_df2 = result_df2.sort_index()

In [None]:
result_df.head(20)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
0,0,85979,CITROEN,C15,Diesel,1769,9.109589
1,1,85978,CITROEN,C15,Diesel,1769,9.109589
2,0,72396,CITROEN,C15,Diesel,1769,8.123288
3,1,72396,CITROEN,C15,Diesel,1769,8.123288
4,0,60546,CITROEN,C15,Diesel,1769,7.142466
5,0,45228,CITROEN,C15,Diesel,1769,5.956164
6,0,33248,CITROEN,C15,Diesel,1769,4.961644
7,1,33248,CITROEN,C15,Diesel,1769,4.958904
8,0,93816,TOYOTA,AURIS,Diesel,1998,7.136986
9,0,57687,TOYOTA,AURIS,Diesel,1998,5.882192


In [None]:
result_df2.head(20)

Unnamed: 0,testResult,odometerValue,make,model,fuelType,engineSize,vehicle_age
1,FAILED,85978,CITROEN,C15,Diesel,1769,9.109589
3,FAILED,72396,CITROEN,C15,Diesel,1769,8.123288
7,FAILED,33248,CITROEN,C15,Diesel,1769,4.958904
10,FAILED,92299,TOYOTA,AURIS,Diesel,1998,5.882192
11,FAILED,56937,TOYOTA,AURIS,Diesel,1998,5.208219
12,PASSED,75424,ROLLS ROYCE,SILVER SHADOW,Petrol,6750,39.558904
14,FAILED,71630,ROLLS ROYCE,SILVER SHADOW,Petrol,6750,38.641096
15,PASSED,71480,ROLLS ROYCE,SILVER SHADOW,Petrol,6750,37.671233
16,FAILED,71480,ROLLS ROYCE,SILVER SHADOW,Petrol,6750,37.643836
17,FAILED,71445,ROLLS ROYCE,SILVER SHADOW,Petrol,6750,37.50137


In [None]:
result_df2['testResult'] = result_df2['testResult'].map(result_mapping)
result_df2.head(10)