# Data Preprocessing Pipeline

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Modelling
from sklearn.metrics import precision_score,f1_score,recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import warnings


## Importing the CSV file

In [3]:
df = pd.read_csv(r'C:\Users\NATHAN\F1FINAL\Data\processed dataset\f1.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
df.head()

Unnamed: 0,Year,Race Name,Driver ID,Constructor Name,Grid Position,Final Position,Fastest Lap Time,Points,Status,Number of Laps,Lap,Duration,Weather,Temperature (°C),Humidity (%),Wind Speed (m/s),Rain (mm),Pit Stops
0,2024,Au,le,Fe,4.0,2.0,79.813,19.0,Finished,58.0,34.0,17.308,No rain,17.76,67,0.89,0.0,2
1,2019,Az,al,To,11.0,11.0,105.754,0.0,+1 Lap,50.0,12.0,20.72,Rain,4.97,75,12.35,0.0,1
2,2020,Br,ra,,,,,,DNF,,47.0,44.085,No rain,4.18,97,2.28,0.0,2
3,2022,Hu,sa,Fe,2.0,4.0,,12.0,Finished,70.0,17.0,22.82,No rain,1.26,70,3.09,0.0,1
4,2024,Au,ts,RB,8.0,7.0,81.134,6.0,Finished,58.0,36.0,17.535,No rain,17.76,67,0.89,0.0,2


In [4]:
#Creating the dependent and independent variables
X = df.drop(['Pit Stops'],axis=1)
y = df['Pit Stops']

In [5]:
X.head()

Unnamed: 0,Year,Race Name,Driver ID,Constructor Name,Grid Position,Final Position,Fastest Lap Time,Points,Status,Number of Laps,Lap,Duration,Weather,Temperature (°C),Humidity (%),Wind Speed (m/s),Rain (mm)
0,2024,Au,le,Fe,4.0,2.0,79.813,19.0,Finished,58.0,34.0,17.308,No rain,17.76,67,0.89,0.0
1,2019,Az,al,To,11.0,11.0,105.754,0.0,+1 Lap,50.0,12.0,20.72,Rain,4.97,75,12.35,0.0
2,2020,Br,ra,,,,,,DNF,,47.0,44.085,No rain,4.18,97,2.28,0.0
3,2022,Hu,sa,Fe,2.0,4.0,,12.0,Finished,70.0,17.0,22.82,No rain,1.26,70,3.09,0.0
4,2024,Au,ts,RB,8.0,7.0,81.134,6.0,Finished,58.0,36.0,17.535,No rain,17.76,67,0.89,0.0


In [9]:
#Creating the split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=df['Pit Stops'])

In [10]:
print("Categories in 'Race Name' variable:     ",end=" " )
print(df['Race Name'].unique())

print("Categories in 'Driver ID' variable:     ",end=" " )
print(df['Driver ID'].unique())

print("Categories in 'Constructor Name' variable:     ",end=" " )
print(df['Constructor Name'].unique())

print("Categories in 'Status' variable:     ",end=" " )
print(df['Status'].unique())

print("Categories in 'Weather' variable:     ",end=" " )
print(df['Weather'].unique())


Categories in 'Race Name' variable:      ['Au' 'Az' 'Br' 'Hu' 'It' 'Du' 'Sã' 'Be' 'Po' 'Mi' 'Ru' 'Em' 'La' 'Ab'
 'Sa' 'Ja' 'Fr' 'Ca' '70' 'Mo' 'Sp' 'Ge' 'Ba' 'Me' 'Tu' 'Ch' 'Si' 'Qa'
 'Un' 'St' 'Ei']
Categories in 'Driver ID' variable:      ['le' 'al' 'ra' 'sa' 'ts' 'st' 'kv' 'no' 'ri' 've' 'ma' 'zh' 'ru' 'ha'
 'mi' 'bo' 'ga' 'la' 'oc' 'pi' 'pe' 'co' 'gr' 'gi' 'ke' 'hu' 'ku' 'de'
 'be' 'ai']
Categories in 'Constructor Name' variable:      ['Fe' 'To' nan 'RB' 'As' 'Mc' 'Al' 'Wi' 'Me' 'Sa' 'Re' 'Ha' 'Ra']
Categories in 'Status' variable:      ['Finished' '+1 Lap' 'DNF' '+2 Laps']
Categories in 'Weather' variable:      ['No rain' 'Rain']


In [17]:
y

0        2
1        1
2        2
3        1
4        2
        ..
18157    2
18158    3
18159    2
18160    2
18161    2
Name: Pit Stops, Length: 18162, dtype: int64

In [12]:
#Creating the Preprocessing Pipeline
from sklearn.preprocessing import RobustScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.selection import DropDuplicateFeatures

#Defining numerical and categorical columns
numeric_columns = ["Grid Position", "Final Position", "Fastest Lap Time", "Points", "Number of Laps", 
                   "Temperature (°C)", "Humidity (%)", "Wind Speed (m/s)", "Rain (mm)"]
low_cardinality_cat = ["Status", "Weather"]
high_cardinality_cat = ["Race Name","Driver ID", "Constructor Name"]

#Numerical Pipeline
num_transformer = Pipeline(steps=[
    ("drop_duplicates",DropDuplicateFeatures()),
    ("Imputer",SimpleImputer(strategy='median')),
    ("outlier_removal",Winsorizer(capping_method='gaussian',tail='both',fold=2.0)),
    ("scaler",RobustScaler())
])

#Categorical Pipeline
low_cardinality_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy='most_frequent')),
    ("rare_label",RareLabelEncoder(tol=0.05,replace_with="Other")),
    ("onehot",OneHotEncoder(drop='first'))

])

high_cardinality_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy='most_frequent')),
    ("label_encoding",RareLabelEncoder(tol=0.05,replace_with="Other"))
])

#Combining the transformers
preprocessor = ColumnTransformer(transformers=[
    ("num",num_transformer,numeric_columns),
    ("low_card_cat",low_cardinality_transformer,low_cardinality_cat),
    ("high_card_cat",high_cardinality_transformer,high_cardinality_cat)
],remainder='drop')




In [13]:
#Fit and Transform Data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)



## Model Training and Evaluation

In [18]:
#Creating a function to evaluate models
def evaluate_model(true,predicted):
    precision = precision_score(true,predicted,average='weighted')
    recall = recall_score(true,predicted,average='weighted')
    f1 = f1_score(true,predicted,average='weighted')

    return precision,recall,f1

In [19]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "Support Vector Classifier": SVC()
}

model_list = []
precision_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_transformed,y_train) #Train model

    #Make Predictions
    y_train_pred = model.predict(X_train_transformed)
    y_test_pred = model.predict(X_test_transformed)

    #Evaluate Train and Test set
    model_train_precision,model_train_recall,model_train_f1 = evaluate_model(
        y_train,y_train_pred
    )
    model_test_precision,model_test_recall,model_test_f1 = evaluate_model(
        y_test,y_test_pred
    )

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Weighted Precision Score: {:.4f}".format(model_train_precision))
    print("- Weighted Recall Score: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Weighted Precision Score: {:.4f}".format(model_test_precision))
    print("- Weighted Recall Score: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1))
    precision_list.append(model_test_precision)
    
    print('='*35)
    print('\n')



ValueError: could not convert string to float: 'Other'