<iframe src="https://archive.ics.uci.edu/dataset/19/car+evaluation" width="90%" height="500px"></iframe>

In [165]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [147]:
# define column names
column_names: list[str] = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Load the data from car.data file
df_car_data = pd.read_csv('../MLflow-project/datasets/car_evaluation/car.data',
                          header = None,
                          names = column_names)

df_car_data.head(10)              # df_car_data.head() only gets 1st 5 rows

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [148]:
df_car_data.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [149]:
df_car_data.info()              # gives information about column "datatypes" and if any null columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [150]:
df_car_data.shape           # (1728, 7) --> 1728 rows and 7 columns

(1728, 7)

In [151]:
df_car_data.isnull().sum()              # checks if any of the columns has a value >= "1" which denotes null values

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [152]:
df_car_data.isnull().any()              # determines if any columns has null values

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
class       False
dtype: bool

In [160]:
# class of "acceptable_car"
acceptable_car = df_car_data[df_car_data['doors'] == 'acc']
acceptable_car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
81,vhigh,vhigh,5more,2,small,low,unacc
82,vhigh,vhigh,5more,2,small,med,unacc
83,vhigh,vhigh,5more,2,small,high,unacc
84,vhigh,vhigh,5more,2,med,low,unacc
85,vhigh,vhigh,5more,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [164]:
df_car_data['persons'].value_counts()

persons
2       576
4       576
more    576
Name: count, dtype: int64

In [154]:
# divide data into train (input / features) & test (output) data
from pandas.core.frame import DataFrame, Series

X: DataFrame = df_car_data.drop('class', axis = 1)
y: Series = df_car_data['class']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 42)

print('Features or input variables:\n', X.head(), '\nand shape is ', X.shape)
print('\nTarget or ouput variables:\n', y.head(), '\nand shape is ', y.shape)

Features or input variables:
   buying  maint doors persons lug_boot safety
0  vhigh  vhigh     2       2    small    low
1  vhigh  vhigh     2       2    small    med
2  vhigh  vhigh     2       2    small   high
3  vhigh  vhigh     2       2      med    low
4  vhigh  vhigh     2       2      med    med 
and shape is  (1728, 6)

Target or ouput variables:
 0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: class, dtype: object 
and shape is  (1728,)


- It's generally better to perform "one-hot encoding" and "label encoding" after splitting the data into training and test sets. 

- This approach ensures that the encoding is based only on the training data, which helps prevent data leakage and ensures that the model generalizes well to unseen data.

**Why Encode After Splitting?**

#### Prevent Data Leakage: Encoding before splitting can lead to data leakage, where information from the test set influences the training process. This can result in overly optimistic performance estimates.

- **Data leakage** occurs when information from outside the training dataset is used to create the model.

- Consistent Encoding: By encoding after splitting, you ensure that the encoding is consistent and based only on the training data. This helps the model generalize better to new, unseen data.

#### Correct Approach:

- Split the Data: First, split the data into training and test sets.

- Encode the Training Data: Perform one-hot encoding and label encoding on the training data.

- Align the Test Data: Ensure that the test data has the same encoding as the training data.

In [155]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(1209, 6)
(1209,)
(519, 6)
(519,)


#### Applying **$One-Hot-Encoding$**:

1. Convert categorical variables into dummy / indicator variables

2. One-hot encoding (using pd.get_dummies) converts each category into a separate binary column, ensuring that the model treats each category independently without implying any order

3. Most machine learning algorithms cannot handle categorical variables directly. They require numerical input

4. Hence,converting categorical variables to dummy variables (one-hot encoding) transforms them into a format that algorithms can process.

In [156]:
# # Encode the input variables (features) - learnt on 12-Jan-2025, pandas encoding using "pd.get_dummies" is waste and temporary for just 1 line
# X_train_encoded = pd.get_dummies(X_train)
# X_test_encoded = pd.get_dummies(X_test)

# # Align the columns of the test set with the train set
# X_test_encoded = X_test_encoded.reindex(columns = X_train_encoded.columns, 
#                                         fill_value = 0)

# # Optional: Scale the data for better model performance
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_encoded)
# X_test_scaled = scaler.transform(X_test_encoded)

# print('Scaled X_train:')
# print(X_train_scaled[:5])       # prints a large vectorised array like [[-0.57122289 -0.58124945 -0.58793451  1.7574861  -0.59016327 -0.56453724

# print('\nScaled y_test:')
# print(X_test_scaled[:5])

Scaled X_train:
[[-0.5732115  -0.57066443  1.71035805 -0.58085229 -0.57957881 -0.56811716
   1.71409181 -0.57830536 -0.59231578 -0.57193799 -0.56811716  1.73300642
  -0.69398678  1.37562513 -0.70053705  1.408966   -0.69529532 -0.7163389
   1.39598728 -0.71237722 -0.69267897]
 [ 1.74455675 -0.57066443 -0.5846729  -0.58085229  1.72539088 -0.56811716
  -0.58339932 -0.57830536 -0.59231578  1.7484413  -0.56811716 -0.57703191
  -0.69398678 -0.72694223  1.42747625 -0.70974034 -0.69529532  1.39598728
  -0.7163389   1.40375068 -0.69267897]
 [-0.5732115   1.75234331 -0.5846729  -0.58085229 -0.57957881 -0.56811716
   1.71409181 -0.57830536 -0.59231578  1.7484413  -0.56811716 -0.57703191
  -0.69398678  1.37562513 -0.70053705 -0.70974034  1.43823778 -0.7163389
  -0.7163389  -0.71237722  1.44367021]
 [-0.5732115  -0.57066443  1.71035805 -0.58085229 -0.57957881 -0.56811716
   1.71409181 -0.57830536 -0.59231578 -0.57193799 -0.56811716  1.73300642
   1.44094964 -0.72694223 -0.70053705  1.408966   -0.69

In [157]:
# Encode the target variables
from sklearn.preprocessing import LabelEncoder
lablel_encoder = LabelEncoder()

y_train_encoded = lablel_encoder.fit_transform(y_train)
y_test_encoded = lablel_encoder.transform(y_test)

print('Encoded y_train:')
print(y_train_encoded[:5])              # prints [2 2 2 2 2] --> means class values like "aac", "good" now got converted to nums

print('\nEncoded y_test:')
print(y_test_encoded[:5])

Encoded y_train:
[3 2 0 2 2]

Encoded y_test:
[2 0 2 0 2]


In [159]:
df_car_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
