### **4. Splitting data into Train and Test sets**

In [None]:
#importing train_test_split library
from sklearn.model_selection import train_test_split

In [None]:
# The target (y) is the 'transmission_from_vin' column; every remaining
# column of df_model_features is used as a predictor (X).
y = df_model_features['transmission_from_vin']
X = df_model_features.drop(columns=['transmission_from_vin'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# validating the shape of the train and test sets
# (displayed as a tuple in the order X_train, X_test, y_train, y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# preview the first rows of the training features
X_train.head()

In [None]:
# column dtypes and non-null counts of the training features
X_train.info()

In [None]:
# preview the first rows of the test features
X_test.head()

In [None]:
# column dtypes and non-null counts of the test features
X_test.info()

In [None]:
# preview the first values of the training target
y_train.head()

In [None]:
# preview the first values of the test target
y_test.head()

### **5. Data Pre-processing - Encoding Categorical columns**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
from category_encoders import BinaryEncoder

### **Encoding X_train**
**i) Encoding the 'make' column**

In [None]:
# list the distinct values of the 'make' column in the training set
X_train['make'].unique()

In [None]:
# number of distinct values in the 'make' column of the training set
X_train['make'].nunique()

In [None]:
# Binary-encode 'make': learn the category codes from the training set,
# then apply them to the training set. `be_make` stays available afterwards
# as the fitted encoder.
be_make = BinaryEncoder(cols=['make'])
be_make.fit(X_train)
X_train = be_make.transform(X_train)

In [None]:
# confirming the 'make' column has been encoded (preview of the first rows)
X_train.head()

**ii) Encoding the 'model' column**

In [None]:
# list the distinct values of the 'model' column in the training set
X_train['model'].unique()

In [None]:
# number of distinct values in the 'model' column of the training set
X_train['model'].nunique()

In [None]:
# Binary-encode 'model': learn the category codes from the training set,
# then apply them to the training set. `be_model` stays available afterwards
# as the fitted encoder.
be_model = BinaryEncoder(cols=['model'])
be_model.fit(X_train)
X_train = be_model.transform(X_train)

In [None]:
# preview the training features after encoding 'model'
X_train.head()

In [None]:
# (rows, columns) of the training features after encoding 'model'
X_train.shape

**iii) Encoding the 'stock_type' column**

In [None]:
# list the distinct values of the 'stock_type' column in the training set
X_train['stock_type'].unique()

In [None]:
# Label-encode 'stock_type': learn the label-to-integer mapping from the
# training column, then overwrite the column with the integer codes.
le_number_stock_type = LabelEncoder()
le_number_stock_type.fit(X_train['stock_type'])
X_train['stock_type'] = le_number_stock_type.transform(X_train['stock_type'])


In [None]:
#checking 'exterior_color_category' column has been encoded and the 1631 unique entries have been captured in 11 columns
# NOTE(review): this cell directly follows the 'stock_type' encoding and no
# 'exterior_color_category' encoding step is visible in this part of the
# notebook, so the comment above may be stale — confirm.
X_train.head()

**vii) Encoding the 'dealer_type' column**

In [None]:
# list the distinct values of the 'dealer_type' column in the training set
X_train['dealer_type'].unique()

In [None]:
# Label-encode 'dealer_type': learn the label-to-integer mapping from the
# training column, then overwrite the column with the integer codes.
le_dealer_type = LabelEncoder()
le_dealer_type.fit(X_train['dealer_type'])
X_train['dealer_type'] = le_dealer_type.transform(X_train['dealer_type'])

In [None]:
# preview the training features after encoding 'dealer_type'
X_train.head()

**viii) Encoding the 'fuel_type_from_vin' column**

In [None]:
# List the distinct values of 'fuel_type_from_vin' in the training set.
# (A stray trailing "+-" made this cell a SyntaxError; it has been removed.)
X_train['fuel_type_from_vin'].unique()

In [None]:
# distinct values of 'fuel_type_from_vin' in the test set, for comparison with the training set
X_test['fuel_type_from_vin'].unique()

In [None]:
# One-hot encode 'fuel_type_from_vin' with pandas.get_dummies; the indicator
# columns are stored as integers.
X_train = pd.get_dummies(
    X_train,
    columns=['fuel_type_from_vin'],
    dtype='int',
)

In [None]:
# preview the training features after one-hot encoding 'fuel_type_from_vin'
X_train.head()

In [None]:
# (rows, columns) of the training features after one-hot encoding 'fuel_type_from_vin'
X_train.shape

### **Encoding X_test**

In [None]:
# list the distinct values of the 'make' column in the test set
X_test['make'].unique()

In [None]:
# Encode 'make' in the test set with the BinaryEncoder that was fitted on
# X_train (`be_make`). Fitting a fresh encoder on X_test would assign different
# binary codes to the same make, and could even produce a different number of
# encoded columns, so train and test features would no longer mean the same thing.
# NOTE(review): makes that never appear in X_train are handled according to the
# encoder's `handle_unknown` setting — confirm the default is acceptable.
X_test = be_make.transform(X_test)

In [None]:
# Encode 'model' in the test set with the BinaryEncoder that was fitted on
# X_train (`be_model`). Fitting a fresh encoder on X_test would assign different
# binary codes to the same model, so columns such as model_0 / model_1 would not
# carry the same meaning in train and test.
# NOTE(review): models that never appear in X_train are handled according to the
# encoder's `handle_unknown` setting — confirm the default is acceptable.
X_test = be_model.transform(X_test)

In [None]:
# Encode 'stock_type' in the test set with the LabelEncoder fitted on X_train
# (`le_number_stock_type`) so each label maps to the same integer in both sets.
# Re-fitting on X_test could silently change the mapping. `transform` raises a
# ValueError if the test set contains a label the encoder has never seen, which
# surfaces the mismatch instead of hiding it.
X_test['stock_type'] = le_number_stock_type.transform(X_test['stock_type'])

In [None]:
# preview the test features after encoding
X_test.head()

In [None]:
# compare the (rows, columns) of the train and test feature frames
print(X_train.shape)
print(X_test.shape)

In [None]:
# column dtypes and non-null counts of the test features after encoding
X_test.info()

### **Encoding y_train & y_test**

In [None]:
# One-hot encode the training target. y_train is a Series at this point, so
# get_dummies encodes its values directly; the `columns=` argument only applies
# to DataFrame input and was silently ignored, so it has been dropped. Without
# it, re-running this cell on the already-encoded frame is also harmless,
# because integer columns are left untouched.
# drop_first=True removes the first category's indicator column.
y_train = pd.get_dummies(y_train, dtype='int', drop_first=True)
y_train.head()

In [None]:
# One-hot encode the test target. y_test is a Series at this point, so
# get_dummies encodes its values directly; the former `columns=['y_test']`
# argument named a column that does not exist and was silently ignored for
# Series input, so it has been dropped. Without it, re-running this cell on the
# already-encoded frame is also harmless.
# drop_first=True removes the first category's indicator column.
y_test = pd.get_dummies(y_test, dtype='int', drop_first=True)
y_test.head()

In [None]:
# The remaining indicator column is called 'M'; give it the target's name
# in both the train and the test target frames.
y_train = y_train.rename(columns={'M': 'transmission_from_vin'})
y_test = y_test.rename(columns={'M': 'transmission_from_vin'})

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only added a stray "None" line after each summary.
y_train.info()
y_test.info()

In [None]:
# first rows of the encoded train and test targets
print(y_train.head())
print(y_test.head())

### **6. Handling Imbalanced data columns**

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build a profiling report for the training features; the bare `profile`
# expression on the last line makes the notebook render the report.
profile = ProfileReport(X_train, title="Pandas Profiling Report", explorative=True)
profile

Profiling the X_train set after encoding shows high imbalance in the following columns:

model_0
model_1
certified
fuel_type_from_vin_CNG
fuel_type_from_vin_Diesel
fuel_type_from_vin_Electric
fuel_type_from_vin_Hybrid
fuel_type_from_vin_Hydrogen
fuel_type_from_vin_PHEV

In [None]:
# Print the class frequencies of every column flagged as imbalanced in the
# train set, followed by the frequencies of the target itself.
train_imbalanced_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for column_name in train_imbalanced_columns:
    print(X_train[column_name].value_counts())
print(y_train['transmission_from_vin'].value_counts())


In [None]:
# list the column names currently present in the test features
print(X_test.columns)

In [None]:
# Indicator columns that may be absent from X_test; any that is absent is
# created and filled with 0, columns that already exist are left untouched.
missing_columns = ['fuel_type_from_vin_CNG']

for col in missing_columns:
    if col in X_test.columns:
        continue  # already present, nothing to add
    X_test[col] = 0

In [None]:
# Print the class frequencies of every column flagged as imbalanced in the
# test set, followed by the frequencies of the test target.
test_imbalanced_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for column_name in test_imbalanced_columns:
    print(X_test[column_name].value_counts())
print(y_test['transmission_from_vin'].value_counts())

**Visualization of Imbalanced columns in the Train set**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plots for the imbalanced train columns on a 5x2 grid. The nine feature
# panels are filled in row-major order; the last panel shows the target.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))

train_plot_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for panel, column_name in zip(axes.flat, train_plot_columns):
    sns.countplot(x=column_name, hue=column_name, data=X_train, ax=panel)

# Bottom-right panel: distribution of the training target.
sns.countplot(x='transmission_from_vin', hue='transmission_from_vin', data=y_train, ax=axes[4, 1])

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

**Using SMOTE technique to handle imbalance in Train set**

In [None]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.impute import SimpleImputer

# Balance the TRAINING target with SMOTE.
#
# The previous version passed only a handful of columns to SMOTE and then tried
# to copy the result back into X_train through an index intersection.
# fit_resample returns frames with a fresh 0..n-1 index, so that intersection
# paired positional rows with label-indexed rows: features and target ended up
# misaligned and the synthetic rows were thrown away. Here every feature is
# resampled together with the target, so each output row (original or
# synthetic) is complete and stays paired with its label.
#
# NOTE(review): SMOTE interpolates between neighbours, so synthetic rows can
# hold fractional values in the binary / one-hot columns; consider SMOTENC if
# those columns must stay categorical.

# Positional (0..n-1) index on both inputs keeps features and target aligned.
train_features = X_train.reset_index(drop=True)
train_target = y_train['transmission_from_vin'].reset_index(drop=True)

# SMOTE rejects NaN, so fill gaps with each column's most frequent value first.
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(
    imputer.fit_transform(train_features),
    columns=train_features.columns,
)

# Same settings as before: fixed seed, one nearest neighbour per synthetic sample.
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_resampled_combined, y_resampled = smote.fit_resample(X_imputed, train_target)

# Features and target must describe the same rows.
assert len(X_train_resampled_combined) == len(y_resampled)

# X_train_resampled_combined and y_resampled are ready for further use.

In [None]:
# the resampled train features and target should have the same number of rows
print(X_train_resampled_combined.shape)
print(y_resampled.shape)

In [None]:
# Print the class frequencies of the same columns in the train set after the
# imbalance handling, followed by the frequencies of the resampled target.
resampled_train_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for column_name in resampled_train_columns:
    print(X_train_resampled_combined[column_name].value_counts())
print(y_resampled.value_counts())

In [None]:
# Count plots of the train set after imbalance handling, on a 5x2 grid.
# The nine feature panels are filled in row-major order; the last panel shows
# the resampled target.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))

resampled_plot_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for panel, column_name in zip(axes.flat, resampled_plot_columns):
    sns.countplot(x=column_name, hue=column_name, data=X_train_resampled_combined, ax=panel)

# countplot needs a DataFrame to look the column up by name, so wrap the
# resampled target Series in a one-column frame.
y_resampled_df = y_resampled.to_frame()
sns.countplot(x='transmission_from_vin', hue='transmission_from_vin', data=y_resampled_df, ax=axes[4, 1])

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

**Using SMOTE technique to handle imbalance in Test set**

In [None]:
# Count plots of the imbalanced columns in the test set, on a 5x2 grid.
# The nine feature panels are filled in row-major order; the last panel shows
# the test target.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))

test_plot_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for panel, column_name in zip(axes.flat, test_plot_columns):
    sns.countplot(x=column_name, hue=column_name, data=X_test, ax=panel)

# Bottom-right panel: distribution of the test target.
sns.countplot(x='transmission_from_vin', hue='transmission_from_vin', data=y_test, ax=axes[4, 1])

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# The test set is deliberately NOT resampled.
#
# Oversampling is a training-time technique: the test set has to keep the real
# class distribution, otherwise evaluation metrics describe synthetic data.
# The previous version also (a) ran SMOTE with individual feature columns as
# the "target", (b) concatenated frames whose indices did not match, which
# misaligned rows and introduced NaN, and (c) padded the result with randomly
# sampled rows to force equal lengths. The outcome was a test set whose
# features no longer belonged to their labels.
#
# The variable names X_test_resampled_combined / y1_resampled are kept so the
# cells further down keep working; they now hold the original test rows.

# Columns whose missing values were imputed in the previous version.
selected_columns = ['model_0', 'model_1', 'certified',
                    'fuel_type_from_vin_Diesel', 'fuel_type_from_vin_Electric',
                    'fuel_type_from_vin_Hybrid',
                    'fuel_type_from_vin_PHEV']
X1 = X_test[selected_columns]
y1 = y_test['transmission_from_vin']

# Replace NaN in the selected columns with the most frequent value, as before.
# NOTE(review): the imputer is fitted on the test data itself; ideally reuse an
# imputer fitted on the training set.
imputer = SimpleImputer(strategy='most_frequent')

# reset_index returns a new frame, so X_test itself is not modified.
X_test_resampled_combined = X_test.reset_index(drop=True)
X_test_resampled_combined[selected_columns] = imputer.fit_transform(X1)
y1_resampled = y1.reset_index(drop=True)

# Features and target must describe the same rows.
assert len(X_test_resampled_combined) == len(y1_resampled)

print(f"Shape of X_test_resampled_combined: {X_test_resampled_combined.shape}")
print(f"Shape of y1_resampled: {y1_resampled.shape}")

In [None]:
# Output recorded from an earlier run of the shape check. It was pasted into a
# code cell as plain text, which is a SyntaxError, so it is kept as a comment:
# Shape of X_test_resampled_combined: (54934, 30)
# Shape of y1_resampled: (54934,)

In [None]:
# shapes of the resampled train features and target, for comparison with the test set
print(f"Shape of X_train_resampled_combined: {X_train_resampled_combined.shape}")
print(f"Shape of y_resampled: {y_resampled.shape}")

In [None]:
# Count plots of X_test_resampled_combined on a 5x2 grid. The nine feature
# panels are filled in row-major order; the last panel shows y1_resampled.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))

test_resampled_plot_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for panel, column_name in zip(axes.flat, test_resampled_plot_columns):
    sns.countplot(x=column_name, hue=column_name, data=X_test_resampled_combined, ax=panel)

# countplot needs a DataFrame to look the column up by name, so wrap the
# target Series in a one-column frame.
y1_resampled_df = y1_resampled.to_frame()
sns.countplot(x='transmission_from_vin', hue='transmission_from_vin', data=y1_resampled_df, ax=axes[4, 1])

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

In [None]:
# Count the classes in each column of the TEST set after handling imbalance.
# The final line previously printed y_resampled (the train target); the test
# target that belongs to X_test_resampled_combined is y1_resampled.
test_resampled_columns = [
    'model_0',
    'model_1',
    'certified',
    'fuel_type_from_vin_CNG',
    'fuel_type_from_vin_Diesel',
    'fuel_type_from_vin_Electric',
    'fuel_type_from_vin_Hybrid',
    'fuel_type_from_vin_Hydrogen',
    'fuel_type_from_vin_PHEV',
]
for column_name in test_resampled_columns:
    print(X_test_resampled_combined[column_name].value_counts())
print(y1_resampled.value_counts())

In [None]:
# preview the first rows of the combined test features
X_test_resampled_combined.head()

In [None]:
# preview the first values of the test target
y1_resampled.head()

In [None]:
# shapes in the order: train features, test features, train target, test target
print(X_train_resampled_combined.shape)
print(X_test_resampled_combined.shape)
print(y_resampled.shape)
print(y1_resampled.shape)

## **7. Scaling Test and Train Sets**
### The purpose of scaling is to bring all features (variables) into a common range or distribution. This can improve the performance and convergence speed of machine learning algorithms.

### **Scaling X_train**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling with the default target range of 0 to 1.
# The scaler learns each column's minimum and maximum from the training
# features and stays available as `scaler`.
scaler = MinMaxScaler()
scaler.fit(X_train_resampled_combined)

# Scale the training features and wrap the result in a DataFrame again so the
# column names are kept.
scaled_X_train_resampled_combined = pd.DataFrame(
    scaler.transform(X_train_resampled_combined),
    columns=X_train_resampled_combined.columns,
)

print("Scaled Data:")
scaled_X_train_resampled_combined.head()


### **Scaling X_test**

In [None]:
# Scale the test features with the scaler that was fitted on the TRAINING
# features (`scaler`). Fitting a second scaler on the test data would use
# different minima and maxima, so the same raw value would map to different
# scaled values in train and test. Test values outside the training range can
# therefore fall slightly outside [0, 1]; that is expected.
# `scaler2` is kept as an alias so later references to it still work.
scaler2 = scaler

# The scaler expects the same columns, in the same order, as it saw during fit.
# Selecting by the training column list raises a KeyError if a column is missing.
X_test_aligned = X_test_resampled_combined[X_train_resampled_combined.columns]

# Transform only (no fitting) and wrap the result in a DataFrame again so the
# column names are kept.
scaled_X_test_resampled_combined = pd.DataFrame(
    scaler2.transform(X_test_aligned),
    columns=X_test_aligned.columns,
)

print("Scaled Data:")
scaled_X_test_resampled_combined.head()

In [None]:
# Min-max scale the test target frame (y1_resampled_df) with its own scaler.
scaler3 = MinMaxScaler()
scaler3.fit(y1_resampled_df)
scaled_y1_resampled_df = scaler3.transform(y1_resampled_df)

In [None]:
# Min-max scale the train target frame (y_resampled_df) with its own scaler.
scaler4 = MinMaxScaler()
scaler4.fit(y_resampled_df)
scaled_y_resampled_df = scaler4.transform(y_resampled_df)

In [None]:
# shapes of the scaled train and test feature frames
print(scaled_X_train_resampled_combined.shape)
print(scaled_X_test_resampled_combined.shape)

In [None]:
# shapes of the train and test targets
print(y_resampled.shape) # train
print(y1_resampled.shape) # test