In [3]:
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Aligning Test Dataset Columns with Train Data

In [5]:
# Restore the test frame from its joblib file (presumably the output of an
# earlier encoding step -- confirm against the notebook that wrote it).
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
test_data = joblib.load('Encoded_Test_Data.joblib')

In [15]:
# Restore the train frame from its joblib file (presumably the output of an
# earlier encoding step -- confirm against the notebook that wrote it).
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
train_data = joblib.load('Encoded_Train_Data.joblib')

In [9]:
# (rows, columns) of the test frame as loaded, before any column alignment.
test_data.shape

(34494, 78)

In [17]:
# (rows, columns) of the train frame as loaded; compare with the test frame.
train_data.shape

(34600, 80)

In [19]:
# Train shape first, then test shape, one per line, to expose the column gap.
print(train_data.shape, test_data.shape, sep="\n")

(34600, 80)
(34494, 78)


 **The test dataset's columns are aligned to match the train dataset's columns; columns missing from the test data are added and filled with 0.**

In [None]:
# Conform the test frame to the train column layout: columns that exist only
# in train are added to test, columns that exist only in test are dropped,
# and the column order follows train. train_data is deliberately left alone;
# a left-join align on the column axis would only re-bind it to an identical
# copy.
# Every NaN in the result -- in the newly added columns and any NaN that was
# already present in test -- is replaced with 0. A new frame is returned
# instead of mutating in place, so the cell can be re-run safely.
test_data = test_data.reindex(columns=train_data.columns).fillna(0)

In [23]:
# (rows, columns) of the test frame after column alignment with train.
test_data.shape

(34494, 80)

In [25]:
# Replace the test frame's index with a fresh 0..n-1 range; drop=True discards
# the old index instead of keeping it as a column.
test_data = test_data.reset_index(drop=True)

In [27]:
# Re-check both shapes (train, then test) now that the columns are aligned.
for frame in (train_data, test_data):
    print(frame.shape)

(34600, 80)
(34494, 80)


## Target Encoding

In [30]:
# Distinct IncidentGrade labels in each split: `a` for test, `b` for train.
# The names are kept because a later cell displays `b`.
a = pd.unique(test_data['IncidentGrade'])
b = pd.unique(train_data['IncidentGrade'])
a

array(['FalsePositive', 'BenignPositive', 'TruePositive'], dtype=object)

In [32]:
# Distinct IncidentGrade labels found in the train frame.
b

array(['BenignPositive', 'TruePositive', 'FalsePositive'], dtype=object)

## Unified Label Encoding: Combining, Fitting, and Transforming Labels in the Target Data

In [34]:
# Gather every IncidentGrade value seen in either split into one vocabulary,
# so train and test are encoded with the same label -> integer mapping.
# NOTE(review): because test labels take part in fitting, a class that occurs
# only in test would be encoded without any error -- confirm this is intended.
all_labels = list(set(train_data['IncidentGrade']) | set(test_data['IncidentGrade']))

# Fit a single encoder on the shared vocabulary.
le = LabelEncoder().fit(all_labels)

# Overwrite the string labels with their integer codes, train first, then test.
for frame in (train_data, test_data):
    frame['IncidentGrade'] = le.transform(frame['IncidentGrade'])

# Report the codes present in each split and the label -> code mapping.
print("Train encoded labels:", train_data['IncidentGrade'].unique())
print("Test encoded labels:", test_data['IncidentGrade'].unique())
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


Train encoded labels: [0 2 1]
Test encoded labels: [1 0 2]
Label mapping: {'BenignPositive': 0, 'FalsePositive': 1, 'TruePositive': 2}


### Saving the Train & Test data using Joblib

In [36]:
# Persist the aligned, target-encoded test frame; joblib.dump returns the list
# of file names it wrote, which the notebook displays as the cell output.
joblib.dump(test_data, 'Encoded_Test_Data_F.joblib')

['Encoded_Test_Data_F.joblib']

In [38]:
# Persist the target-encoded train frame; joblib.dump returns the list of file
# names it wrote, which the notebook displays as the cell output.
joblib.dump(train_data, 'Encoded_Train_Data_F.joblib')

['Encoded_Train_Data_F.joblib']