# TO-DO: 
* Salary shouldn't use One Hot Encoding it should pre processed via a bianry map in the same manner as sex since salary is a binary label (only has 2 values)

# SVM Implementation

In [5]:
# !pip install matplotlib

In [1]:
#Importing packages
import kagglehub
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import ast
import pickle
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Download the dataset
path = kagglehub.dataset_download("ivanhrek/uci-adult")

Downloading from https://www.kaggle.com/api/v1/datasets/download/ivanhrek/uci-adult?dataset_version_number=1...


100%|██████████| 434k/434k [00:00<00:00, 830kB/s]

Extracting files...





In [6]:
#Retrieving the path to the dataset
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        path = os.path.join(dirname, filename)

In [179]:
#Displaying the first 5 rows of the dataset
df = pd.read_csv(path)
df.head(5)
df.size

488415

# Data Cleaning

Remove Duplicate Rows

In [32]:
#Removing duplicate rows
print("Dataset Size before Deduplication: ", df.size)
deDupDF = df.drop_duplicates(inplace=False)
print("Dataset Size after Deduplication: ", deDupDF.size)

Dataset Size before Deduplication:  488415
Dataset Size after Deduplication:  488055


Remove Rows containing Null Values 

In [33]:
print("Dataset Size after Null Removal: ", deDupDF.size)
# Replace all '?' with NaN
noNullDF = deDupDF.replace('?', np.nan, inplace=False)

# Drop rows with any NaN values
noNullDF = noNullDF.dropna(inplace=False)

# Optionally, reset index after dropping rows
noNullDF = noNullDF.reset_index(drop=True, inplace=False)
print("Dataset Size after Null Removal: ", noNullDF.size)

Dataset Size after Null Removal:  488055
Dataset Size after Null Removal:  452085


Removing native-country column due to severe skewness

In [35]:
print("Dataset Size before native-country Removal: ", noNullDF.size)
noCountryDF = noNullDF[noNullDF["native-country"] == "United-States"]

noCountryDF = noCountryDF.drop(columns=["native-country"], inplace=False)
print("Dataset Size after native-country Removal: ", noCountryDF.size)

Dataset Size before native-country Removal:  452085
Dataset Size after native-country Removal:  384818


Removing fnlwgt & education column

* fnlwgt - This is a redundent column thus it was removed

* education - This is not needed as its already ordinally encoded in the education-num column (**EDUCATION might need to be encoded instead of jsut dropped mainly to be able to process the input data correctly**)

In [45]:
print("Original Columns: ", noCountryDF.columns)
noExtraColumnsDF = noCountryDF.drop(columns=["fnlwgt", "education"], inplace=False)
print("New Columns: ", noExtraColumnsDF.columns)

Original Columns:  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'salary'],
      dtype='object')
New Columns:  Index(['age', 'workclass', 'education-num', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary'],
      dtype='object')


One-Hot-Encoding column

* This was used as the workclass, marital-status, occupation, relationship, race, sex & salary are all non-oridnal categorical classes.

* drop_first drops one of the columns created from OHE to avoid the **dummy variable trap**, which ocurs when you have **multicollinearity**: When one of the new columns can be perfectly predicted by the others (i.e., if you know the values of n-1 columns, you can deduce the nth column). This can distort the model’s interpretation and lead to redundant information.

In [58]:
# Applying one-hot encoding to multiple categorical columns with drop_first to avoid the dummy variable trap
oHEDF = pd.get_dummies(noExtraColumnsDF, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'salary'], drop_first=True)

print("Original Columns: ", noExtraColumnsDF.columns)

print("New Columns: ", oHEDF.columns)
oHEDF.head(5)


Original Columns:  Index(['age', 'workclass', 'education-num', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'salary'],
      dtype='object')
New Columns:  Index(['age', 'education-num', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_Local-gov', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay',
       'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Pro

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,salary_>50K
0,39,13,Male,2174,0,40,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
1,50,13,Male,0,0,13,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2,38,9,Male,0,0,40,False,True,False,False,...,True,False,False,False,False,False,False,False,True,False
3,53,7,Male,0,0,40,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
5,37,14,Female,0,0,40,False,True,False,False,...,False,False,False,False,True,False,False,False,True,False


Sex column was cahnged to binary value

* This replicates the same process as use OHE with drop_first=True

In [59]:
# Convert 'sex' column to binary: Male -> 1, Female -> 0
binarySexDF = oHEDF.copy()
binarySexDF['sex'] = binarySexDF['sex'].map({'Male': 1, 'Female': 0})

# Check the first few rows to confirm the conversion
binarySexDF.head(5)


Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,salary_>50K
0,39,13,1,2174,0,40,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
1,50,13,1,0,0,13,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2,38,9,1,0,0,40,False,True,False,False,...,True,False,False,False,False,False,False,False,True,False
3,53,7,1,0,0,40,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
5,37,14,0,0,0,40,False,True,False,False,...,False,False,False,False,True,False,False,False,True,False


In [60]:
binarySexDF = binarySexDF.astype(int)
binarySexDF.head(5)

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,salary_>50K
0,39,13,1,2174,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,50,13,1,0,0,13,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,38,9,1,0,0,40,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,53,7,1,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,37,14,0,0,0,40,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


# Applying log1p to highly skewed columns to reduce skewness

* OHE columns can be ignored, as skewness is relevant only to traditional continuous values

* Skewness > 1 or < -1 is considered as highly skewed

* log1p is used as it handles 0 values safely

In [70]:
# Getting only continuous columns
continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
continuousDF = binarySexDF[continuous_columns]

# Calculate skewness for continuous columns
skewness = continuousDF.skew()

# Checking skewness of the data
print(skewness)

# Filter columns where skewness > 1 or skewness < -1
high_skew_columns = skewness[skewness > 1].index.tolist() + skewness[skewness < -1].index.tolist()

print("\nColumns with skewness > 1 or < -1:", high_skew_columns)

reducedSkewDF = binarySexDF.copy()
for column in high_skew_columns:
    reducedSkewDF[column] = np.log1p(reducedSkewDF[column])

reducedSkewDF.head(5)


age                0.516800
education-num     -0.076786
capital-gain      11.741257
capital-loss       4.474207
hours-per-week     0.319131
dtype: float64

Columns with skewness > 1 or < -1: ['capital-gain', 'capital-loss']


Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,salary_>50K
0,39,13,1,7.684784,0.0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,50,13,1,0.0,0.0,13,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,38,9,1,0.0,0.0,40,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,53,7,1,0.0,0.0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,37,14,0,0.0,0.0,40,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


Z-Score Normalisation

* This is used to rescales the data to have a **mean of 0** and a **standard deviation of 1**. 

* Useful for models sensitive to feature scales, like SVMs, logistic regression, and k-means clustering.

In [73]:
# Initialize the scaler
scaler = StandardScaler()

# List of continuous columns to standardize
continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Apply standardization
zNormalisedDF = reducedSkewDF.copy()

zNormalisedDF[continuous_columns] = scaler.fit_transform(zNormalisedDF[continuous_columns])

zNormalisedDF.head(5)

scaler_name = 'z-score_scaler_svm_Salary_Classification'

# Save the scaler using pickle to be used for unseen data
with open('models/'+scaler_name+'.pkl', 'wb') as f:
    pickle.dump(scaler, f)


Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,salary_>50K
0,0.037475,1.167453,1,2.7757,-0.225273,-0.081047,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0.872027,1.167453,1,-0.304276,-0.225273,-2.323808,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,-0.038394,-0.513895,1,-0.304276,-0.225273,-0.081047,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1.099633,-1.354569,1,-0.304276,-0.225273,-0.081047,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,-0.114262,1.58779,0,-0.304276,-0.225273,-0.081047,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


# SVM Setup & Training

Split the dataset into features (X) and label (y)

In [85]:
label = 'salary_>50K'

x = zNormalisedDF.drop(columns=[label]) 
y = zNormalisedDF['salary_>50K']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

svm_model = SVC(kernel='linear')  
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))  # Confusion matrix

Accuracy: 0.8457620953073846
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.92      0.90      4135
           1       0.72      0.62      0.67      1363

    accuracy                           0.85      5498
   macro avg       0.80      0.77      0.78      5498
weighted avg       0.84      0.85      0.84      5498

Confusion Matrix:
 [[3807  328]
 [ 520  843]]


# HyperParameter Tuning - One vs Many Approach

In [87]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Split the data into training and validation sets
X = zNormalisedDF.drop(columns=['salary_>50K']) 
y = zNormalisedDF['salary_>50K'] 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient for 'rbf', 'poly' kernels
    'degree': [3, 4, 5]  # Degree of the polynomial kernel (only used for 'poly')
}

# Step 3: Initialize GridSearchCV
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Step 4: Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Step 5: Get the best parameters and best model
print("Best Hyperparameters:", grid_search.best_params_)

# Step 6: Evaluate the best model on the validation set
best_model = grid_search.best_estimator_

# Predict on validation set
y_pred = best_model.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))  # Accuracy score
print("Classification Report:\n", classification_report(y_val, y_pred))  # Precision, recall, F1-score
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))  # Confusion matrix


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Hyperparameters: {'C': 100, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy: 0.8552200800291014
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.91      4135
           1       0.75      0.62      0.68      1363

    accuracy                           0.86      5498
   macro avg       0.82      0.78      0.79      5498
weighted avg       0.85      0.86      0.85      5498

Confusion Matrix:
 [[3854  281]
 [ 515  848]]


# HyperParameter Tuning - One vs One Approach

In [90]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Split the data into training and validation sets
X = zNormalisedDF.drop(columns=['salary_>50K']) 
y = zNormalisedDF['salary_>50K'] 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Define the parameter grid
param_grid = {
    'estimator__C': [0.1, 1, 10, 100],  # Regularization parameter
    'estimator__kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'estimator__gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient for 'rbf', 'poly' kernels
    'estimator__degree': [3, 4, 5]  # Degree of the polynomial kernel (only used for 'poly')
}

# Step 3: Initialize the OneVsOneClassifier with SVC as the base estimator
ovo_classifier = OneVsOneClassifier(SVC())

# Step 4: Initialize GridSearchCV with OneVsOneClassifier
grid_search = GridSearchCV(estimator=ovo_classifier, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Step 5: Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Step 6: Get the best parameters and best model
print("Best Hyperparameters:", grid_search.best_params_)

# Step 7: Evaluate the best model on the validation set
best_model = grid_search.best_estimator_

# Predict on validation set
y_pred = best_model.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))  # Accuracy score
print("Classification Report:\n", classification_report(y_val, y_pred))  # Precision, recall, F1-score
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))  # Confusion matrix


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Hyperparameters: {'estimator__C': 100, 'estimator__degree': 3, 'estimator__gamma': 0.01, 'estimator__kernel': 'rbf'}
Accuracy: 0.8552200800291014
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.91      4135
           1       0.75      0.62      0.68      1363

    accuracy                           0.86      5498
   macro avg       0.82      0.78      0.79      5498
weighted avg       0.85      0.86      0.85      5498

Confusion Matrix:
 [[3854  281]
 [ 515  848]]


# Saving Best Model

In [None]:
model_name = 'best_svm_OvO_Salary_Classification'

# Save the best model using pickle
with open('models/'+model_name+'.pkl', 'wb') as f:
    pickle.dump(best_model, f)


# Process Unseen Data

**NOTE: This might need to be changed if more pre-prcessing is done, especially if certain features such as race are combined into one and so on**

Defining the columns the user needs to enter and any validation checks required

In [2]:
# This create a dictionary mapping the 'education' column to 'education-num' column using the noCountryDF
# education_mapping = noCountryDF[['education', 'education-num']].drop_duplicates().sort_values(by='education-num')
# education_dict = dict(zip(education_mapping['education'], education_mapping['education-num']))

# This create a dictionary mapping the 'education' column to 'education-num' column using the specified string 
# (string was derived from the noCountryDF however its done manually here to avoid having to run all the prior code)
education_mapping = "{'Preschool': 1, '1st-4th': 2, '5th-6th': 3, '7th-8th': 4, '9th': 5, '10th': 6, '11th': 7, '12th': 8, 'HS-grad': 9, 'Some-college': 10, 'Assoc-voc': 11, 'Assoc-acdm': 12, 'Bachelors': 13, 'Masters': 14, 'Prof-school': 15, 'Doctorate': 16}"
education_dict = ast.literal_eval(education_mapping)

In [3]:
# Define validation functions for each column
def validate_age(value):
    # Check if the value is an integer and between 0 and 100
    return isinstance(value, int) and 0 <= value <= 100

def validate_workclass(value):
    valid_workclasses = ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'Self-emp-inc', 'Without-pay'] # 'Never-worked' didnt show up for me
    return value in valid_workclasses

def validate_education(value):
    valid_education = ['Bachelors', 'HS-grad', '11th', 'Masters', 'Some-college', 'Assoc-acdm', 'Doctorate', '9th', 'Assoc-voc', '7th-8th', 'Prof-school', '10th', 'Preschool', '5th-6th', '12th', '1st-4th']
    return value in valid_education

def validate_marital_status(value):
    valid_statuses = ['Never-married', 'Married-civ-spouse', 'Divorced', 'Separated', 'Married-AF-spouse', 'Widowed', 'Married-spouse-absent']
    return value in valid_statuses

def validate_occupation(value):
    valid_occupations = ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners','Prof-specialty', 'Sales', 'Farming-fishing', 'Machine-op-inspct','Other-service', 'Transport-moving', 'Tech-support','Craft-repair', 'Protective-serv', 'Armed-Forces','Priv-house-serv']
    return value in valid_occupations

def validate_relationship(value):
    valid_relationships = ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']
    return value in valid_relationships

def validate_race(value):
    valid_races = ['White', 'Black', 'Other', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo']
    return value in valid_races

def validate_sex(value):
    return value in ['Male', 'Female']

def validate_capital_gain(value):
    return  isinstance(value, int) and 0 <= value <= 99999  # Capital gain range

def validate_capital_loss(value):
    return  isinstance(value, int) and 0 <= value <= 4356  # Capital loss range

def validate_hours_per_week(value):
    return  isinstance(value, int) and 20 <= value <= 60  # Hours per week range

def validate_salary(value):
    return value in ['<=50K', '>50K']


# Function to check if a row is valid
def validate_row(row, validation_functions):
    for col, value in row.items():
        if not validation_functions[col](value):
            return False  # Invalid row if any column doesn't meet the criteria
    return True

# Function to append data if valid
def append_data(df, new_data, validation_functions):
    # Check if new data is valid
    if validate_row(new_data, validation_functions):
        display(df)

        new_row_df = pd.DataFrame([new_data])

        # If valid, append to the dataframe
        df = pd.concat([df, new_row_df], ignore_index=True)
        print("Data appended successfully!")
    else:
        print("Invalid data. Data was not appended.")
    return df

Adding new unseen data

In [4]:
# Define the columns and data types as per the required format
columns = ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'salary'] # Features
label = 'salary' # Label to predict

# Remove the label from the feature list and create an empty DataFrame with the specified columns
feature_list = [col for col in columns if col != label]
df = pd.DataFrame(columns=feature_list)

# Combine all validation functions in a dictionary
validation_functions = {
    'age': validate_age,
    'workclass': validate_workclass,
    'education': validate_education,
    'marital-status': validate_marital_status,
    'occupation': validate_occupation,
    'relationship': validate_relationship,
    'race': validate_race,
    'sex': validate_sex,
    'capital-gain': validate_capital_gain,
    'capital-loss': validate_capital_loss,
    'hours-per-week': validate_hours_per_week,
    'salary': validate_salary
}
# Remove the validation function for the label from the dictionary
validation_functions.pop(label, None)

# Example from the training data where salary <=50K
# new_data = {
#     'age': 30,
#     'workclass': 'State-gov',
#     'education': 'Bachelors',
#     'marital-status': 'Never-married',
#     'occupation': 'Adm-clerical',
#     'relationship': 'Not-in-family',
#     'race': 'White',
#     'sex': 'Male',
#     'capital-gain': 2174,
#     'capital-loss': 0,
#     'hours-per-week': 40,
# }

# Example from the training data where salary >50K
new_data = {
    'age': 31,
    'workclass': 'Private',
    'education': 'Masters',
    'marital-status': 'Never-married',
    'occupation': 'Prof-specialty',
    'relationship': 'Not-in-family',
    'race': 'White',
    'sex': 'Female',
    'capital-gain': 14084,
    'capital-loss': 0,
    'hours-per-week': 50,
}

# Try appending the data
df = append_data(df, new_data, validation_functions)

# Mapping the edcuational values to their corresponding numerical values
if label != 'education':
    # Replace the 'education' column with its corresponding 'education-num' values
    df['education'] = df['education'].map(education_dict)
    df = df.rename(columns={'education': 'education-num'})

# Display the resulting DataFrame
display(df)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week


Data appended successfully!


Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week
0,31,Private,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50


Process the data prior to making prediction

In [5]:
# List of the columns present in dataframe used to train the model
columns = ['age', 'education-num', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'workclass_Local-gov', 'workclass_Private',
           'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
           'workclass_State-gov', 'workclass_Without-pay',
           'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
           'marital-status_Married-spouse-absent', 'marital-status_Never-married',
           'marital-status_Separated', 'marital-status_Widowed',
           'occupation_Armed-Forces', 'occupation_Craft-repair',
           'occupation_Exec-managerial', 'occupation_Farming-fishing',
           'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
           'occupation_Other-service', 'occupation_Priv-house-serv',
           'occupation_Prof-specialty', 'occupation_Protective-serv',
           'occupation_Sales', 'occupation_Tech-support',
           'occupation_Transport-moving', 'relationship_Not-in-family',
           'relationship_Other-relative', 'relationship_Own-child',
           'relationship_Unmarried', 'relationship_Wife',
           'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White',
           'salary_>50K']

# Create an empty DataFrame with these columns
formattedDF = pd.DataFrame(columns=columns)

# Drop columns that contain the label in their name (These are not required as this will be what is predicted)
formattedDF = formattedDF.loc[:, ~formattedDF.columns.str.contains(label)]


Mapping the data from the input DF to the formattedDF

In [6]:
# Copying over the continuous columns
formattedDF['age'] = df['age']
formattedDF['education-num'] = df['education-num']
formattedDF['capital-gain'] = df['capital-gain']
formattedDF['capital-loss'] = df['capital-loss']
formattedDF['hours-per-week'] = df['hours-per-week']
formattedDF['workclass_'+df['workclass']] = 1 
formattedDF['marital-status_'+df['marital-status']] = 1
formattedDF['occupation_'+df['occupation']] = 1
formattedDF['relationship_'+df['relationship']] = 1
formattedDF['race_'+df['race']] = 1
# formattedDF['salary_'+df['salary']] = 1 # This needs to be changed to work like sex when salary is changed according to the the TO-DO list
formattedDF['sex'] = formattedDF['sex'].apply(lambda x: 1 if x == 'Male' else 0)

# Fill remaining columns with 0
formattedDF.fillna(0, inplace=True)

formattedDF = formattedDF.astype(int)

# Keep only the columns present in the training data (removing columns that were dropped during one-hot encoding due to drop_first = True)
formattedDF = formattedDF[formattedDF.columns.intersection(columns)]

display(formattedDF)

  formattedDF.fillna(0, inplace=True)


Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,31,14,0,14084,0,50,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


Applying log1p to the skewed models identified during training

In [7]:
# If the data / model changes these need to be updated
high_skew_columns = ['capital-gain', 'capital-loss']

# Assuming 'high_skew_columns' from training is a list of columns with high skewness
for column in high_skew_columns:
    formattedDF[column] = np.log1p(formattedDF[column])

Applying the training scaler to the unseen data

In [8]:
# Loading the scaler and transform the data
with open('models/z-score_scaler_svm_Salary_Classification.pkl', 'rb') as f:
    scaler = pickle.load(f)

# List of continuous columns to standardize
continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Apply the scaler to the unseen data
formattedDF[continuous_columns] = scaler.transform(formattedDF[continuous_columns])

# Make prediction on new data using Best Model

In [9]:
# Load the saved model using pickle
with open('models/best_svm_OvM_Salary_Classification.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Make predictions with the loaded model
prediction = loaded_model.predict(formattedDF)

salary_result = '<=50K' if prediction[0] == 0 else '>50K'

print("Predicted Salary Class:", salary_result)

Predicted Salary Class: >50K
