# Classification Model

Dataset: Adult <br>
Obtained from: UCI Repository <br>
Extraction was done by Barry Becker from the 1994 Census database.

#### Parameters
- age: the age of an individual
- workclass: a general term to represent the employment status of an individual
- fnlwgt: final weight. This is the number of people the census believes the entry represents.
- education: the highest level of education achieved by an individual.
- education-num: the highest level of education achieved in numerical form.
- marital-status: marital status of an individual.
- occupation: the general type of occupation of an individual
- relationship: represents what this individual is relative to others.
- race: Descriptions of an individual’s race
- sex: the sex of the individual
- capital-gain: capital gains for an individual
- capital-loss: capital loss for an individual
- hours-per-week: the hours an individual has reported to work per week
- native-country: country of origin for an individual

In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Building pipelines, scaling features and encoding labels
from sklearn.preprocessing import  StandardScaler  , LabelEncoder  
from sklearn.pipeline import Pipeline

# Splitting the data and evaluating the performance of the model
from sklearn.model_selection import train_test_split,   cross_validate

from sklearn.decomposition import PCA
# Models used in this notebook: DecisionTreeClassifier (here) and SVC (imported further down)
from sklearn.tree import DecisionTreeClassifier

# Scoring method used to evaluate the performance of the model
from sklearn.metrics import f1_score 

from statsmodels.stats.outliers_influence import variance_inflation_factor 

from sklearn.svm import SVC

#For Resampling data
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN , SMOTETomek


In [None]:
# Global plot appearance: seaborn "darkgrid" theme with the "paper" plotting context
sns.set_theme(style='darkgrid')
sns.set_context("paper")

# Hide every warning so the notebook output stays readable
import warnings
warnings.simplefilter('ignore')

In [None]:
# Download dataset id=2 (the Adult dataset described above) from the UCI repository
adult = fetch_ucirepo(id=2)

# Work on the full original table as a pandas DataFrame.
# NOTE(review): this is presumably a reference, not a copy, so the in-place
# edits in the following cells would also change `adult.data.original` - confirm
df = adult.data.original

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Show floats with three decimal places in every DataFrame rendered from here on
pd.set_option('display.float_format', '{:.3f}'.format)

# Descriptive statistics of the DataFrame (displayed as the cell output)
df.describe()

## Viewing any irregular values within the data

In [None]:
# Names of all columns stored with the object dtype
object_cols = list(df.select_dtypes(include='object').columns)

# Print the distinct values of each of those columns to spot irregular entries
for col, column_values in df[object_cols].items():
    print(f'Unique values in {col} :\n {column_values.unique()} \n ')

There are a few issues here after observing the values in each feature & label <br>

1. Our label has duplicated values: '<=50K' and '<=50K.' are supposed to be one and the same, as are '>50K' and '>50K.'

2. The following columns contain the value '?', which is better handled as a NULL value:<br> 
    -  workclass 
    - occupation 
    - native-country

#### Correcting label values

In [None]:
# Collapse the four raw income spellings (with and without a trailing '.')
# into a binary target: 1 for '>50K', 0 for '<=50K'.
# Only the 'income' column is touched and the result is cast explicitly:
# turning an object column into integers through DataFrame.replace relies on
# an implicit downcast that pandas has deprecated, and the later cells need
# 'income' to be numeric.
income_codes = {'>50K': 1, '>50K.': 1, '<=50K': 0, '<=50K.': 0}

# A label missing from the mapping becomes NaN, so astype(int) fails loudly
# instead of letting an unexpected value slip through.
df['income'] = df['income'].map(income_codes).astype(int)

In [None]:
# Treat the '?' placeholder as a missing value (NaN) everywhere in the table
df.replace(to_replace='?', value=np.nan, inplace=True)

In [None]:
# Fill the gaps in the three affected columns with a fixed category each
# (stated to be the most frequent value of the respective column)
values = {
    'workclass': 'Private',
    'occupation': 'Prof-specialty',
    'native-country': 'United-States',
}
df.fillna(values, inplace=True)

In [None]:
# Remove 'education' (its numerical form 'education-num' is kept) and 'fnlwgt'
df.drop(columns=['education', 'fnlwgt'], inplace=True)

In [None]:
# Work on an independent copy so that `df` keeps its string categories
copy = df.copy()

# Names of the object-dtype columns of the copy, to be label-encoded next
cat = list(copy.select_dtypes(include='object').columns)

In [None]:
# A single encoder is re-fitted for every column, so after the loop it only
# remembers the classes of the last column it saw
le = LabelEncoder()

# Replace each categorical column of the copy by its integer codes
for column_name in cat:
    encoded = le.fit_transform(copy[column_name])
    copy[column_name] = encoded

In [None]:
# Function to calculate Variance Inflation Factor (VIF)
def vif(dataframe):
    """Return a table with the variance inflation factor of every column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature table; its underlying array is handed to
        ``variance_inflation_factor`` one column index at a time.

    Returns
    -------
    pandas.DataFrame
        Two columns: "feature" (the column names) and "VIF" (the factor
        computed for the column at the same position).
    """
    design_matrix = dataframe.values
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(dataframe.shape[1])
    ]
    return pd.DataFrame({"feature": dataframe.columns, "VIF": scores})

# VIF of every label-encoded feature; the target column 'income' is left out
vif(copy.drop(columns='income'))

In [None]:
# Function to get numerical and categorical columns
def getColumnType(dataframe):
    """Split the column names of *dataframe* by dtype family.

    Returns a 2-tuple ``(num_cols, cat_cols)``: the names of the columns
    selected by ``select_dtypes('number')`` followed by the names of the
    columns selected by ``select_dtypes('object')``, each as a list in
    column order.
    """
    names_by_kind = tuple(
        dataframe.select_dtypes(kind).columns.to_list()
        for kind in ('number', 'object')
    )
    return names_by_kind

# Column names of `df` split into numeric ones and object-dtype ones;
# both lists are reused by the plotting and encoding cells below
num_cols, cat_cols = getColumnType(df)

In [None]:
# Pairwise correlations between the numeric columns
corr = df[num_cols].corr()

# Mask the upper triangle (diagonal included) so every pair is annotated once
upper_triangle = np.triu(corr)
sns.heatmap(corr, mask=upper_triangle, annot=True)
plt.show()

In [None]:
# Kernel density estimate of every numeric feature, one panel per feature.
# The binary target 'income' is left out explicitly instead of being drawn
# and then hidden, and each plot is sent to an axes created by plt.subplots
# rather than relying on plt.subplot() handing back that same axes.
kde_cols = [name for name in num_cols if name != 'income']

# Three panels per row; ceil-divide so every feature gets a panel
panels_per_row = 3
n_rows = max(1, -(-len(kde_cols) // panels_per_row))

# squeeze=False keeps `ax` two-dimensional even for a single row
fig, ax = plt.subplots(nrows=n_rows, ncols=panels_per_row, squeeze=False)
plt.suptitle('Kernel Density Estimates for Various Features', fontsize=16)

panels = ax.ravel()
for panel, col in zip(panels, kde_cols):
    sns.kdeplot(df[col], color='orange', fill=True, ax=panel)

# Hide the panels that did not receive a feature
for panel in panels[len(kde_cols):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Share of each income class, as a percentage of all rows
# NOTE(review): the tick/legend labels assume class 0 ('<=50K') is drawn first - confirm
class_labels = ['<=50K', ">50K"]
ax = sns.countplot(data=df, x='income', hue='income', stat='percent', palette='pastel')
ax.set_title('Class Balance Visualization')

# Pin the tick positions before renaming them
ax.xaxis.set_ticks(ax.get_xticks())
ax.set_xticklabels(class_labels)
ax.legend(labels=class_labels)
plt.show()

### Creating baseline models

Algorithms used: 

1) Decision Tree
2) SVC (support vector classifier)

In [None]:
# Features are every column except the last one; the target is 'income'
# NOTE(review): assumes 'income' is the last column of df - confirm
features = df.iloc[:, :-1]
target = df['income']

In [None]:
# One-hot encode the columns listed in cat_cols; the other columns pass through unchanged
features = pd.get_dummies(features, columns=cat_cols)


In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)


In [None]:
# Baseline: an unpruned decision tree trained on the unscaled features
dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Weighted F1 on the held-out set (shown as the cell output)
wo_scale_pred = dt.predict(x_test)
f1_score(y_test, wo_scale_pred, average='weighted')



In [None]:
# Standardise the features with StandardScaler.
# The scaler learns its mean and standard deviation from the training set only.
ss = StandardScaler()
x_train_scale = ss.fit_transform(x_train)

# Reuse the training statistics on the test set: fitting the scaler again here
# would scale the test rows with their own statistics, so train and test would
# no longer live on the same scale and test information would leak into the
# preprocessing.
x_test_scale = ss.transform(x_test)


In [None]:
# Same tree configuration, this time trained and evaluated on the standardised features
dt = DecisionTreeClassifier(random_state=42).fit(x_train_scale, y_train)

# NOTE(review): despite its name, `wo_scale_pred` now holds the predictions made
# WITH scaling and overwrites the unscaled ones from the baseline cell
wo_scale_pred = dt.predict(x_test_scale)
f1_score(y_test, wo_scale_pred, average='weighted')

In [None]:
# Candidate alphas for cost-complexity pruning, computed on the unscaled training data
# NOTE(review): `dt` was last fitted on the scaled features; the path is presumably
# computed from a fresh fit on the data passed here - confirm against scikit-learn
path = dt.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities


In [None]:
# Number of candidate alphas on the pruning path
len(ccp_alphas)

In [None]:
# One class-balanced tree per candidate alpha; the first 100 alphas are skipped
clfs = [
    DecisionTreeClassifier(
        random_state=42,
        ccp_alpha=alpha,
        class_weight='balanced',
    ).fit(x_train, y_train)
    for alpha in ccp_alphas[100:]
]

In [None]:
# Small helper so the metric configuration is written only once
def _weighted_f1(y_true, y_hat):
    return f1_score(y_true, y_hat, average='weighted')


# Predictions of every pruned tree on the training and on the test set
train_pred = [model.predict(x_train) for model in clfs]
test_pred = [model.predict(x_test) for model in clfs]

# Weighted F1 of every pruned tree, in the same order as `clfs`
train_scores = [_weighted_f1(y_train, y_hat) for y_hat in train_pred]
test_scores = [_weighted_f1(y_test, y_hat) for y_hat in test_pred]

In [None]:
# Weighted F1 of the pruned trees as a function of alpha, train vs. test
fig = go.Figure()

# The scores belong to the classifiers fitted on ccp_alphas[100:], so take
# exactly as many alphas from the same starting offset. A fixed upper bound
# would pair x and y values of different lengths as soon as the pruning path
# is longer than that bound.
plot_alphas = ccp_alphas[100:100 + len(train_scores)]

# Add training data trace
fig.add_trace(go.Scatter(x=plot_alphas, y=train_scores, name='Train'))

# Add testing data trace
fig.add_trace(go.Scatter(x=plot_alphas, y=test_scores, name='Test'))

# Axis formatting and titles; the y axis shows weighted F1 (not accuracy),
# in line with the figure title and the metric computed above
fig.update_layout(
    xaxis_tickformat=".5f",
    xaxis_title="alpha",
    yaxis_title="weighted F1",
    title="Weighted F1 vs alpha for training and testing sets"
)

# Mark the alpha that is used for the final model (hand-picked coordinates)
fig.add_annotation(x=0.000145, y=0.826,
            text="alpha = 0.000145",
            showarrow=True,
            font_size=13)

# Show the plot
fig.show()

In [None]:
# Final tree, pruned with the alpha marked on the plot and trained on the training set
# NOTE(review): the candidate trees behind that plot used class_weight='balanced',
# this final model does not - confirm that the difference is intended
dt = DecisionTreeClassifier(ccp_alpha=0.000145, random_state=42).fit(x_train, y_train)

# Predictions for both splits, consumed by print_scores below
train_pred = dt.predict(x_train)
test_pred = dt.predict(x_test)

In [None]:
# Function to print the F1 scores for the training and test sets
def print_scores(actual_train, actual_test, train_predict, test_pred):
    """Print the weighted F1 score of the training and of the test predictions.

    Parameters
    ----------
    actual_train, actual_test
        True labels of the training and test set.
    train_predict, test_pred
        Predicted labels for the training and test set.

    Returns nothing; both scores are computed first and then printed, one
    per line.
    """
    report = {
        "Train F1 Score:": f1_score(actual_train, train_predict, average='weighted'),
        "Test F1 Score:": f1_score(actual_test, test_pred, average='weighted'),
    }
    for caption, score in report.items():
        print(caption, score)


In [None]:
# Weighted F1 of the pruned tree on the training and on the test set
print_scores(y_train, y_test, train_pred, test_pred)

## How do different techniques affect our imbalanced dataset?

In [None]:
# Rebuild target and one-hot encoded features from df, then hold out 30% for testing
target = df['income']
features = pd.get_dummies(df.iloc[:,:-1])
x_train, x_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.3
)

In [None]:
def svc_model(xtrain, ytrain, xtest, ytest):
    """Train a scaled, class-weighted SVC and return its weighted F1 score.

    A pipeline of StandardScaler followed by SVC(class_weight='balanced') is
    fitted on ``xtrain``/``ytrain``; its predictions for ``xtest`` are scored
    against ``ytest`` with the weighted F1 metric, and that score is returned.
    """
    steps = [
        ('scaling', StandardScaler()),
        ('svc', SVC(class_weight='balanced')),
    ]
    fitted = Pipeline(steps).fit(xtrain, ytrain)
    return f1_score(ytest, fitted.predict(xtest), average='weighted')

#### using class weights

In [None]:
# Weighted F1 of the class-weighted SVC on the untouched split (shown as the cell output)
svc_model(x_train, y_train, x_test, y_test)

### Oversampling 
 - SMOTENC (the SMOTE variant for mixed numerical and categorical features)

In [None]:
# Raw (not yet one-hot encoded) features: every column of df except the last one
# NOTE(review): assumes 'income' is the last column of df - confirm
features = df.iloc[:, :-1]

# Target variable
target = df['income']

In [None]:
# Column positions inside `features` that SMOTENC must treat as categorical
# NOTE(review): hard-coded positions; presumably the object-dtype columns that
# remain after dropping 'education' and 'fnlwgt' - verify if the schema changes
categorical_idx = [1, 3, 4, 5, 6, 7, 11]
smotenc = SMOTENC(random_state=42, categorical_features=categorical_idx)

# Oversample on the raw features
# NOTE(review): the whole dataset is resampled here, before any train/test
# split, so synthetic rows can end up in the test set of the next cell
x_smote, y_smote = smotenc.fit_resample(features, target)

# One-hot encode the resampled features
x_smote = pd.get_dummies(x_smote)

In [None]:
# Split the already resampled data 70/30
# NOTE(review): the test part is drawn from the resampled data, so the score
# below is measured on a rebalanced test set rather than on the original
# class distribution - confirm that this is intended
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=42)

# Weighted F1 of the SVC pipeline on that split (shown as the cell output)
svc_model(x_train, y_train, x_test, y_test)

### UnderSampling
- NearMiss

In [None]:
# One-hot encode the features so that NearMiss receives a purely numeric table
features = pd.get_dummies(features)

# Hold out the test set BEFORE resampling. Resampling the full dataset first
# would make the test set balanced as well, and the score would then no
# longer describe performance on the real class distribution.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# NearMiss under-sampling with 3 neighbours, applied to the training part only
nm = NearMiss(n_neighbors=3)
x_nm, y_nm = nm.fit_resample(x_train, y_train)

# Train on the balanced training data, evaluate on the untouched test set
# (weighted F1, shown as the cell output)
svc_model(x_nm, y_nm, x_test, y_test)

### Combined OverSampling and UnderSampling

In [None]:
# Target column, plus all remaining columns with those in cat_cols one-hot encoded
target = df['income']
features = pd.get_dummies(df.iloc[:, :-1], columns=cat_cols)

In [None]:
# The two combined over-/under-sampling strategies, both seeded for reproducibility
sme, smt = SMOTEENN(random_state=42), SMOTETomek(random_state=42)


In [None]:
# Resample the one-hot encoded data with SMOTEENN first, then with SMOTETomek
# NOTE(review): as in the SMOTENC section, resampling happens before the
# train/test split, so the test sets below contain resampled rows
(x_sme, y_sme), (x_smt, y_smt) = (
    resampler.fit_resample(features, target) for resampler in (sme, smt)
)


In [None]:
# Both resampled datasets are split the same way: 30% test, fixed seed
split_options = {'test_size': 0.3, 'random_state': 42}

# SMOTEENN split
x_train_sme, x_test_sme, y_train_sme, y_test_sme = train_test_split(x_sme, y_sme, **split_options)

# SMOTETomek split
x_train_smt, x_test_smt, y_train_smt, y_test_smt = train_test_split(x_smt, y_smt, **split_options)

In [None]:
# Weighted F1 of the SVC pipeline on the SMOTEENN split (shown as the cell output)
svc_model(x_train_sme, y_train_sme, x_test_sme, y_test_sme)

In [None]:
# Weighted F1 of the SVC pipeline on the SMOTETomek split (shown as the cell output)
svc_model(x_train_smt, y_train_smt, x_test_smt, y_test_smt)

In [None]:
def visulise_noise(X1, X2, y1, y2, titles=("Baseline", "SMOTEENN")):
    """Show two labelled datasets side by side in a shared 3-component PCA space.

    A StandardScaler + PCA(n_components=3) pipeline is fitted on ``X1`` only
    and then applied to ``X2``, so both panels use the same projection.

    Parameters
    ----------
    X1, X2
        Feature tables with the same columns; ``X1`` defines the projection.
    y1, y2
        Labels for the rows of ``X1`` and ``X2``, used to colour the markers.
        They are matched to the rows by position, not by index label.
    titles
        Captions of the left and right panel. The default keeps the captions
        the function has always shown.

    Returns nothing; the figure is displayed with ``fig.show()``.
    """
    # Fit the projection on the first dataset, reuse it for the second
    pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=3))])
    projected_first = pipeline.fit_transform(X1)
    projected_second = pipeline.transform(X2)

    component_names = ['pc1', 'pc2', 'pc3']
    panels = []
    for projected, labels in ((projected_first, y1), (projected_second, y2)):
        frame = pd.DataFrame(projected, columns=component_names)
        # np.asarray drops the index of a pandas Series: assigning the Series
        # itself would align on index labels and silently misplace (or blank
        # out) labels whenever that index is not 0..n-1
        frame['targets'] = np.asarray(labels)
        panels.append(frame)

    # One 3D scene per dataset
    fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}]])

    # Horizontal caption positions in paper coordinates, left and right panel
    caption_x = (0.10, 0.65)
    for column, (frame, caption, x_pos) in enumerate(zip(panels, titles, caption_x), start=1):
        fig.add_annotation(text=caption, xref="paper", yref="paper", x=x_pos, y=0.97, showarrow=False)
        fig.add_trace(
            go.Scatter3d(
                x=frame['pc1'], y=frame['pc2'], z=frame['pc3'],
                marker=dict(color=frame['targets']),
                mode='markers', showlegend=False,
            ),
            row=1, col=column,
        )

    fig.show()

In [None]:
# Compare the one-hot encoded original data with its SMOTEENN-resampled version in PCA space
visulise_noise(features , x_sme , target , y_sme)