<a href="https://colab.research.google.com/github/KWLee1999/kNN-LogReg-prediction/blob/main/ML_IncomeGroup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [2]:
!pip install imbalanced-learn
!pip install yellowbrick

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0
Collecting yellowbrick
  Downloading yellowbrick-1.5-py3-none-any.whl.metadata (7.7 kB)
Downloading yellowbrick-1.5-py3-none-any.whl (282 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m282.6/282.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yellowbrick
Successfully installed yellowbrick-1.5


In [7]:
# Import General library
import numpy as np
import pandas as pd
import math as mt
import matplotlib.pyplot as plt

# Import Data Processing library
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score

# Import Learning Models library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Import Evaluation Metriccs library
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, average_precision_score, accuracy_score
from yellowbrick.classifier import ConfusionMatrix
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


Define basic functions:

In [8]:
def exp_data_analysis(df, cols_format, unique_classes):
    for label in df.columns[:-1]:
        if cols_format[label] in ['binary','cont']:
            for val in unique_classes:
                plt.hist(df[df["Income-Class"]==val][label],label=val,alpha=0.7)
            plt.ylabel("No of Adults")
            plt.xlabel(label)
            plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        if cols_format[label] not in ['binary','cont']:
           df[label].value_counts().plot(kind="bar", figsize=(10,5))
           plt.title(f"{str(label)} distribution")
           plt.xlabel(label)
           plt.ylabel("Count")

        plt.show()

def one_hot_encoder(df, df_ohe, col_names):
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df_ohe)

    encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(col_names)
    )

    df_combined = pd.concat([df,encoded_df],axis=1)

    return df_combined

def confusion_matrix(model, unique_classes, X_train, y_train, X_test, y_test):
    cm = ConfusionMatrix(model, classes=unique_classes)
    cm.fit(X_train, y_train)
    cm.score(X_test, y_test)

    cm.poof()

def class_to_int(y_set):
    int_y_test = y_set.copy()
    int_y_test[int_y_test == ' <=50K'] = '0'
    int_y_test[int_y_test == ' >50K'] = '1'
    int_y_test = int_y_test.astype('int32')

    return int_y_test

def roc_auc_graph(model, unique_classes, X_test, y_test,k,distance):
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    n_classes = len(unique_classes)

    roc_y_test = class_to_int(y_test)

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve((roc_y_test == i).astype(int), model.predict_proba(X_test)[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])


    # Plot ROC curves
    plt.figure(figsize=(6, 6))
    colors = ['blue', 'red']
    for i, color in enumerate(colors):
        tempclass = " <=50K" if i == 0 else "> 50K"
        plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (class {tempclass}) (area = {roc_auc[i]:.4f})')
        if k != "":
          plt.annotate(f'K={k}', xy=(0.8, 0.2), xytext=(0.8, 0.2))
          plt.annotate(f'{distance}', xy=(0.8, 0.16), xytext=(0.8, 0.16))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # Display AUC values
    for i in range(n_classes):
        print(f"AUC for class {i}: {roc_auc[i]:.4f}")

    plt.show()

def scale_dataset(dataframe, oversample=False):
  # Extract features from the dataframe: all columns except the last one
  X = dataframe[dataframe.columns[:-1]].values
  # Extract target variable from the dataframe: the last column
  y = dataframe[dataframe.columns[-1]].values
  # Initialize a standard scaler object
  scaler = StandardScaler()
  # Fit the scaler to the data and transform the features to have mean=0 and std deviation=1
  X = scaler.fit_transform(X)

  # Check if oversampling is required
  if oversample:
      # Initialize a random oversampler object for handling class imbalance
      ros = RandomOverSampler()
      # Use the oversampler to balance class distribution by duplicating some minority class samples
      X, y = ros.fit_resample(X, y)

  # Combine the standardized (and optionally oversampled) features and target into one array
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  # Return the combined data, the features, and the target variable
  return data, X, y

def model(x):
      return 1 / (1 + np.exp(-x))



Get the data ready for processing:

In [9]:
filename = '/content/drive/MyDrive/adult.data' # update the path to the file
cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','Income-Class']

df = pd.read_csv(filename, names=cols) # add the column name to the dataframe (panda)

df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

df.head()

#exp_data_analysis(df, cols_format, unique_classes)

col_names = ['workclass','marital-status','occupation','relationship','race']
df_ohe = df[['workclass','marital-status','occupation','relationship','race']]

ds = one_hot_encoder(df, df_ohe, col_names)
ds.columns = ds.columns.str.replace('-','_')

cols = [col for col in ds.columns if col != 'Income_Class'] + ['Income_Class']
ds = ds[cols]

#Data Processing
#1. Data quality issue. Remove the "." from Income Class.
ds['Income_Class'] = ds['Income_Class'].replace({"<=50K.": 0, ">50K.": 1,"<=50K": 0, ">50K": 1})

#2. Turn the sex into binary field
ds['sex'] = ds['sex'].replace({" Male": 0, " Female": 1})

#3. Drop the catagorical field that cannot be converted into binary and perform a proper labeling
ds = ds.select_dtypes(exclude='object')

unique_classes = ds['Income_Class'].unique()

ds.head()

  ds['Income_Class'] = ds['Income_Class'].replace({"<=50K.": 0, ">50K.": 1,"<=50K": 0, ">50K": 1})


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal_gov,workclass_Local_gov,workclass_Never_worked,...,relationship_Other_relative,relationship_Own_child,relationship_Unmarried,relationship_Wife,race_Amer_Indian_Eskimo,race_Asian_Pac_Islander,race_Black,race_Other,race_White,Income_Class
0,39,77516,13,2174,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,50,83311,13,0,0,13,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,38,215646,9,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,53,234721,7,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,28,338409,13,0,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


Split the data into Training, validating and testing sets.

Over-sampling using SMOTE method:
It works by interpolating between an existing minority class sample and some of its nearest neighbors to create new, synthetic data points.

In [10]:
sqrt_n = mt.floor(mt.sqrt(len(ds))) - 0 if mt.sqrt(len(ds))%2 > 0 else 1

# Split the dataset using ratio according to rule of thumb
# Split the data into training (80%) and the combined validation and test set
train, temp = train_test_split(ds, train_size=0.8, random_state=42)
# Split the combined validation (10%) and test set (10%)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

#Do scaling for the train data only
train, X_train, y_train = scale_dataset(train, oversample=False)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

# Assuming X_train and y_train are your training features and labels
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Shape of training data before SMOTE:", X_train.shape)
print("Shape of training data after SMOTE:", X_train_smote.shape)
print("Class distribution before SMOTE:", pd.Series(y_train).value_counts())
print("Class distribution after SMOTE:", pd.Series(y_train_smote).value_counts())

Shape of training data before SMOTE: (39073, 48)
Shape of training data after SMOTE: (59482, 48)
Class distribution before SMOTE: 0    29741
1     9332
Name: count, dtype: int64
Class distribution after SMOTE: 0    29741
1    29741
Name: count, dtype: int64


In [None]:
def knn_model(X_train, y_train, X_valid, y_valid, X_test, y_test, sqrt_n, k_values, distances):

  for i, k in enumerate(k_values, 1): # return k in: [(1, 1), (2, 5), (3, 20)] #Highest AUC

      for distance in distances:
        # Create a KNN classifier with the current k value
        knn = KNeighborsClassifier(n_neighbors=k,metric=distance, p=3) # Help to test manhattan / euclidean
        print("----------------------------------K="f"{k} ------------------------------------------")
        print(f'****************{distance}***********************')
        knn.fit(X_train, y_train)

        # Valiadation data
        cv_scores = cross_val_score(knn, X_train, y_train, cv=5)

        # Evaluate the model on the validation set
        # Calculate and display the accuracy on the test set
        y_valid_pred = knn.predict(X_valid)
        valid_accuracy = accuracy_score(y_valid, y_valid_pred)
        print(f"Validation Set Accuracy:{valid_accuracy:.2f}")

        # Evaluate the model on the test set
        # Calculate and display the accuracy on the test set
        y_test_pred = knn.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        print(f"Test Set Accuracy: {test_accuracy:.4f}")

        # Print cross-validation scores
        print("Cross-Validation Scores:", cv_scores)
        print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}" )

        # Classification Report for Validation Set
        print("Classification Report for Validation Set:")
        print(classification_report(y_valid, y_valid_pred))

        # Classification Report for Test Set
        print("Classification Report for Test Set:")
        print(classification_report(y_test, y_test_pred))

        #Confusion Matrix
        #==========================================================================
        print(f'****************Cufusion matrix of {k} for Validation Set***********************')
        confusion_matrix(knn,unique_classes,X_train, y_train,X_test, y_test)

        #Compute ROC curve and ROC area for each class
        #==========================================================================
        print(f'****************AUC of {k} for Validation Set***********************')
        roc_auc_graph(knn,unique_classes, X_valid, y_valid,k,distance)

        #Compute ROC curve and ROC area for each class
        #==========================================================================
        print(f'****************AUC of {k} for Testing Set***********************')
        roc_auc_graph(knn,unique_classes, X_test, y_test,k,distance)

def logreg_model(X_train, y_train, X_valid, y_valid, X_test, y_test):
    # instantiate the model
    logreg = LogisticRegression(solver='liblinear', random_state=0)

    # fit the model
    logreg.fit(X_train, y_train)

    cv_scores = cross_val_score(logreg, X_train, y_train, cv=5)

    y_test_pred = logreg.predict(X_test)

    # Print cross-validation scores
    print("Cross-Validation Scores:", cv_scores)
    print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}" )

    y_valid_pred = logreg.predict(X_valid)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)

    print(f"Validation Set Accuracy:{valid_accuracy:.2f}")

    # Classification Report for Validation Set
    print("Classification Report for Validation Set:")
    print(classification_report(y_valid, y_valid_pred))

    print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_test_pred)))

    # Classification Report for Test Set
    print("Classification Report for Test Set:")
    print(classification_report(y_test, y_test_pred))

    #Confusion Matrix
    #==========================================================================
    print(f'****************Cufusion matrix of Logistic Regression for Validation Set***********************')
    confusion_matrix(logreg, unique_classes, X_train, y_train, X_test, y_test)

    #Compute ROC curve and ROC area for each class
    #==========================================================================
    print(f'****************AUC of Logistic Regression for Validation Set***********************')
    roc_auc_graph(logreg, unique_classes, X_valid, y_valid,"","")

    #Compute ROC curve and ROC area for each class
    #==========================================================================
    print(f'****************AUC of Logistic Regression for Testing Set***********************')
    roc_auc_graph(logreg, unique_classes, X_test, y_test,"","")

def svm_model(X_train, y_train, X_valid, y_valid, X_test, y_test, krn, reg, gmma):
    scaler = StandardScaler()

    # Fit the scaler on training data and transform it.
    X_train = scaler.fit_transform(X_train)

    # Transform the test data using the same scaler. It's important not to fit again to avoid data leakage.
    X_test = scaler.transform(X_test)

    # SVM Classifier: Support Vector Machine with a linear kernel.
    svm_model = SVC(kernel=krn, C=reg, gamma = gmma)
    svm_model.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)

    # Calculate the accuracy of the SVM model's predictions.
    svm_accuracy = accuracy_score(y_test, svm_predictions)

    # Generate a detailed classification report showing performance metrics for the SVM.
    svm_classification_report = classification_report(y_test, svm_predictions)

    # Display the SVM's accuracy ,classification report and confusion matrix.
    print("SVM Accuracy:", svm_accuracy)
    print("SVM Classification Report:\n", svm_classification_report)
    confusion_matrix(svm_model, unique_classes, X_train, y_train, X_test, y_test)

def rf_model(X_train, y_train, X_valid, y_valid, X_test, y_test, n_est, rd_state, min_leaf):
    scaler = StandardScaler()

    # Fit the scaler on training data and transform it.
    X_train = scaler.fit_transform(X_train)

    # Transform the test data using the same scaler. It's important not to fit again to avoid data leakage.
    X_test = scaler.transform(X_test)

    # Random Forest Classifier: An ensemble of decision trees.
    rf_model = RandomForestClassifier(n_estimators=n_est, criterion='entropy', random_state=rd_state, max_depth = None)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)

    # Calculate the accuracy of the Random Forest model's predictions.
    rf_accuracy = accuracy_score(y_test, rf_predictions)

    # Generate a detailed classification report showing performance metrics for the Random Forest.
    rf_classification_report = classification_report(y_test, rf_predictions)

    # Display the Random Forest's accuracy, classification report and confusion matrix.
    print("Random Forest Accuracy:", rf_accuracy)
    print("Random Forest Classification Report:\n", rf_classification_report)
    confusion_matrix(rf_model, unique_classes, X_train, y_train, X_test, y_test)

logreg_model(X_train_smote, y_train_smote, X_valid, y_valid, X_test, y_test)
knn_model(X_train_smote, y_train_smote, X_valid, y_valid, X_test, y_test, k_values = [221], distinces = ["euclidean","manhattan","minkowski"])
rf_model(X_train_smote, y_train_smote, X_valid, y_valid, X_test, y_test, n_est = 500, rd_state = 42, min_leaf =10)
svm_model(X_train_smote, y_train_smote, X_valid, y_valid, X_test, y_test, krn = 'poly', reg = 0.1, gmma = 1)

In [4]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500, 1000, 10000],
    'random_state': [42],
    'min_samples_leaf': [5, 10, 15]
}

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1)

# Fit GridSearchCV to the oversampled training data
grid_search.fit(X_train_smote, y_train_smote)

# Print the best parameters and the corresponding best score
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score found: ", grid_search.best_score_)

NameError: name 'X_train_smote' is not defined

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [1, 0.1, 0.01, 0.001],     # Kernel coefficient
}

# Initialize the SVM Classifier
svm = SVC()

# Initialize GridSearchCV
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=3, scoring='f1', n_jobs=-1)

# Fit GridSearchCV to the oversampled training data
grid_search_svm.fit(X_train_smote, y_train_smote)

# Print the best parameters and the corresponding best score
print("Best parameters found for SVM: ", grid_search_svm.best_params_)
print("Best F1 score found for SVM: ", grid_search_svm.best_score_)