In [None]:
## Import dependencies

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the data
# NOTE: Path("") is an unfilled placeholder (an empty path resolves to the
# current directory) — point it at the CSV file before running this cell.
file_path = Path("")
df = pd.read_csv(file_path)

In [None]:
# Create our features: drop the target, then one-hot encode the remaining
# non-numerical columns with get_dummies.
df_encoded = pd.get_dummies(df.drop(columns='target column'))
# Compare each column name for equality with the target name. The previous
# `i not in ('target column')` tested against a plain string (the parentheses
# do not make a tuple), i.e. a substring check: it silently dropped any
# feature whose name is a substring of the target name and raised TypeError
# for non-string column labels.
X_cols = [col for col in df_encoded.columns if col != 'target column']
X = df_encoded[X_cols]

# Create our target
y = df['target column']

In [None]:
# Check the per-class counts of the target to see whether resampling is needed
y.value_counts()

In [None]:
# Split features and target into train and test sets.
# random_state=1 makes the shuffle reproducible; no test_size is passed, so
# the library default split proportion applies.
# NOTE(review): the split is not stratified — consider stratify=y if the
# class counts above turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Create the scaler instance
scaler = StandardScaler()
# Fit the scaler on the training split only, so the test split does not
# influence the learned statistics. Without this step, transform() is called
# on an unfitted scaler and raises NotFittedError.
scaler.fit(X_train)
# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Resample the training data: RandomOverSampler (oversampler)
# Only the scaled training split is resampled; the test split is left as is.
# NOTE: the SMOTE, ClusterCentroids and SMOTEENN cells below assign the same
# X_resampled / y_resampled names, so whichever resampling cell ran last is
# the one that feeds the model.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

In [None]:
# Resample the training data: SMOTE (oversampler)
# Build the sampler first, then resample the scaled training split with it.
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Resample the training data: ClusterCentroids (undersampler)
# Only the scaled training split is resampled; the test split is left as is.
# NOTE: this overwrites the X_resampled / y_resampled produced by the
# oversampling cells above, and is itself overwritten by the SMOTEENN cell.
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)

In [None]:
# Resample the training data: SMOTEENN (combined over- and undersampling)
# NOTE: this is the last cell that assigns X_resampled / y_resampled, so when
# the notebook is run top to bottom this is the resampling the model is
# trained on.
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

In [None]:
# Count the samples per class in the resampled training target
Counter(y_resampled)

In [None]:
# Train the logistic regression on the resampled training data.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(random_state=1, solver='lbfgs').fit(X_resampled, y_resampled)
# Predict on the scaled test features
y_pred = model.predict(X_test_scaled)

In [None]:
# Calculate the balanced accuracy score of the test-set predictions
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix of true test labels vs. predictions
confusion_matrix(y_test, y_pred)

In [None]:
# Build the imbalanced classification report for the test-set predictions,
# then print it.
report = classification_report_imbalanced(y_test, y_pred)
print(report)