In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, auc
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTEENN
from collections import Counter

In [2]:
cd /content/drive/MyDrive/Machine learning 2024/Notebooks - Machine Learning/Generic Projects/lung cancer

/content/drive/MyDrive/Machine learning 2024/Notebooks - Machine Learning/Generic Projects/lung cancer


In [3]:
data = pd.read_csv("lung_cancer.csv")

In [4]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['LUNG_CANCER']=encoder.fit_transform(data['LUNG_CANCER'])
data['GENDER']=encoder.fit_transform(data['GENDER'])

In [5]:
X=data.drop('LUNG_CANCER',axis=1)
y=data['LUNG_CANCER']

In [6]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
Counter(y_train)

Counter({1: 210, 0: 37})

In [9]:
# Performing Logistic Regression on the original imbalanced dataset
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluating the performance
print("Logistic Regression on Original Data:")
print(classification_report(y_test, y_pred_lr, digits=4))


Logistic Regression on Original Data:
              precision    recall  f1-score   support

           0     0.5000    0.5000    0.5000         2
           1     0.9833    0.9833    0.9833        60

    accuracy                         0.9677        62
   macro avg     0.7417    0.7417    0.7417        62
weighted avg     0.9677    0.9677    0.9677        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# 1. Random Oversampling
ros = RandomOverSampler(random_state=42)
X_res_ros, y_res_ros = ros.fit_resample(X_train, y_train)
print(Counter(y_res_ros))

model_ros = LogisticRegression(random_state=42)
model_ros.fit(X_res_ros, y_res_ros)
y_pred_ros = model_ros.predict(X_test)
print("Random Oversampling:")
print(classification_report(y_test, y_pred_ros, digits=4))

Counter({1: 210, 0: 210})
Random Oversampling:
              precision    recall  f1-score   support

           0     0.1429    0.5000    0.2222         2
           1     0.9818    0.9000    0.9391        60

    accuracy                         0.8871        62
   macro avg     0.5623    0.7000    0.5807        62
weighted avg     0.9548    0.8871    0.9160        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# 2. Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_res_rus, y_res_rus = rus.fit_resample(X_train, y_train)
print(Counter(y_res_rus))
model_rus = LogisticRegression(random_state=42)
model_rus.fit(X_res_rus, y_res_rus)
y_pred_rus = model_rus.predict(X_test)
print("Random Undersampling:")
print(classification_report(y_test, y_pred_rus))

Counter({0: 37, 1: 37})
Random Undersampling:
              precision    recall  f1-score   support

           0       0.12      0.50      0.20         2
           1       0.98      0.88      0.93        60

    accuracy                           0.87        62
   macro avg       0.55      0.69      0.56        62
weighted avg       0.95      0.87      0.91        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# 3. SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_res_smote, y_res_smote = smote.fit_resample(X_train, y_train)
print(Counter(y_res_smote))
model_smote = LogisticRegression(random_state=42)
model_smote.fit(X_res_smote, y_res_smote)
y_pred_smote = model_smote.predict(X_test)
print("SMOTE:")
print(classification_report(y_test, y_pred_smote, digits=4))

Counter({1: 210, 0: 210})
SMOTE:
              precision    recall  f1-score   support

           0     0.5000    0.5000    0.5000         2
           1     0.9833    0.9833    0.9833        60

    accuracy                         0.9677        62
   macro avg     0.7417    0.7417    0.7417        62
weighted avg     0.9677    0.9677    0.9677        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Applying SMOTEENN (SMOTE + Edited Nearest Neighbors)
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Display resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Training a logistic regression model on the resampled dataset
model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)

# Evaluating the model
print("SMOTEENN Results:")
print(classification_report(y_test, y_pred, digits=4))

Resampled class distribution: Counter({0: 185, 1: 159})
SMOTEENN Results:
              precision    recall  f1-score   support

           0     0.3333    0.5000    0.4000         2
           1     0.9831    0.9667    0.9748        60

    accuracy                         0.9516        62
   macro avg     0.6582    0.7333    0.6874        62
weighted avg     0.9621    0.9516    0.9562        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

# Display resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Training a logistic regression model on the resampled dataset
model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)

# Evaluating the model
print("SMOTETomek Results:")
print(classification_report(y_test, y_pred, digits=4))

Resampled class distribution: Counter({1: 207, 0: 207})
SMOTETomek Results:
              precision    recall  f1-score   support

           0     0.5000    0.5000    0.5000         2
           1     0.9833    0.9833    0.9833        60

    accuracy                         0.9677        62
   macro avg     0.7417    0.7417    0.7417        62
weighted avg     0.9677    0.9677    0.9677        62



In [15]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Display resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Training a logistic regression model on the resampled dataset
model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)

# Evaluating the model
print("ADASYN Results:")
print(classification_report(y_test, y_pred, digits=4))

Resampled class distribution: Counter({1: 210, 0: 207})
ADASYN Results:
              precision    recall  f1-score   support

           0     0.5000    0.5000    0.5000         2
           1     0.9833    0.9833    0.9833        60

    accuracy                         0.9677        62
   macro avg     0.7417    0.7417    0.7417        62
weighted avg     0.9677    0.9677    0.9677        62



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
