In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import TomekLinks

In [3]:
data = pd.read_csv('Customer-Churn.csv')

# Preprocess the data
X = data.drop('Churn', axis=1)
y = data['Churn']

In [4]:
# One-hot encode categorical variables
categorical_cols = X.select_dtypes(include='object').columns
encoder = OneHotEncoder()
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]).toarray(),
                         columns=encoder.get_feature_names_out(categorical_cols))
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), X_encoded], axis=1)


In [5]:
# Apply SMOTE for upsampling
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_encoded, y)

In [6]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)


In [7]:
# Train the logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Evaluate the logistic regression model
logreg_pred = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)

Logistic Regression Accuracy: 0.7782608695652173


In [9]:
# Train the decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [10]:
# Evaluate the decision tree classifier
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Tree Classifier Accuracy:", dt_accuracy)

Decision Tree Classifier Accuracy: 0.8555555555555555


In [11]:
# Compare the accuracies
if logreg_accuracy > dt_accuracy:
    print("Logistic Regression model has a higher accuracy.")
elif dt_accuracy > logreg_accuracy:
    print("Decision Tree Classifier has a higher accuracy.")
else:
    print("Both models have the same accuracy.")

Decision Tree Classifier has a higher accuracy.


## Apply TomekLinks for downsampling

##It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
##Use logistic regression to fit the model and compute the accuracy of the model.
##Use decision tree classifier to fit the model and compute the accuracy of the model.
##Compare the accuracies of the two models.
##You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [12]:
# Load the data again 
data_2 = pd.read_csv('Customer-Churn.csv')

# Step 2: Preprocess the data
X = data_2.drop('Churn', axis=1)
y = data_2['Churn']



In [13]:
# One-hot encode categorical variables
categorical_cols = X.select_dtypes(include='object').columns
encoder = OneHotEncoder()
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]).toarray(),
                         columns=encoder.get_feature_names_out(categorical_cols))
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), X_encoded], axis=1)


In [15]:
tomek_links = TomekLinks()
X_train_undersampled, y_train_undersampled = tomek_links.fit_resample(X_train, y_train)


In [17]:
# Fit and evaluate logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train_undersampled, y_train_undersampled)
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy of logistic regression model:", accuracy_logreg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of logistic regression model: 0.7830917874396135


In [18]:
# Fit and evaluate decision tree classifier model
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_undersampled, y_train_undersampled)
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of decision tree classifier model:", accuracy_dt)

Accuracy of decision tree classifier model: 0.8560386473429952


In [19]:
# Compare the accuracies of the two models
print("Accuracy comparison:")
print("- Logistic Regression:", accuracy_logreg)
print("- Decision Tree Classifier:", accuracy_dt)


Accuracy comparison:
- Logistic Regression: 0.7830917874396135
- Decision Tree Classifier: 0.8560386473429952


In [20]:
# Apply TomekLinks one more time to check changes in class imbalance
X_train_undersampled2, y_train_undersampled2 = tomek_links.fit_resample(X_train_undersampled, y_train_undersampled)
class_counts_before = pd.Series(y_train_undersampled.value_counts(), name="Class Counts Before")
class_counts_after = pd.Series(y_train_undersampled2.value_counts(), name="Class Counts After")
print("\nClass Imbalance Comparison:")
print(class_counts_before)
print(class_counts_after)


Class Imbalance Comparison:
Yes    4125
No     4078
Name: Class Counts Before, dtype: int64
Yes    4108
No     4078
Name: Class Counts After, dtype: int64
