In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier


In [2]:
# Upload the dataset files
train_file_path = '/content/Cell Phone Churn-TRAIN.csv'
test_file_path = '/content/Cell Phone Churn-TEST.csv'

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)


In [6]:
print(train_df.columns)
print(test_df.columns)


Index(['MOUMO', 'MOUPMO', 'MOU3MO', 'MOUCH1M', 'MOUCH3M', 'CUM3MCH', 'CUSTMOS',
       'LONGD', 'CALLW', 'LINES', 'VOICEM', 'CELL', 'CONVB', 'SEX', 'INCOME',
       'SPORTS', 'NATURE', 'ARTS', 'HRS_TV', 'TRAVEL', 'EDUC', 'TOTMOU',
       'TOTCHNG', 'TARGET'],
      dtype='object')
Index(['MOUMO', 'MOUPMO', 'MOU3MO', 'MOUCH1M', 'MOUCH3M', 'CUM3MCH', 'CUSTMOS',
       'LONGD', 'CALLW', 'LINES', 'VOICEM', 'CELL', 'CONVB', 'SEX', 'INCOME',
       'SPORTS', 'NATURE', 'ARTS', 'HRS_TV', 'TRAVEL', 'EDUC', 'TOTMOU',
       'TOTCHNG', 'TARGET'],
      dtype='object')


In [7]:
# Separate features and target
X_train = train_df.drop('TARGET', axis=1)
y_train = train_df['TARGET']
X_test = test_df.drop('TARGET', axis=1)
y_test = test_df['TARGET']


In [8]:
# Initialize classifiers
classifiers = {
    "MLP": MLPClassifier(random_state=42),
    "LinearSVC": LinearSVC(random_state=42, max_iter=10000),  # Increase max_iter to avoid convergence issues
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Dictionary to store results
results = []

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results.append({
        "Classifier": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='binary'),
        "Recall": recall_score(y_test, y_pred, average='binary'),
        "F1": f1_score(y_test, y_pred, average='binary')
    })

# Display results
results_df = pd.DataFrame(results)
print(results_df)


     Classifier  Accuracy  Precision    Recall        F1
0           MLP  0.733333    0.97619  0.292857  0.450549
1     LinearSVC  0.722667    0.75000  0.385714  0.509434
2  DecisionTree  0.994667    1.00000  0.985714  0.992806
3           KNN  0.994667    1.00000  0.985714  0.992806


In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Parameter grid for Decision Tree
param_dist = {
    "max_depth": [3, 5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

# Perform RandomizedSearchCV
dt_random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    random_state=42
)
dt_random_search.fit(X_train, y_train)

# Best Decision Tree model
best_dt = dt_random_search.best_estimator_
print("Best Parameters:", dt_random_search.best_params_)

# Evaluate the best Decision Tree on test data
y_pred_tuned = best_dt.predict(X_test)
results.append({
    "Classifier": "DecisionTree (Tuned)",
    "Accuracy": accuracy_score(y_test, y_pred_tuned),
    "Precision": precision_score(y_test, y_pred_tuned, average='binary'),
    "Recall": recall_score(y_test, y_pred_tuned, average='binary'),
    "F1": f1_score(y_test, y_pred_tuned, average='binary')
})


Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10, 'criterion': 'entropy'}


In [13]:
# Extract feature importances from the tuned Decision Tree
feature_importances = pd.Series(best_dt.feature_importances_, index=X_train.columns)

# Identify the top 5 most important features
top_features = feature_importances.sort_values(ascending=False).head(5)

# Display the top 5 features
print("Top 5 Features and Their Importance Scores:")
print(top_features)


Top 5 Features and Their Importance Scores:
MOUCH3M    0.449018
INCOME     0.213672
CONVB      0.062632
MOUPMO     0.054220
TOTCHNG    0.051970
dtype: float64


In [17]:
# Convert top features to a string for inclusion in results
top_features_str = ", ".join([f"{feature}: {importance:.2f}" for feature, importance in top_features.items()])

# Append feature importance as a separate row in the results
results.append({
    "Classifier": "Top 5 Features",
    "Accuracy": "N/A",
    "Precision": "N/A",
    "Recall": "N/A",
    "F1": "N/A",
    "Top Features": top_features_str  # Add top features here
})


In [18]:
# Update results DataFrame
results_df = pd.DataFrame(results)

# Save the updated results table
results_df.to_csv("classification_results_with_features.csv", index=False)

# Print the updated results DataFrame
print(results_df)


             Classifier  Accuracy Precision    Recall        F1  \
0                   MLP  0.733333   0.97619  0.292857  0.450549   
1             LinearSVC  0.722667      0.75  0.385714  0.509434   
2          DecisionTree  0.994667       1.0  0.985714  0.992806   
3                   KNN  0.994667       1.0  0.985714  0.992806   
4  DecisionTree (Tuned)  0.994667       1.0  0.985714  0.992806   
5              Stacking  0.994667       1.0  0.985714  0.992806   
6        Top 5 Features       N/A       N/A       N/A       N/A   

                                        Top Features  
0                                                NaN  
1                                                NaN  
2                                                NaN  
3                                                NaN  
4                                                NaN  
5                                                NaN  
6  MOUCH3M: 0.45, INCOME: 0.21, CONVB: 0.06, MOUP...  


In [19]:
results_df.loc[results_df["Classifier"] == "Top 5 Features", ["Accuracy", "Precision", "Recall", "F1"]] = "N/A"


In [10]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# Define base estimators
estimators = [
    ('mlp', MLPClassifier(random_state=42)),
    ('svc', LinearSVC(random_state=42, max_iter=10000)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier())
]

# Define stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=42)
)

# Train and evaluate stacking classifier
stacking_clf.fit(X_train, y_train)
y_pred_stack = stacking_clf.predict(X_test)

# Evaluate stacking classifier
results.append({
    "Classifier": "Stacking",
    "Accuracy": accuracy_score(y_test, y_pred_stack),
    "Precision": precision_score(y_test, y_pred_stack, average='binary'),
    "Recall": recall_score(y_test, y_pred_stack, average='binary'),
    "F1": f1_score(y_test, y_pred_stack, average='binary')
})


In [21]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to a CSV file for use in Word
results_df.to_csv("classification_results.csv", index=False)

# Display the results
print(results_df)


             Classifier  Accuracy Precision    Recall        F1  \
0                   MLP  0.733333   0.97619  0.292857  0.450549   
1             LinearSVC  0.722667      0.75  0.385714  0.509434   
2          DecisionTree  0.994667       1.0  0.985714  0.992806   
3                   KNN  0.994667       1.0  0.985714  0.992806   
4  DecisionTree (Tuned)  0.994667       1.0  0.985714  0.992806   
5              Stacking  0.994667       1.0  0.985714  0.992806   
6        Top 5 Features       N/A       N/A       N/A       N/A   

                                        Top Features  
0                                                NaN  
1                                                NaN  
2                                                NaN  
3                                                NaN  
4                                                NaN  
5                                                NaN  
6  MOUCH3M: 0.45, INCOME: 0.21, CONVB: 0.06, MOUP...  


In [20]:
# Save the results table to a CSV file
results_df.to_csv("classification_results.csv", index=False)
