In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Load the dataset
file_path = "Telecom_Churn_data.csv"  # Change path if needed
df = pd.read_csv(file_path)

In [5]:
# Encode categorical variables
df["International plan"] = df["International plan"].map({"Yes": 1, "No": 0})
df["Voice mail plan"] = df["Voice mail plan"].map({"Yes": 1, "No": 0})
df["Churn"] = df["Churn"].astype(int)  # Convert boolean to 0/1

In [6]:
# Drop unnecessary columns
df.drop(columns=["State"], inplace=True)  # 'Phone Number' not included in dataset

In [7]:
# Handle missing values (choose one method)
df.dropna(inplace=True)  # Option 1: Remove rows with NaN values
# df.fillna(df.mean(), inplace=True)  # Option 2: Fill missing values with mean

In [8]:
# Define features and target
X = df.drop(columns=["Churn"])
y = df["Churn"]

In [9]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(solver="newton-cg", max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear", random_state=42)
}

In [20]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred)
    }

In [21]:
# Convert results to DataFrame and display
results_df = pd.DataFrame(results).T
print("Model Performance:\n", results_df)

Model Performance:
                      Accuracy  Precision    Recall  F1-score
Logistic Regression  0.861423   0.580645  0.227848  0.327273
Decision Tree        0.911985   0.735294  0.632911  0.680272
Random Forest        0.949438   0.981481  0.670886  0.796992
SVM                  0.865169   1.000000  0.088608  0.162791


In [22]:
# Export predictions for Tableau
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)
df["Predicted Churn"] = best_model.predict(X)

In [23]:
# Save to CSV for Tableau
df.to_csv("churn_predictions_for_tableau.csv", index=False)
print("Predictions exported successfully for Tableau.")

Predictions exported successfully for Tableau.
