In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the dataset
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Preprocessing
# Converting 'TotalCharges' to numeric and handling missing values
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Converting categorical columns to numerical using Label Encoding
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# One-hot encoding categorical variables
df_encoded = pd.get_dummies(df.drop(columns=['customerID']), drop_first=True)

# Split dataset into features and target
X = df_encoded.drop(columns=['Churn'])
y = df_encoded['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Training - Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:,1])

# Output Results
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nROC AUC Score:", roc_auc)


Confusion Matrix:
 [[1406  133]
 [ 303  271]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.87      1539
           1       0.67      0.47      0.55       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.71      2113
weighted avg       0.78      0.79      0.78      2113


ROC AUC Score: 0.8390001652731648


In [8]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [None]:
# Let's predict the output on the test set and show the first few predictions along with the actual values.

# Predict probabilities for the test set
y_pred_proba = rf_model.predict_proba(X_test)[:,1]

# Predicted binary outcomes
y_pred = rf_model.predict(X_test)

# Creating a DataFrame to compare the actual vs predicted values
prediction_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Predicted Probability': y_pred_proba
}).reset_index(drop=True)

# Display the first few rows of predictions
prediction_df.head()


Unnamed: 0,Actual,Predicted,Predicted Probability
0,1,1,0.6
1,0,0,0.06
2,0,0,0.01
3,1,1,0.61
4,0,0,0.02


In [9]:
new_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 5,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 75.50,
    'TotalCharges': 377.50
}

# Convert to DataFrame
new_customer_df = pd.DataFrame([new_data])

# One-hot encode the new customer data
new_customer_encoded = pd.get_dummies(new_customer_df, drop_first=True)

# Ensure new data has the same columns as training data (fill missing columns with 0)
new_customer_encoded = new_customer_encoded.reindex(columns=X.columns, fill_value=0)

# Scale the new customer data
new_customer_scaled = scaler.transform(new_customer_encoded)
# Predict churn probability for the new customer
churn_prob = rf_model.predict_proba(new_customer_scaled)[:, 1]

# Predict binary churn outcome (1 for churn, 0 for no churn)
churn_pred = rf_model.predict(new_customer_scaled)

# Display prediction results
print(f"Churn Prediction: {churn_pred[0]}")  # 1 means likely to churn, 0 means not likely to churn
print(f"Churn Probability: {churn_prob[0]}")


Churn Prediction: 0
Churn Probability: 0.45


# **AUTOML TPOT**

In [10]:
pip install tpot


Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheel

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tpot import TPOTClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the dataset
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Preprocessing
# Converting 'TotalCharges' to numeric and handling missing values
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Converting categorical columns to numerical
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# One-hot encoding categorical variables
df_encoded = pd.get_dummies(df.drop(columns=['customerID']), drop_first=True)

# Split dataset into features and target
X = df_encoded.drop(columns=['Churn'])
y = df_encoded['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Training - TPOT
tpot = TPOTClassifier(verbosity=2, random_state=42, generations=5, population_size=20)
tpot.fit(X_train, y_train)

# Predictions
y_pred = tpot.predict(X_test)

# Evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, tpot.predict_proba(X_test)[:,1])

# Output Results
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nROC AUC Score:", roc_auc)

# New customer data for prediction
new_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 5,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 75.50,
    'TotalCharges': 377.50
}

# Convert to DataFrame
new_customer_df = pd.DataFrame([new_data])

# One-hot encode the new customer data
new_customer_encoded = pd.get_dummies(new_customer_df, drop_first=True)

# Ensure new data has the same columns as training data (fill missing columns with 0)
new_customer_encoded = new_customer_encoded.reindex(columns=X.columns, fill_value=0)

# Scale the new customer data
new_customer_scaled = scaler.transform(new_customer_encoded)

# Predict churn probability for the new customer
churn_prob = tpot.predict_proba(new_customer_scaled)[:, 1]

# Predict binary churn outcome (1 for churn, 0 for no churn)
churn_pred = tpot.predict(new_customer_scaled)

# Display prediction results
print(f"Churn Prediction: {churn_pred[0]}")  # 1 means likely to churn, 0 means not likely to churn
print(f"Churn Probability: {churn_prob[0]}")


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8032454361054766

Generation 2 - Current best internal CV score: 0.8046653144016227

Generation 3 - Current best internal CV score: 0.8046653144016227

Generation 4 - Current best internal CV score: 0.8046653144016227

Generation 5 - Current best internal CV score: 0.8046653144016227

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8500000000000001, min_samples_leaf=19, min_samples_split=5, n_estimators=100)
Confusion Matrix:
 [[1412  127]
 [ 281  293]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87      1539
           1       0.70      0.51      0.59       574

    accuracy                           0.81      2113
   macro avg       0.77      0.71      0.73      2113
weighted avg       0.80      0.81      0.80      2113


ROC AUC Score: 0.8574439712651095
Churn Prediction: 0
Churn Probability: 0.337019020150