# **Customer Churn Prediction**
# Import Libraries

In [None]:


# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load and Inspect Data

In [1]:
# Location of the raw churn export, resolved against the kernel's working directory.
raw_csv_path = '../data/telco_churn.csv'

df = pd.read_csv(raw_csv_path)
print("Dataset shape:", df.shape)

# Bare last expression so the notebook renders the first rows as a rich table.
df.head()

NameError: name 'pd' is not defined


# Exploratory Data Analysis (EDA)

In [2]:
# Summary statistics and per-column null counts.
# NOTE(review): TotalCharges is presumably still text at this point (it is only
# converted inside create_features), so describe() will skip it and blank
# strings will not show up as missing here -- confirm against the raw file.
summary_stats = df.describe()
missing_per_column = df.isna().sum()

print(summary_stats)
print("\nMissing values:")
print(missing_per_column)

# Class balance of the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', ax=ax)
ax.set_title('Churn Distribution')
plt.show()

NameError: name 'df' is not defined

# Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` with cleaned and engineered churn features.

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TotalCharges``, ``MonthlyCharges``,
        ``tenure`` and ``InternetService``.

    Returns
    -------
    pandas.DataFrame
        A new frame with:

        * ``TotalCharges`` converted to numeric; unparseable or missing
          entries become 0.
        * ``MonthlyChargesPerTenure`` = ``MonthlyCharges / tenure``, where a
          tenure of 0 is treated as 1 so the ratio stays finite.
        * ``HasInternet`` = 1 unless ``InternetService`` equals ``'No'``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    features = df.copy()

    # Coerce first, then fill: values that cannot be parsed turn into NaN and
    # are replaced by 0. Assigning the result back (instead of a chained
    # ``fillna(..., inplace=True)``) also works under pandas Copy-on-Write.
    features['TotalCharges'] = pd.to_numeric(
        features['TotalCharges'], errors='coerce'
    ).fillna(0)

    # A tenure of 0 would make the ratio +/-inf, which StandardScaler rejects.
    # Only exact zeros are replaced; NaN tenure still propagates as NaN.
    safe_tenure = features['tenure'].replace(0, 1)
    features['MonthlyChargesPerTenure'] = features['MonthlyCharges'] / safe_tenure

    # Vectorised flag; pinned to int64 so dtype-based column selection that
    # looks for 'int64' keeps picking it up on every platform.
    features['HasInternet'] = (features['InternetService'] != 'No').astype('int64')

    return features

# Rebind df to the frame returned by create_features so later cells see the engineered columns.
df = create_features(df)

# Data Preprocessing

In [3]:
# Separate features and target.
# A per-row identifier carries no signal and, if left in, would be one-hot
# encoded into one column per customer. NOTE(review): 'customerID' is the
# identifier name in the standard Telco churn export -- confirm for this file;
# errors='ignore' makes the drop a no-op when the column is absent.
X = df.drop(columns='Churn').drop(columns='customerID', errors='ignore')
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Series.map turns any label outside the mapping into NaN; fail loudly here
# rather than later inside the stratified split or the classifier.
assert y.notna().all(), "Churn contains values other than 'Yes'/'No'"

# Identify categorical and numerical columns.
# 'number' matches every numeric width (int32, float32, ...), not only the
# 64-bit ones, so no numeric column is silently dropped by the transformer.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include='number').columns

# Preprocessing pipeline: scale numeric columns, one-hot encode text columns.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

NameError: name 'df' is not defined

# Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions comparable across both partitions; the fixed seed makes the
# split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

# Model Pipeline

In [4]:
# Hyper-parameters for the gradient-boosted classifier, kept in one place.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
}

# Chain preprocessing and the classifier so both are fitted together.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(**xgb_params)),
])

NameError: name 'Pipeline' is not defined

# Model Training

In [None]:
# Fit the full pipeline (preprocessor + classifier) on the training split only;
# the test split is not passed in here.
model.fit(X_train, y_train)



# Model Evaluation

In [5]:
# Predict on the held-out split.
preds = model.predict(X_test)

# Overall accuracy first, then the per-class breakdown.
test_accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {test_accuracy:.2f}")

print("\nClassification Report:")
report_text = classification_report(y_test, preds)
print(report_text)

NameError: name 'model' is not defined

# Feature Importance

In [6]:
# Rebuild the transformed column names: numeric columns come first, followed by
# the one-hot columns, mirroring the transformer order in the preprocessor.
fitted_preprocessor = model.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat']
cat_features = fitted_encoder.get_feature_names_out(cat_cols)
all_features = [*num_cols, *cat_features]

# Rank the classifier's importances and keep the 15 largest.
fig, ax = plt.subplots(figsize=(12, 8))
importance_scores = model.named_steps['classifier'].feature_importances_
feature_importances = (
    pd.Series(importance_scores, index=all_features)
    .sort_values(ascending=False)
    .iloc[:15]
)

sns.barplot(x=feature_importances, y=feature_importances.index, ax=ax)
ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance Score')
plt.show()

NameError: name 'model' is not defined