# Feature Engineering
Create new features, handle missing values, perform one-hot encoding, and scale/normalize features using techniques like StandardScaler or MinMaxScaler.

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
path = 'C:/Users/MSI/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1'
# Load dataset
data = pd.read_csv(f"{path}/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Handle missing values
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data.select_dtypes(include=['object']).columns

# Impute numeric features with mean strategy
imputer_numeric = SimpleImputer(strategy='mean')
data[numeric_features] = imputer_numeric.fit_transform(data[numeric_features])

# Impute categorical features with most_frequent strategy
imputer_categorical = SimpleImputer(strategy='most_frequent')
data[categorical_features] = imputer_categorical.fit_transform(data[categorical_features])

data_imputed = data.copy()

# One-hot encode categorical features
categorical_features = data_imputed.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = pd.DataFrame(encoder.fit_transform(data_imputed[categorical_features]), columns=encoder.get_feature_names_out(categorical_features))

# Drop original categorical features and concatenate encoded features
data_imputed = data_imputed.drop(categorical_features, axis=1)
data_encoded = pd.concat([data_imputed, encoded_features], axis=1)

# Scale/normalize features
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data_encoded), columns=data_encoded.columns)

# Display the first few rows of the processed data
data_scaled.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,customerID_0003-MKNFE,customerID_0004-TLHLJ,customerID_0011-IGKFF,customerID_0013-EXCHZ,customerID_0013-MHZWF,customerID_0013-SMEOE,customerID_0014-BMAQU,...,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9,Churn_Yes
0,-0.439916,-1.277445,-1.160323,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,...,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.601023
1,-0.439916,0.066327,-0.259629,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,...,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.601023
2,-0.439916,-1.236724,-0.36266,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,...,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,1.663829
3,-0.439916,0.514251,-0.746535,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,...,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.601023
4,-0.439916,-1.236724,0.197365,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,...,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,-0.011917,1.663829


# Feature Correlation Analysis
Generate correlation matrix, create heatmap visualization, identify and handle multicollinearity between features.

In [None]:
# Feature Correlation Analysis

import seaborn as sns
import matplotlib.pyplot as plt

# Generate correlation matrix
correlation_matrix = data_scaled.corr()

# Create heatmap visualization
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

# Identify and handle multicollinearity
# We will drop one of the highly correlated features (correlation > 0.8)
threshold = 0.8
high_corr_var = np.where(correlation_matrix > threshold)
high_corr_var = [(correlation_matrix.columns[x], correlation_matrix.columns[y]) for x, y in zip(*high_corr_var) if x != y and x < y]

# Drop one of each pair of highly correlated features
features_to_drop = set()
for var1, var2 in high_corr_var:
    features_to_drop.add(var2)

data_reduced = data_scaled.drop(columns=features_to_drop)

# Display the first few rows of the reduced data
data_reduced.head()

# Model Training
Split data into training and testing sets, initialize the model, train it on the training data, and make predictions.

In [None]:
# Model Training

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split data into features and target variable
X = data_reduced.drop('target', axis=1)
y = data_reduced['target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print(classification_report(y_test, y_pred))

# Model Evaluation
Calculate accuracy, precision, recall, F1-score, and generate confusion matrix. Plot ROC curve and calculate AUC score.

In [None]:
# Model Evaluation

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Plot ROC curve and calculate AUC score
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

print(f'AUC score: {roc_auc:.2f}')

# Model Performance Analysis
Analyze learning curves to detect overfitting/underfitting, perform cross-validation, and validate model performance on different data splits.

In [None]:
# Model Performance Analysis

from sklearn.model_selection import learning_curve, cross_val_score
import numpy as np

# Function to plot learning curves
def plot_learning_curves(estimator, X, y):
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=5, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.title('Learning Curves')
    plt.show()

# Plot learning curves for the model
plot_learning_curves(model, X_train, y_train)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean():.2f}')

# Evaluate model performance on different data splits
train_sizes = [0.6, 0.7, 0.8]
for train_size in train_sizes:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, y, train_size=train_size, random_state=42)
    model.fit(X_train_split, y_train_split)
    y_pred_split = model.predict(X_test_split)
    accuracy_split = accuracy_score(y_test_split, y_pred_split)
    print(f'Train size: {train_size}, Accuracy: {accuracy_split:.2f}')