In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# One shared seed, passed to every seeded step below (splits, CV folds, XGBoost)
RANDOM_STATE = 42

In [4]:
# 1. Loading the Dataset
# The path is relative to the kernel's working directory; a FileNotFoundError
# here means the CSV is not located there.
DATA_PATH = "Duolingo_data_03_07_final_standardized.csv"

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Show every column when previewing frames, but preview only the first rows
# rather than rendering the whole dataset into the notebook output.
pd.set_option('display.max_columns', None)
df.head()

Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'Duolingo_data_03_07_final_standardized.csv'

In [None]:
# Display basic information about the dataset
print("Dataset shape:", df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset info:")
df.info()

print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nSummary statistics:")
print(df.describe())

In [None]:
# 2. Data Preprocessing
# Per-column count of missing entries
print("\nChecking for missing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# The target is the raw multiclass `churn_time_category` column. Collapsing it
# into a binary "will churn" flag (categories >= 3 -> 1, otherwise 0) was
# considered but is not applied.
print("\nTarget distribution:")
category_counts = df['churn_time_category'].value_counts()
print(category_counts)

In [None]:
# 3. Feature Selection
# Every column is a model input except the three listed here:
# `churn_time_category` is the target, `churn_time` would leak it, and
# `user_id` is not wanted as a feature.
non_feature_columns = ['churn_time_category', 'user_id', 'churn_time']
features = df.columns[~df.columns.isin(non_feature_columns)].tolist()

y = df['churn_time_category']
X = df[features]

print("\nSelected features:", features)

In [None]:
# 4. Split the dataset into training and testing sets
# stratify=y keeps each churn category's share the same in both partitions.
# With the shared seed this yields the same partitions as any other stratified
# 80/20 split of (X, y) in this notebook, so re-running this cell out of order
# cannot silently swap in a differently composed train/test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

In [None]:
# The dataset is imbalanced, so the split is stratified on the target:
# X holds the features, y the target (churn_time_category).
split_options = {
    'test_size': 0.2,               # hold out 20% of the rows for testing
    'random_state': RANDOM_STATE,   # fixed seed for reproducibility
    'stratify': y,                  # the key option: sample per class of y
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Verify: print the relative class frequencies of the full target and of each
# partition so they can be compared side by side.
for heading, labels in (
    ("Original class distribution:", y),
    ("\nTraining set class distribution:", y_train),
    ("\nTesting set class distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))

## Train basic model

In [2]:
# Create DMatrix for faster processing
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# multi:softmax only accepts labels in [0, num_class). The churn categories are
# 1-based (1, 2, 3), so a hard-coded num_class=3 makes XGBoost reject label 3.
# Sizing num_class from the largest label keeps the labels -- and therefore the
# predictions that later cells compare against y_train / y_test -- in their
# original numbering; class index 0 simply has no training samples.
num_class = int(y.max()) + 1

# Define parameters
params = {
    'objective': 'multi:softmax',  # multiclass classification, predicts the class id
    'num_class': num_class,        # largest label + 1 (see note above)
    'eta': 0.1,                    # learning rate
    'max_depth': 6,                # max depth of trees
    'subsample': 0.8,              # subsample ratio
    'colsample_bytree': 0.8,       # feature subsample ratio
    'min_child_weight': 1,         # min sum of instance weight needed in a child
    'eval_metric': 'mlogloss',     # evaluation metric for multiclass
    'seed': RANDOM_STATE,          # for reproducibility
    'tree_method': 'hist'          # faster tree construction algorithm
}

# Train the model
print("\nTraining a basic XGBoost model...")
num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_rounds)

NameError: name 'xgb' is not defined

In [None]:
# Score the booster on the same data it was fitted on
y_train_pred = xgb_model.predict(dtrain)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Training accuracy: {:.4f}".format(train_accuracy))

In [None]:
# Evaluate the basic model on the test partition
y_pred = xgb_model.predict(dtest)

# Compute all three summaries first, then print them in order
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\nBasic XGBoost model accuracy: {:.4f}".format(accuracy))

print("\nClassification Report:")
print(report)

print("\nConfusion Matrix:")
print(cm)

In [None]:
# Heatmap of the basic model's confusion matrix (rows: actual, columns: predicted)
category_ticks = ['1', '2', '3']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=category_ticks,
    yticklabels=category_ticks,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Basic XGBoost Model')
plt.show()

In [None]:
# Of the importance measures XGBoost offers, this cell uses 'gain'
importance_gain = xgb_model.get_score(importance_type='gain')

# Two-column frame (feature name, gain), largest gain first
feature_importance = pd.DataFrame(
    list(importance_gain.items()),
    columns=['Feature', 'Importance'],
).sort_values('Importance', ascending=False)

print("\nFeature Importance (Gain):")
print(feature_importance)

# Bar chart of the 15 highest-gain features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15), ax=ax)
ax.set_title('XGBoost Feature Importance (Gain)', fontsize=14)
fig.tight_layout()
plt.show()

## CV tuning

In [None]:
# Use sklearn's XGBClassifier for easier integration with GridSearchCV
print("\nPerforming hyperparameter tuning...")


class LabelPreservingXGBClassifier(xgb.XGBClassifier):
    """XGBClassifier that accepts the notebook's 1-based churn categories.

    Current XGBoost releases require classifier targets to be exactly
    0..n_classes-1 and raise on labels such as 1, 2, 3. This subclass encodes
    the labels to 0-based codes inside ``fit`` and decodes them again in
    ``predict``, so callers keep working with the original category values.
    ``predict_proba`` is inherited unchanged; its columns follow the sorted
    original labels.

    Note: labels passed through extra fit arguments (e.g. ``eval_set``) are not
    encoded by this wrapper.
    """

    def fit(self, X, y, **fit_params):
        """Fit on 0-based codes of ``y`` and remember the sorted original labels."""
        self.original_classes_, y_codes = np.unique(y, return_inverse=True)
        return super().fit(X, y_codes, **fit_params)

    def predict(self, X, **predict_params):
        """Return class predictions expressed in the original label values."""
        raw = super().predict(X, **predict_params)
        if predict_params.get('output_margin'):
            # Raw margins are scores, not class codes; pass them through untouched.
            return raw
        return self.original_classes_[np.asarray(raw, dtype=int)]


# Create the classifier. 'multi:softprob' makes the model emit per-class
# probabilities (which a later cell requests through predict_proba); hard
# predictions are the most probable class.
xgb_clf = LabelPreservingXGBClassifier(
    objective='multi:softprob',
    num_class=3,
    random_state=RANDOM_STATE,
    tree_method='hist',
    eval_metric='mlogloss'
)

# Setup CV with stratification
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Use a reduced parameter grid for demonstration - in practice, you might want to try more combinations
reduced_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=reduced_param_grid,  # Use reduced grid for faster computation
    cv=cv,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

In [None]:
# Feature importance of the best estimator found by GridSearchCV
best_xgb_model = grid_search.best_estimator_

# Pair each training column with its importance score, most important first
feature_importance = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_xgb_model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)
top_features = feature_importance.head(15)

# Bar chart of the top 15 features
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=top_features, ax=ax)
ax.set_title('Feature Importance (Best XGBoost Model After Tuning)', fontsize=16)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
fig.tight_layout()
plt.show()

# Same top 15 as a table
print("Top 15 most important features:")
print(top_features)

In [None]:
# Evaluate the tuned model on the test partition
y_pred_tuned = best_xgb_model.predict(X_test)
accuracy_tuned = accuracy_score(y_true=y_test, y_pred=y_pred_tuned)
print("\nTuned XGBoost model accuracy: {:.4f}".format(accuracy_tuned))

print("\nClassification Report (Tuned Model):")
print(classification_report(y_test, y_pred_tuned))

# Confusion-matrix heatmap for the tuned model (rows: actual, columns: predicted)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)
category_ticks = ['1', '2', '3']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_tuned,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=category_ticks,
    yticklabels=category_ticks,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Tuned XGBoost Model')
plt.show()

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Indicator matrix of the true labels: one 0/1 column per churn category
y_test_bin = label_binarize(y_test, classes=[1, 2, 3])
n_classes = y_test_bin.shape[1]

# Per-class probability estimates from the tuned model
y_score = best_xgb_model.predict_proba(X_test)

fig, ax = plt.subplots(figsize=(8, 8))

# Diagonal reference line (random classifier)
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.5)')

# One ROC curve per class, each in its own colour
colors = ['blue', 'red', 'green', 'purple']
class_names = ['Churn Category 1', 'Churn Category 2', 'Churn Category 3']

for class_idx, (class_name, color) in enumerate(zip(class_names, colors)):
    fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], y_score[:, class_idx])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, color=color, lw=2,
            label=f'{class_name} (AUC = {roc_auc:.2f})')

# Labels, title, legend and grid
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Multiclass ROC Curve (XGBoost)', fontsize=14)
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, linestyle='--', alpha=0.7)

# Axis limits
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

fig.tight_layout()
plt.show()

In [None]:
# Learning Curves
print("\nGenerating learning curves...")

# Use StratifiedKFold for CV
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# XGBClassifier expects class labels numbered 0..n_classes-1, while the churn
# categories are 1-based. Accuracy does not depend on how the classes are
# numbered, so the curves are computed on 0-based codes of y.
_, y_codes = np.unique(y, return_inverse=True)

# Fractions of the available training data to evaluate (10% ... 100%)
train_size_fractions = np.linspace(0.1, 1.0, 10)

# Use best XGBoost model from grid search; learning_curve returns the absolute
# number of training examples used at each step as `train_sizes`.
train_sizes, train_scores, test_scores = learning_curve(
    best_xgb_model, X, y_codes,
    cv=cv,
    n_jobs=-1,
    train_sizes=train_size_fractions,
    scoring='accuracy'
)

# Mean and spread across the CV folds for every training size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot learning curve: lines are fold means, shaded bands are +/- one std
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
plt.plot(train_sizes, test_mean, 'o-', color='g', label='Cross-validation score')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')
plt.xlabel('Training Examples')
plt.ylabel('Score')
plt.title('Learning Curves for XGBoost (Multiclass Classification)')
plt.legend(loc='best')
plt.grid(True)
plt.ylim([0.4, 1.01])  # Set y-axis limits for better visualization
plt.show()