<a href="https://colab.research.google.com/github/LahariSivalasetty/newone/blob/customer_churn/customer_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# prompt: using the random forest algorithm, generate code for customer churn prediction for predictive analysis, and generate datasets by importing for a large dataset

import pandas as pd
import numpy as np
# To use a large dataset, you would typically load it from a file (e.g., CSV).
# For demonstration purposes, let's assume you have a CSV file named 'churn_dataset.csv'
# and modify the code to load from it.

# First, you might need to upload your file to Google Colab if it's on your local machine
# from google.colab import files
# uploaded = files.upload()
# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

# Or if the file is in your Google Drive, you can mount your Drive
# from google.colab import drive
# drive.mount('/content/drive')
# And then load from the Drive path, e.g., pd.read_csv('/content/drive/My Drive/churn_dataset.csv')

# For this example, let's keep the synthetic data generation part for reproducibility,
# but note that in a real scenario, you would replace this with loading your large dataset.
# Let's increase the number of samples to simulate a larger dataset within the synthetic generation.

# Build a reproducible synthetic churn dataset, then overwrite 'Churn' for a
# few customer segments so the target is correlated with some of the features.
np.random.seed(42)  # fixed seed: every draw below is repeatable
n_samples_large = 10000  # number of synthetic customers

# NOTE: all draws consume the same global NumPy random stream, so they must
# stay in this exact order -- reordering them changes the generated data.
# The dict is filled column by column; insertion order is the column order.
data_large = {}
data_large['CustomerID'] = range(1, n_samples_large + 1)
# randint's upper bound is exclusive: ages 18..69, tenure 0..71.
data_large['Age'] = np.random.randint(18, 70, size=n_samples_large)
data_large['Gender'] = np.random.choice(
    ['Male', 'Female'], size=n_samples_large, p=[0.55, 0.45])
data_large['Tenure'] = np.random.randint(0, 72, size=n_samples_large)
data_large['MonthlyCharges'] = np.round(
    np.random.uniform(15, 120, size=n_samples_large), 2)
data_large['TotalCharges'] = np.round(
    np.random.uniform(50, 8000, size=n_samples_large), 2)
data_large['ContractType'] = np.random.choice(
    ['Month-to-month', 'One year', 'Two year'],
    size=n_samples_large, p=[0.55, 0.25, 0.20])
data_large['InternetService'] = np.random.choice(
    ['DSL', 'Fiber optic', 'No'],
    size=n_samples_large, p=[0.25, 0.50, 0.25])
# No `p` given: the four payment methods are drawn uniformly.
data_large['PaymentMethod'] = np.random.choice(
    ['Electronic check', 'Mailed check',
     'Bank transfer (automatic)', 'Credit card (automatic)'],
    size=n_samples_large)
# Baseline target: 30% churners before the segment overrides below.
data_large['Churn'] = np.random.choice(
    [0, 1], size=n_samples_large, p=[0.7, 0.3])

df_large = pd.DataFrame(data_large)

# Segment overrides as (row mask, [P(no churn), P(churn)]) pairs.
# The masks only read feature columns (never 'Churn'), so building them up
# front is safe. They are applied in list order, and a later override
# replaces an earlier one for any customer that falls into both segments.
_churn_overrides = [
    # Short-tenure customers
    (df_large['Tenure'] < 18, [0.45, 0.55]),
    # Expensive month-to-month plans
    ((df_large['MonthlyCharges'] > 90)
     & (df_large['ContractType'] == 'Month-to-month'), [0.35, 0.65]),
    # Fiber optic subscribers
    (df_large['InternetService'] == 'Fiber optic', [0.6, 0.4]),
    # Customers older than 55 churn less often
    (df_large['Age'] > 55, [0.85, 0.15]),
]
for _segment_mask, _class_probs in _churn_overrides:
    # Draw exactly one new label per row selected by the mask.
    df_large.loc[_segment_mask, 'Churn'] = np.random.choice(
        [0, 1], size=int(_segment_mask.sum()), p=_class_probs)


# Sanity report: row count, first rows, and the class balance of the target.
print(f"Generated dataset with {df_large.shape[0]} rows.")
print("Dataset head:", df_large.head(), sep="\n")
# normalize=True reports class shares instead of raw counts.
print("\nChurn Distribution:",
      df_large['Churn'].value_counts(normalize=True),
      sep="\n")


# Data preprocessing: one-hot encode the categorical columns, separate the
# target from the features, and make a stratified train/test split.
# NOTE(review): train_test_split is not imported anywhere in the visible part
# of this file -- presumably `from sklearn.model_selection import
# train_test_split` ran in an earlier cell; confirm before running standalone.
_categorical_cols = ['Gender', 'ContractType', 'InternetService', 'PaymentMethod']

# CustomerID is only an identifier, so it is removed before encoding.
_without_id = df_large.drop(columns=['CustomerID'])
# drop_first=True keeps k-1 dummies per feature (the first level becomes the
# implicit baseline), which avoids perfectly collinear dummy columns.
df_large_encoded = pd.get_dummies(
    _without_id,
    columns=_categorical_cols,
    drop_first=True,
)

# Target (y) and feature matrix (X).
y_large = df_large_encoded['Churn']
X_large = df_large_encoded.drop(columns=['Churn'])

# Hold out 25% for testing; stratify on the target so both parts keep the
# same churn ratio.
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
    X_large,
    y_large,
    test_size=0.25,
    random_state=42,
    stratify=y_large,
)

print(f"\nTraining set size: {X_train_large.shape[0]}")
print(f"Testing set size: {X_test_large.shape[0]}")

# Model training: fit a Random Forest classifier on the training split.
# NOTE(review): RandomForestClassifier is not imported in the visible part of
# this file -- presumably `from sklearn.ensemble import RandomForestClassifier`
# ran in an earlier cell; confirm before running standalone.
rf_model_large = RandomForestClassifier(
    n_estimators=200,         # more trees than the default; costs training time
    random_state=42,          # reproducible forest
    class_weight='balanced',  # 'balanced_subsample' is an alternative to try on much larger data
    n_jobs=-1,                # use all available cores
)

print("\nTraining Random Forest model...")
rf_model_large.fit(X_train_large, y_train_large)
print("Model training complete.")

# Model evaluation on the held-out test split.
# NOTE(review): accuracy_score, classification_report, confusion_matrix and
# roc_auc_score are not imported in the visible part of this file --
# presumably they come from sklearn.metrics via an earlier cell; confirm.
print("\nEvaluating model performance...")

# Hard class predictions, plus the second predict_proba column (used below as
# the churn probability for ROC AUC).
y_pred_large = rf_model_large.predict(X_test_large)
y_proba_large = rf_model_large.predict_proba(X_test_large)[:, 1]

# Label-based metrics use the hard predictions; ROC AUC uses the scores.
accuracy_large = accuracy_score(y_test_large, y_pred_large)
report_large = classification_report(y_test_large, y_pred_large)
confusion_large = confusion_matrix(y_test_large, y_pred_large)
roc_auc_large = roc_auc_score(y_test_large, y_proba_large)

print("Model Evaluation on Large Dataset:")
print(f"Accuracy: {accuracy_large:.4f}")
print("\nClassification Report:", report_large, sep="\n")
print("\nConfusion Matrix:", confusion_large, sep="\n")
print(f"\nROC AUC Score: {roc_auc_large:.4f}")

# Feature importance: pair each training column with the importance the fitted
# forest reports for it, ranked from most to least important.
feature_names_large = X_large.columns
importances_large = rf_model_large.feature_importances_
feature_importance_df_large = (
    pd.DataFrame({'feature': feature_names_large,
                  'importance': importances_large})
    .sort_values(by='importance', ascending=False)
)

print("\nFeature Importance (Top 10):")
print(feature_importance_df_large.head(10))

# Making predictions on new data (example using the trained large model).
# A single new customer, described with the same raw columns as the training
# data (minus CustomerID and the Churn target).
new_customer_data_example = pd.DataFrame({
    'Age': [28],
    'Gender': ['Male'],
    'Tenure': [8],
    'MonthlyCharges': [105.00],
    'TotalCharges': [840.00],
    'ContractType': ['Month-to-month'],
    'InternetService': ['Fiber optic'],
    'PaymentMethod': ['Electronic check']
})

# One-hot encode WITHOUT drop_first. On a one-row frame every categorical
# feature has exactly one level, so drop_first=True would delete all dummy
# columns and the customer would silently be scored as the baseline category
# of every feature (e.g. 'Male' would be treated as 'Female'). Keeping all
# levels here preserves the customer's actual categories.
new_customer_encoded_example = pd.get_dummies(
    new_customer_data_example,
    columns=['Gender', 'ContractType', 'InternetService', 'PaymentMethod'],
)

# Align with the training schema in one step:
#   * dummy columns the model saw but this row lacks are added as 0,
#   * dummies for the baseline levels that training dropped are discarded,
#   * column order is made identical to the training data.
# NOTE: a category never seen in training has no matching column, so it is
# discarded here too and the row is scored as the baseline level.
new_customer_encoded_example = new_customer_encoded_example.reindex(
    columns=X_train_large.columns, fill_value=0)

# Predict the class and the churn probability (second predict_proba column).
new_customer_prediction_large = rf_model_large.predict(new_customer_encoded_example)
new_customer_churn_proba_large = rf_model_large.predict_proba(new_customer_encoded_example)[:, 1]

print(f"\nPrediction for example new customer: {'Churn' if new_customer_prediction_large[0] == 1 else 'No Churn'}")
print(f"Probability of Churn: {new_customer_churn_proba_large[0]:.4f}")



Generated dataset with 10000 rows.
Dataset head:
   CustomerID  Age  Gender  Tenure  MonthlyCharges  TotalCharges  \
0           1   56  Female       3           16.16       1005.13   
1           2   69    Male       1           64.93       5252.00   
2           3   46    Male       0           50.29       3918.32   
3           4   32  Female      64           85.86       6078.74   
4           5   60    Male      45           82.03       3434.14   

     ContractType InternetService              PaymentMethod  Churn  
0        Two year              No           Electronic check      0  
1        Two year     Fiber optic  Bank transfer (automatic)      0  
2  Month-to-month     Fiber optic  Bank transfer (automatic)      0  
3  Month-to-month             DSL           Electronic check      1  
4        Two year             DSL    Credit card (automatic)      0  

Churn Distribution:
Churn
0    0.6653
1    0.3347
Name: proportion, dtype: float64

Training set size: 7500
Testing set s