<a href="https://colab.research.google.com/github/Kpk48/Game_CCP/blob/main/SMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install imblearn
!pip install joblib
!pip install xgboost
!pip install matplotlib
!pip install seaborn


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (318.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.5 xgboost-3.0.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load training and test data
df_train = pd.read_csv('/customer_churn_dataset-training-master.csv')
df_test = pd.read_csv('/customer_churn_dataset-testing-master.csv')

In [11]:


# Drop rows with missing target in training set
df_train = df_train.dropna(subset=['Churn']).reset_index(drop=True)

# Separate features and target
y_train = df_train['Churn']
X_train = df_train.drop(columns=['Churn'])
X_test = df_test.copy()

# Optionally drop identifier column if present
if 'customerID' in X_train.columns:
    X_train = X_train.drop(columns=['customerID'])
    X_test = X_test.drop(columns=['customerID'])

# Identify categorical and numerical columns
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# Define preprocessing pipeline
target_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Define model candidates and parameter grids
models = {
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [None, 10, 20]
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [3, 6]
    }),
    'LogisticRegression': (LogisticRegression(max_iter=1000, random_state=42), {
        'clf__C': [0.01, 0.1, 1, 10]
    })
}

best_score = 0
best_model = None

# Split train into train/validation for evaluation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

for name, (estimator, params) in models.items():
    pipe = Pipeline([
        ('pre', target_preprocessor),
        ('clf', estimator)
    ])
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_tr, y_tr)
    val_preds = grid.predict(X_val)
    score = accuracy_score(y_val, val_preds)
    print(f"{name} validation accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_model = grid.best_estimator_

print(f"\nBest model: {best_model.named_steps['clf'].__class__.__name__} with accuracy {best_score:.4f}")

# Retrain best model on full training data
best_model.fit(X_train, y_train)

# Predict on test set and save results
test_preds = best_model.predict(X_test)
# If we dropped customerID, reload it from test file
customer_ids = df_test['customerID'] if 'customerID' in df_test.columns else pd.Series(range(len(test_preds)))
output = pd.DataFrame({'customerID': customer_ids, 'Churn': test_preds})
output.to_csv('churn_predictions.csv', index=False)
print("Saved predictions to 'churn_predictions.csv'")


RandomForest validation accuracy: 0.9997


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost validation accuracy: 0.9999
LogisticRegression validation accuracy: 0.9808

Best model: XGBClassifier with accuracy 0.9999


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Saved predictions to 'churn_predictions.csv'
