### 1. Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import randint, uniform


### 2. Loading and Preprocessing Data

In [None]:
# Read the training set and discard the CustomerId and Surname columns,
# which are not used as model inputs.
train_data = pd.read_csv("train.csv").drop(columns=['CustomerId', 'Surname'])

# Separate the label column ('Exited') from everything else.  train_data
# itself keeps the label so it can still be inspected as a whole.
y_train = train_data['Exited']
X_train = train_data.drop(columns=['Exited'])


In [None]:
# Preview the first two rows of the training frame (label column included).
train_data.head(2)

### 3. Preprocessing Steps

In [None]:
# Columns routed to each branch of the preprocessor.  Columns not listed in
# either group are discarded, since ColumnTransformer defaults to
# remainder='drop'.
numeric_features = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary',
]
categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# Numeric branch: standardise each column to zero mean and unit variance.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode.  handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


### 4. Hyperparameter Tuning for RandomForest

In [None]:
# Search space for the random forest.  The 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'.
# scipy's randint(low, high) excludes `high`: n_estimators is drawn from
# 50..199 and min_samples_split from 2..19.
param_dist_rf = dict(
    classifier__n_estimators=randint(50, 200),
    classifier__max_depth=[None, 10, 20, 30, 40, 50],
    classifier__min_samples_split=randint(2, 20),
)

# Preprocessing and classifier are tuned together as a single estimator, so
# the preprocessing steps are re-fit inside every cross-validation fold.
rf_model = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# Randomised search: 10 sampled candidates, each scored by 5-fold
# cross-validated accuracy; random_state fixes the sampled candidates.
rf_random = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
rf_random.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning pipeline
# already retrained on all of X_train.
best_rf_model = rf_random.best_estimator_


### 5. Hyperparameter Tuning for XGBoost

In [None]:
# Search space for XGBoost.  The 'classifier__' prefix routes each parameter
# to the pipeline step named 'classifier'.
# scipy's randint(low, high) excludes `high`, so n_estimators is 50..199.
# scipy's uniform(loc, scale) spans [loc, loc + scale], so learning_rate is
# drawn from [0.01, 0.21] -- not [0.01, 0.2].
param_dist_xgb = dict(
    classifier__n_estimators=randint(50, 200),
    classifier__max_depth=[3, 5, 7, 9],
    classifier__learning_rate=uniform(0.01, 0.2),
)

# Preprocessing and classifier are tuned together as a single estimator, so
# the preprocessing steps are re-fit inside every cross-validation fold.
xgb_model = XGBClassifier(random_state=42)
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model),
])

# Randomised search: 10 sampled candidates, each scored by 5-fold
# cross-validated accuracy; random_state fixes the sampled candidates.
# NOTE(review): the submission cell writes predict_proba output rather than
# hard labels; if the target metric is probability/ranking based (e.g. ROC
# AUC), scoring='roc_auc' may be the better selection criterion -- confirm.
xgb_random = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_dist_xgb,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
xgb_random.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning pipeline
# already retrained on all of X_train.
best_xgb_model = xgb_random.best_estimator_


### 6. Evaluating Models

In [None]:
# Tuned pipelines to compare, paired with the label used in the printout.
models = [('RandomForest', best_rf_model), ('XGBoost', best_xgb_model)]

# Report the mean and spread of 5-fold cross-validated accuracy per model.
# NOTE(review): this scores on the same training rows that were used to pick
# the hyperparameters, so the figures are likely optimistic.
for name, model in models:
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)
    mean_accuracy, accuracy_std = scores.mean(), scores.std()
    print(f'{name} - Accuracy: {mean_accuracy} (Std: {accuracy_std})')


In [None]:
import joblib

# The tuned XGBoost pipeline is the model that gets persisted.
# `best_model` exposes just its classifier step for inspection.
best_model = best_xgb_model['classifier']

# best_xgb_model is RandomizedSearchCV.best_estimator_.  With the default
# refit=True it has already been retrained on all of X_train using the best
# hyperparameters, so it is a complete, fitted preprocessor + classifier
# pipeline.  Rebuilding a pipeline around the extracted classifier and
# calling fit() again repeated that training and refit, in place, the
# classifier object that best_xgb_model still references.  Persist the
# fitted pipeline directly instead.
full_pipeline = best_xgb_model

# Save the trained pipeline (preprocessing included) so inference can load
# it and call predict/predict_proba on raw feature columns.
joblib.dump(full_pipeline, 'trained_model.joblib')

In [None]:
import pandas as pd
import joblib


# Score the test file with the persisted pipeline and write the result CSV.
test_data = pd.read_csv("test.csv")

# Keep the row ids before they are dropped; they key the output file.
test_ids = test_data['id']

# Remove the columns that are not model inputs.  X_test and test_data refer
# to the same trimmed frame.
X_test = test_data = test_data.drop(columns=['CustomerId', 'Surname', 'id'])

# Reload the pipeline from disk so this cell does not depend on the training
# cells' in-memory state.
loaded_model = joblib.load('trained_model.joblib')

# predict_proba returns one column per class; take the column at index 1
# (presumably the probability of Exited == 1 -- confirm via classes_).
test_probabilities = loaded_model.predict_proba(X_test)[:, 1]

test_result_df = pd.DataFrame({
    'id': test_ids,
    'Exited': test_probabilities,
})
test_result_df.to_csv('sample_submission.csv', index=False)