In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load datasets
train_data = pd.read_csv('/kaggle/input/fds-assignment/trainfile.csv')
test_data = pd.read_csv('/kaggle/input/fds-assignment/testfile.csv')

# Handle missing values.
# The per-column modes are learned from the training data only and reused for
# the test data, so both frames are imputed with the same values (previously the
# test set was filled with its own modes). Keys of `train_modes` that are not
# columns of `test_data` (e.g. 'Exited') are ignored by fillna.
# Reassignment replaces inplace=True so the step reads as an explicit transform.
train_modes = train_data.mode().iloc[0]
train_data = train_data.fillna(train_modes)
test_data = test_data.fillna(train_modes)

# Encode categorical variables.
# Each column gets its own encoder, fitted on the training values and applied
# unchanged to the test values. The fitted encoders are kept in
# `label_encoders` so every column's integer mapping stays available;
# `label_encoder` still ends up bound to the encoder of the last column.
# NOTE(review): transform() raises ValueError for a test category that never
# appears in the training column.
categorical_columns = ['Geography', 'Gender', 'Occupation']
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])
    label_encoders[col] = label_encoder

# Scale numerical features (statistics fitted on train, reused for test).
# NOTE(review): the scaler is fitted on every training row before the
# train/validation split below, so the validation rows contribute to the
# scaling statistics.
scaler = StandardScaler()
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Prepare features and target: the identifier columns are dropped from both
# frames, and the 'Exited' target is additionally removed from the features.
X = train_data.drop(columns=['row ID', 'CustomerId', 'Surname', 'Exited'])
y = train_data['Exited']
X_test = test_data.drop(columns=['row ID', 'CustomerId', 'Surname'])

# Train-test split: 20% of the training rows are held out for validation,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


  train_data = pd.read_csv('/kaggle/input/fds-assignment/trainfile.csv')
  test_data = pd.read_csv('/kaggle/input/fds-assignment/testfile.csv')
  train_data.fillna(train_data.mode().iloc[0], inplace=True)
  test_data.fillna(test_data.mode().iloc[0], inplace=True)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Random forest hyper-parameter sweep, iterations 1-5.
# Every iteration performs the same steps: fit on the training split, print the
# validation accuracy and classification report, predict the test set and write
# 'randomforest_submission<N>.csv'. Only the hyper-parameters differ, so they
# are listed once here and a single loop does the work. random_state=42 is
# applied to every model.
rf_configs_1_to_5 = [
    (1, {'n_estimators': 100, 'max_depth': 6}),
    (2, {'n_estimators': 150, 'max_depth': 8, 'max_features': 'sqrt'}),
    (3, {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5}),
    (4, {'n_estimators': 120, 'max_depth': 7, 'min_samples_leaf': 4}),
    (5, {'n_estimators': 300, 'max_depth': None, 'max_features': None}),
]

for iteration, rf_params in rf_configs_1_to_5:
    rf_model = RandomForestClassifier(random_state=42, **rf_params)
    rf_model.fit(X_train, y_train)

    # Validation metrics for this configuration.
    y_val_pred = rf_model.predict(X_val)
    print(f"Iteration {iteration} - Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(classification_report(y_val, y_val_pred))

    # One submission file per iteration, keyed by the test set's 'row ID'.
    test_predictions = rf_model.predict(X_test)
    submission = pd.DataFrame({'row ID': test_data['row ID'], 'Exited': test_predictions})
    submission.to_csv(f'randomforest_submission{iteration}.csv', index=False)


Iteration 1 - Validation Accuracy: 0.8568221164250163
              precision    recall  f1-score   support

           0       0.86      0.97      0.91     29189
           1       0.80      0.43      0.56      7779

    accuracy                           0.86     36968
   macro avg       0.83      0.70      0.74     36968
weighted avg       0.85      0.86      0.84     36968

Iteration 2 - Validation Accuracy: 0.8636929236096084
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     29189
           1       0.78      0.49      0.60      7779

    accuracy                           0.86     36968
   macro avg       0.83      0.73      0.76     36968
weighted avg       0.86      0.86      0.85     36968

Iteration 3 - Validation Accuracy: 0.8678045877515689
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     29189
           1       0.78      0.52      0.62      7779

    accuracy        

In [9]:
# Random forest hyper-parameter sweep, iterations 6-15.
# Every iteration performs the same steps: fit on the training split, print the
# validation accuracy and classification report, predict the test set and write
# 'randomforest_submission<N>.csv'. Only the hyper-parameters differ, so they
# are listed once here and a single loop does the work. random_state=42 is
# applied to every model.
rf_configs_6_to_15 = [
    (6, {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 4, 'max_features': 'sqrt'}),
    (7, {'n_estimators': 250, 'max_depth': 8, 'min_samples_split': 3, 'max_features': 'log2'}),
    (8, {'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 5, 'max_features': None}),
    (9, {'n_estimators': 150, 'max_depth': 6, 'min_samples_leaf': 2}),
    (10, {'n_estimators': 400, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2}),
    (11, {'n_estimators': 100, 'max_depth': 12, 'min_samples_leaf': 3, 'max_features': 'log2'}),
    (12, {'n_estimators': 300, 'max_depth': None, 'min_samples_split': 3, 'max_features': None}),
    (13, {'n_estimators': 200, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 4}),
    (14, {'n_estimators': 350, 'max_depth': 10, 'min_samples_split': 5, 'max_features': 'log2'}),
    (15, {'n_estimators': 400, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': None}),
]

for iteration, rf_params in rf_configs_6_to_15:
    rf_model = RandomForestClassifier(random_state=42, **rf_params)
    rf_model.fit(X_train, y_train)

    # Validation metrics for this configuration.
    y_val_pred = rf_model.predict(X_val)
    print(f"Iteration {iteration} - Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(classification_report(y_val, y_val_pred))

    # One submission file per iteration, keyed by the test set's 'row ID'.
    test_predictions = rf_model.predict(X_test)
    submission = pd.DataFrame({'row ID': test_data['row ID'], 'Exited': test_predictions})
    submission.to_csv(f'randomforest_submission{iteration}.csv', index=False)

Iteration 6 - Validation Accuracy: 0.8676422852196495
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     29189
           1       0.78      0.52      0.62      7779

    accuracy                           0.87     36968
   macro avg       0.83      0.74      0.77     36968
weighted avg       0.86      0.87      0.86     36968

Iteration 7 - Validation Accuracy: 0.8637199740315948
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     29189
           1       0.78      0.49      0.60      7779

    accuracy                           0.86     36968
   macro avg       0.83      0.73      0.76     36968
weighted avg       0.86      0.86      0.85     36968

Iteration 8 - Validation Accuracy: 0.8735392772127245
              precision    recall  f1-score   support

           0       0.90      0.95      0.92     29189
           1       0.76      0.58      0.66      7779

    accuracy        

In [10]:
# Random forest hyper-parameter sweep, iterations 31-45.
# Every iteration performs the same steps: fit on the training split, print the
# validation accuracy and classification report, predict the test set and write
# 'randomforest_submission<N>.csv'. Only the hyper-parameters differ, so they
# are listed once here and a single loop does the work. random_state=42 is
# applied to every model.
rf_configs_31_to_45 = [
    (31, {'n_estimators': 300, 'max_depth': 7, 'min_samples_split': 4, 'max_features': 'sqrt'}),
    (32, {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 3, 'max_features': 'log2'}),
    (33, {'n_estimators': 250, 'max_depth': 8, 'min_samples_split': 5, 'max_features': None}),
    # Same hyper-parameters as Iteration 9; kept so submission34 is still written.
    (34, {'n_estimators': 150, 'max_depth': 6, 'min_samples_leaf': 2}),
    (35, {'n_estimators': 350, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2}),
    (36, {'n_estimators': 120, 'max_depth': 9, 'min_samples_leaf': 3, 'max_features': 'log2'}),
    (37, {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 3, 'max_features': None}),
    # Same hyper-parameters as Iteration 13; kept so submission38 is still written.
    (38, {'n_estimators': 200, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 4}),
    (39, {'n_estimators': 350, 'max_depth': 7, 'min_samples_split': 5, 'max_features': 'log2'}),
    # Same hyper-parameters as Iteration 15; kept so submission40 is still written.
    (40, {'n_estimators': 400, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': None}),
    (41, {'n_estimators': 200, 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 3}),
    (42, {'n_estimators': 350, 'max_depth': 8, 'min_samples_split': 3, 'max_features': 'sqrt'}),
    (43, {'n_estimators': 300, 'max_depth': 7, 'min_samples_leaf': 4, 'max_features': None}),
    (44, {'n_estimators': 400, 'max_depth': 9, 'max_features': 'log2', 'min_samples_split': 4}),
    (45, {'n_estimators': 450, 'max_depth': None, 'min_samples_split': 2, 'max_features': None}),
]

for iteration, rf_params in rf_configs_31_to_45:
    rf_model = RandomForestClassifier(random_state=42, **rf_params)
    rf_model.fit(X_train, y_train)

    # Validation metrics for this configuration.
    y_val_pred = rf_model.predict(X_val)
    print(f"Iteration {iteration} - Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(classification_report(y_val, y_val_pred))

    # One submission file per iteration, keyed by the test set's 'row ID'.
    test_predictions = rf_model.predict(X_test)
    submission = pd.DataFrame({'row ID': test_data['row ID'], 'Exited': test_predictions})
    submission.to_csv(f'randomforest_submission{iteration}.csv', index=False)



Iteration 31 - Validation Accuracy: 0.8602304695953257
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     29189
           1       0.79      0.45      0.58      7779

    accuracy                           0.86     36968
   macro avg       0.83      0.71      0.75     36968
weighted avg       0.85      0.86      0.85     36968

Iteration 32 - Validation Accuracy: 0.8673717809997836
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     29189
           1       0.78      0.52      0.62      7779

    accuracy                           0.87     36968
   macro avg       0.83      0.74      0.77     36968
weighted avg       0.86      0.87      0.86     36968

Iteration 33 - Validation Accuracy: 0.8687243020991128
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     29189
           1       0.75      0.57      0.64      7779

    accuracy     