In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Read the training and test tables from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Step 2: Basic preprocessing
# Model inputs are every training column except the 'Default' target and 'LoanID'
X = train_df.drop(['Default', 'LoanID'], axis=1)
# 'Default' is the column the model learns to predict
y = train_df['Default']

# Keep the test-set LoanID values aside; they are written to the submission
# file later and are removed from the test features here
test_df_loanID = test_df['LoanID']
X_test = test_df.drop('LoanID', axis=1)

# Handle categorical variables (if any) using Label Encoding.
# Each encoder is fitted on the training column only. LabelEncoder.transform
# raises ValueError for a label it did not see during fit, so the test column
# is mapped through the fitted classes instead and any category missing from
# the training data is encoded as -1 rather than aborting the run.
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].astype(str))
    # classes_ holds the sorted training labels; a label's position in it is
    # exactly the code transform() would return, so known labels encode the
    # same way as before.
    code_by_label = {str(label): code for code, label in enumerate(le.classes_)}
    test_codes = X_test[column].astype(str).map(code_by_label)
    # map() leaves NaN where the label is unknown; replace it with the -1 sentinel
    X_test[column] = test_codes.fillna(-1).astype('int64')
    label_encoders[column] = le  # kept so the encoding can be inspected or reused

# Split train data for training (80%) and validation (20%).
# stratify=y keeps the proportion of each 'Default' class the same in both
# parts, so the validation accuracy is measured on a class mix that matches
# the training data (GridSearchCV below already uses stratified folds for a
# classifier). random_state fixes the shuffle for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Tune a random forest with an exhaustive grid search
# 3 x 3 x 3 = 27 hyperparameter combinations are tried
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)
rf_model = RandomForestClassifier(random_state=42)

# Every combination is scored by 5-fold cross-validated accuracy on the
# training split only; the validation split is not touched here
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available processors
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# Step 4: Check the winning model against the held-out validation rows
best_model = grid_search_rf.best_estimator_
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Step 5: Produce class predictions for the test features
predictions1 = best_model.predict(X_test)

# Step 6: Write the submission file
# Two columns: the LoanID values saved from the test set and the predicted
# 'Default' value for each of them
submission_df = pd.DataFrame(
    data={'LoanID': test_df_loanID, 'Default': predictions1},
)

# index=False keeps the DataFrame row index out of the CSV
submission_df.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file created as 'submission.csv'")
