In [1]:
import pickle  # Importing pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
# Load the dataset
dataset_path = "C:/Users/dell/Downloads/Checkpoint2/Financial_inclusion_dataset.csv"  # Adjust the path to match your dataset location
try:
    df = pd.read_csv(dataset_path)
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()
# Drop unnecessary columns

# Encode categorical features
categorical_columns = [
    "country",
    "location_type",  # Removed 'bank_account' from this list
    "cellphone_access",
    "gender_of_respondent",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
]
target_column = "bank_account"  # Assuming this is the target variable

# Encoding the target variable
le = LabelEncoder()
df[target_column] = le.fit_transform(df[target_column])

# Define feature matrix (X) and target vector (y)
X = df.drop(columns=[target_column, 'uniqueid'])  # Dropping 'uniqueid' here
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",  # Pass through numerical columns
)

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Set up a pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", rf)])

# Set up GridSearchCV parameters
param_grid = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [10, 20, None],
    "classifier__min_samples_split": [2, 5, 10],
}
grid_search = GridSearchCV(
    pipeline, param_grid, cv=3, scoring="accuracy", verbose=3, n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Evaluate the model
y_pred = grid_search.best_estimator_.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the model for Streamlit using pickle
with open("rf_model.pkl", "wb") as file:
    pickle.dump(grid_search.best_estimator_, file)
print("Model saved as rf_model.pkl")


Dataset loaded successfully!
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Accuracy: 0.8913921360255048
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      4063
           1       0.75      0.30      0.43       642

    accuracy                           0.89      4705
   macro avg       0.83      0.64      0.69      4705
weighted avg       0.88      0.89      0.87      4705

Model saved as rf_model.pkl
