In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib


In [4]:
# Column labels for the CSV, listed in file order.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# data.csv has no header row, so the labels must be passed explicitly.
# Without `names`, pandas consumes the first record as the header: the frame
# loses one row and ends up with data values as its column labels.
df = pd.read_csv('data.csv', names=column_names)

In [6]:
# Read the headerless dataset into a pandas DataFrame.
# Fields are separated by ", ", so skipinitialspace drops the blank that would
# otherwise prefix every string value (e.g. " >50K" instead of ">50K").
# NOTE(review): na_values="?" assumes the file marks missing entries with "?"
# (the UCI Adult convention) — confirm against data.csv; it is a no-op if the
# marker never occurs.
data = pd.read_csv(
    'data.csv',
    names=column_names,
    engine="python",
    skipinitialspace=True,
    na_values="?",
)

# Display the first few rows of the dataset
print(data.head())

# Handle missing values first, so a row with a missing label is dropped rather
# than silently encoded as class 0. Then encode the target: 1 for ">50K",
# 0 otherwise. The label is stripped before comparison so stray padding cannot
# turn every row into class 0.
data = data.dropna().assign(
    income=lambda frame: (frame['income'].astype(str).str.strip() == ">50K").astype(int)
)

# Splitting the data into features (X) and target variable (y).
# Text columns are one-hot encoded because RandomForestClassifier only accepts
# numeric feature matrices.
X = pd.get_dummies(data.drop(columns='income'))
y = data['income']

# Guard against a label-text mismatch collapsing the target to a single class.
assert y.nunique() == 2, "income encoding produced a single class; check the label text"

# Splitting the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display shapes of the split data
print(f"Training data size: {X_train.shape}, Test data size: {X_test.shape}")

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [8]:
# Phase 3: Model Building and Hyperparameter Tuning
# Base random-forest estimator; the fixed seed keeps repeated runs comparable.
rf_model = RandomForestClassifier(
    random_state=42,
)

# Candidate hyperparameter values for the GridSearchCV tuning step
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

In [None]:
# Cross-validated (3-fold) grid search over param_grid, run in parallel
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best hyperparameters:", grid_search.best_params_)

# Take the tuned model from the search. With GridSearchCV's default refit=True
# it has already been refit on the training data, so nothing is retrained here.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test set and print the per-class metrics
y_pred = best_rf_model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

In [None]:
# Phase 4: Finalizing the Model and Making Predictions
# Single location for the serialized model, shared by the dump and load below
model_path = 'random_forest_model.pkl'

# Persist the tuned model to disk
joblib.dump(best_rf_model, model_path)

# Score the test set: predictions, overall accuracy, and confusion matrix
predictions = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Report the metrics
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)

# Read the persisted model back in (if needed later)
loaded_model = joblib.load(model_path)
