### Assignment-4

**Objective:**

Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries and Load Data

In [1]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np

# ML tools
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


Step 2: Load Dataset and Prepare Features and Target

In [7]:
# Step 2: Load Dataset and Prepare Features and Target

df = pd.read_csv("preprocessed_earthquake_data.csv")

print("First 5 rows:")
print(df.head())

# Drop missing values (optional, depends on dataset)
df = df.dropna()

# Define Features (X) and Target (y)
X = df.drop("Status_Reviewed", axis=1)  
y = df["Status_Reviewed"]

print("Features shape:", X.shape)
print("Target shape:", y.shape)



First 5 rows:
   Latitude  Longitude        Type     Depth  Magnitude Magnitude Type  \
0  0.583377   0.844368  Earthquake  0.495984   0.277668             MW   
1  0.006109   0.698849  Earthquake  0.075272  -0.195082             MW   
2 -0.739162  -1.701962  Earthquake -0.413928   0.750418             MW   
3 -2.017599  -0.503524  Earthquake -0.454694  -0.195082             MW   
4  0.340688   0.691479  Earthquake -0.454694  -0.195082             MW   

   Root Mean Square  Source     Status      Year  ...  Source_ISCGEM  \
0         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
1         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
2         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
3         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
4         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   

   Source_ISCGEMSUP  Source_NC  Source_NN  Source_OFFICIAL  Source_PR  \
0               0.0        0.0     

Step 3: Implement Cross-Validation

In [9]:
from sklearn.preprocessing import LabelEncoder

# Make a copy to avoid modifying original
df_encoded = df.copy()

# Encode categorical columns automatically
label_encoders = {}
for col in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Now define X and y (example: predict Status_Reviewed)
X = df_encoded.drop("Status_Reviewed", axis=1)
y = df_encoded["Status_Reviewed"]

print("Encoded features shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Encoded features shape: (23409, 39)
Target distribution:
 Status_Reviewed
1.0    20770
0.0     2639
Name: count, dtype: int64


In [10]:
# Step 3: Implement Cross-Validation

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

rf = RandomForestClassifier(random_state=42)

cv_scores = cross_val_score(rf, X, y, cv=5)

print("Cross-validation scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))



Cross-validation scores: [1.         1.         1.         1.         0.98782311]
Mean CV Accuracy: 0.9975646229438155


Step 4: Hyperparameter Tuning with GridSearchCV

In [11]:
# Step 4: Hyperparameter Tuning with GridSearchCV

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit on the data
grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best CV Score: 0.9976500747703483


Step 5: Evaluate Best Model on Full Dataset

In [12]:
# Step 5: Evaluate Best Model on Full Dataset

best_model = grid_search.best_estimator_

# Train on the full dataset
best_model.fit(X, y)

# Predict on the same dataset (for demonstration)
y_pred = best_model.predict(X)

print("Accuracy on full dataset:", accuracy_score(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))


Accuracy on full dataset: 1.0

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2639
         1.0       1.00      1.00      1.00     20770

    accuracy                           1.00     23409
   macro avg       1.00      1.00      1.00     23409
weighted avg       1.00      1.00      1.00     23409

