<a href="https://colab.research.google.com/github/Manikantaamanchi424/Infosys-Intern-Project/blob/main/Implimentation_of_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Implementation of Cross-Validation & Hyperparameter Tuning

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Read the preprocessed earthquake table (path is the Colab working directory).
DATA_PATH = '/content/preprocessed_earthquake_data.csv'
data = pd.read_csv(DATA_PATH)

# Column to predict, plus the raw categorical columns that are excluded
# from the feature matrix.
target = 'Status_Reviewed'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# NOTE(review): the target name suggests it was derived from 'Status'; if other
# 'Status_*' indicator columns remain in X they would leak the label -- confirm
# against data.columns.
excluded_cols = [target, *categorical_cols]
X = data.drop(columns=excluded_cols)  # features: everything except target + categoricals
y = data[target]                      # label column as a Series


In [None]:
# Baseline classifiers to compare, keyed by display name.
# Both are seeded so repeated runs give the same cross-validation scores.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    # max_iter=500 matches the LogisticRegression used in the grid-search cell,
    # so the baseline and tuned scores are obtained under the same iteration cap
    # (the library default is lower and may stop before the solver converges).
    'LogisticRegression': LogisticRegression(max_iter=500, random_state=42)
}

In [None]:
# Evaluate every baseline model with 10-fold cross-validated accuracy and
# report the per-fold scores together with their mean and variance.
for name, model in models.items():
    print(f"\nCross-validation for {name}:")
    scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=10)

    # Summary statistics over the ten fold scores.
    mean_accuracy = scores.mean()
    accuracy_variance = scores.var()

    print(
        f"Accuracy Scores for each fold: {scores}",
        f"Mean Accuracy: {mean_accuracy:.4f}",
        f"Accuracy Variance: {accuracy_variance:.6f}",
        sep="\n",
    )



Cross-validation for RandomForest:
Accuracy Scores for each fold: [0.9987185  1.         1.         1.         1.         1.
 1.         1.         1.         0.97350427]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000063

Cross-validation for LogisticRegression:
Accuracy Scores for each fold: [0.99700982 1.         1.         1.         1.         1.
 1.         1.         1.         0.97521368]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000055


In [None]:
# Hyperparameter tuning and model selection via exhaustive grid search.

from sklearn.model_selection import GridSearchCV

# Random Forest search space: ensemble size, tree depth, split threshold.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Logistic Regression search space: regularisation strength and penalty type.
# 'liblinear' is used because it supports both the l1 and l2 penalties.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# (display name, seeded estimator, parameter grid) for each model to tune.
_search_specs = [
    ('RandomForest', RandomForestClassifier(random_state=42), param_grid_rf),
    ('LogisticRegression', LogisticRegression(max_iter=500, random_state=42), param_grid_lr),
]

# One accuracy-scored, 5-fold grid search per model. With an integer cv and a
# classifier, scikit-learn uses stratified folds.
grid_searches = {
    label: GridSearchCV(estimator, grid, cv=5, scoring='accuracy')
    for label, estimator, grid in _search_specs
}

# Run every search on the full feature matrix and report the winning
# parameter combination with its mean cross-validated accuracy.
for name, gs in grid_searches.items():
    gs.fit(X, y)
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {gs.best_score_:.4f}")



Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy for RandomForest: 0.9974

Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.9976


In [None]:
## XGBRegressor vs Linear Regression

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor



# Define features and target
# NOTE(review): 'Status_Reviewed' looks like a 0/1 indicator; MAE/MSE/R² from
# regressors on a binary label are hard to interpret -- confirm that regression
# (rather than classification) is really intended here.
target = 'Status_Reviewed'  # Ensure numeric for regression
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']
X = data.drop(columns=[target] + categorical_cols)
y = data[target]

# Split dataset into train and test (random 80/20 hold-out, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'XGBRegressor': XGBRegressor(random_state=42),
    'LinearRegression': LinearRegression()
}

# Train each model on the training split and evaluate on the hold-out split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name} Evaluation:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

# Optional: cross-validation R²
# An integer `cv` with a regressor yields consecutive, unshuffled folds. The
# previously recorded run showed fold R² values of -8.47 and 0.0 while the
# random hold-out scored 1.0, which is consistent with row-order-dependent
# folds whose target hardly varies (R² is degenerate there). Shuffled, seeded
# folds make the CV estimate comparable to the random hold-out split above.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=42)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')
    print(f"\n{name} 10-fold CV R²: {scores}")
    print(f"Mean R²: {np.mean(scores):.4f}, Variance: {np.var(scores):.6f}")



XGBRegressor Evaluation:
MAE: 0.0000
MSE: 0.0000
R² Score: 1.0000

LinearRegression Evaluation:
MAE: 0.0003
MSE: 0.0000
R² Score: 1.0000

XGBRegressor 10-fold CV R²: [ 0.02811017 -8.47108451  0.99999996  0.99999977  0.99999999  0.99999999
  0.99999865  0.99999989  0.99999987  0.        ]
Mean R²: -0.1443, Variance: 7.855184

LinearRegression 10-fold CV R²: [0.899571   0.99999807 0.99999406 0.99998323 0.99999177 0.99998634
 0.99589354 0.95998074 0.97207986 0.        ]
Mean R²: 0.8827, Variance: 0.087500
