In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install skorch

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# General libraries
import random
import time
import string
from datetime import datetime
import itertools

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from scipy.sparse import hstack, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, Ridge, LogisticRegression
from sklearn.metrics import (confusion_matrix, roc_curve, auc, accuracy_score, 
                             f1_score, precision_score, recall_score, classification_report,
                             mean_squared_error, r2_score, mean_absolute_error)
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, StandardScaler, Normalizer
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer

# Specialized machine learning libraries
import xgboost as xgb
import lightgbm as lgb
from mlxtend.regressor import StackingRegressor, StackingCVRegressor

# Imbalanced learning
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE

# Deep learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from skorch import NeuralNetClassifier

# Saving and loading models
import joblib
import pickle

# Natural language processing
import nltk

# Set plotting to inline mode
%matplotlib inline

In [None]:
# Read the feature tables and the label tables from the Kaggle input mount.
train_values = pd.read_csv('/kaggle/input/clean-data/Clean Data/train_values.csv')
test_values = pd.read_csv('/kaggle/input/clean-data/Clean Data/test_values.csv')
train_labels = pd.read_csv('/kaggle/input/clean-data/Clean Data/train_labels.csv')
test_labels = pd.read_csv('/kaggle/input/clean-data/Clean Data/test_labels.csv')

# Attach each split's 'Labels' column to its feature table (aligned on the row index).
train_values = train_values.assign(Labels=train_labels['Labels'])
test_values = test_values.assign(Labels=test_labels['Labels'])

# Stack both splits into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat((train_values, test_values), ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame.
df.head()

In [None]:
# List the column names available in the combined frame.
df.columns

In [None]:
# Side-by-side histograms of the floor count before and after the earthquake.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column, bar colour, panel title) for the left and right panel respectively.
panels = [
    ('count_floors_pre_eq', 'blue', 'Floors Count Distribution Pre-Earthquake'),
    ('count_floors_post_eq', 'red', 'Floors Count Distribution Post-Earthquake'),
]

for ax, (column, colour, heading) in zip(axes, panels):
    ax.hist(df[column], bins=10, color=colour, alpha=0.7)
    ax.set_title(heading)
    ax.set_xlabel('Number of Floors')
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal grid lines only

fig.tight_layout()
plt.show()

In [None]:
# Separate features and target variable.
# The target is the post-earthquake floor count: every model below is a
# classifier saved under a 'count_floors_*' file name, and the DNN has one
# output per floor count. Both post-earthquake measurements are removed from
# the features so neither can leak into the inputs.
TARGET_COLUMN = 'count_floors_post_eq'
X = df.drop(['height_ft_post_eq', 'count_floors_post_eq'], axis=1)
y = df[TARGET_COLUMN]

# Split the data into train and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Data in Train dataset:', len(X_train))
print('Data in Test dataset:', len(X_test))

## Preprocessing

In [None]:
# Column groups, selected from the feature frame by dtype.
num_feat = X.select_dtypes(include=['int64', 'float64']).columns
cat_feat = X.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the sentinel -999, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the token 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_feat),
        ('cat', categorical_transformer, cat_feat),
    ]
)

## Logistic Regression

In [None]:
# Logistic-regression pipeline: preprocessing -> SMOTE oversampling -> classifier.
logreg_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # Handle class imbalance
    ('classifier', LogisticRegression(random_state=42)),
]
pipeline = imblearnPipeline(steps=logreg_steps)

# Train on the training split.
pipeline.fit(X_train, y_train)

# Predictions on the held-out split, used for the micro-averaged F1.
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred, average='micro')

# Score reported by the pipeline on each split.
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print("Train score: %.3f" % train_score)
print("Test score: %.3f" % test_score)
print("F1 Score: %.3f" % f1)

In [None]:
def print_classification_report(y_true, y_pred):
    """
    Print classification metrics and plot the confusion matrix.

    Prints scikit-learn's per-class classification report, then accuracy,
    weighted precision/recall/F1 and micro F1, then the confusion matrix,
    and finally draws the matrix as a seaborn heatmap.

    Parameters:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.

    Returns:
        None. All results are printed or plotted.
    """
    # Sorted union of the labels present in either array. It fixes the
    # row/column order of the confusion matrix and provides the heatmap tick
    # labels, so the axes show real class values rather than matrix positions.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    # Calculate metrics. zero_division=0 scores a class with no predicted (or
    # no true) samples as 0 explicitly instead of relying on a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)  # Micro F1 Score

    # Print classification report from sklearn
    class_report = classification_report(y_true, y_pred, digits=2, zero_division=0)
    print("Detailed Classification Report")
    print(class_report)

    # Print additional metrics
    print("Overall Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score (Weighted): {f1_weighted:.2f}")
    print(f"F1 Score (Micro): {f1_micro:.2f}")  # Display Micro F1 Score

    # Print and plot confusion matrix (rows = true labels, columns = predictions)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print("Confusion Matrix:")
    print(cm)

    # Plotting using seaborn, with ticks labelled by the actual class values
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

## Evaluation

In [None]:
# Evaluating the model: report metrics for the logistic-regression predictions
# (y_pred) against the held-out labels (y_test).
print_classification_report(y_test, y_pred)

In [None]:
# Save the model: the whole fitted pipeline (preprocessor, SMOTE step and
# classifier) is serialised to the working directory.
# NOTE(review): the "84" in the file name looks like a hard-coded score — confirm it matches this run.
model_filename = 'count_floors_logistic_regression_84_model.joblib'
joblib.dump(pipeline, model_filename)

## Random Forest

In [None]:
# Random-forest pipeline: preprocessing -> SMOTE oversampling -> classifier.
forest_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # Handle class imbalance
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
]
pipeline = imblearnPipeline(steps=forest_steps)

# Train on the training split.
pipeline.fit(X_train, y_train)

# Predictions on the held-out split, used for the micro-averaged F1.
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred, average='micro')

# Score reported by the pipeline on each split.
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

# Report the three figures.
print("Train score: %.3f" % train_score)
print("Test score: %.3f" % test_score)
print("F1 Score: %.3f" % f1)

## Evaluation

In [None]:
# Evaluating the model: report metrics for the random-forest predictions
# (y_pred) against the held-out labels (y_test).
print_classification_report(y_test, y_pred)

In [None]:
# Save the model: the whole fitted pipeline (preprocessor, SMOTE step and
# classifier) is serialised to the working directory.
# NOTE(review): the "87" in the file name looks like a hard-coded score — confirm it matches this run.
model_filename = 'count_floors_random_forest_87__model.joblib'
joblib.dump(pipeline, model_filename)

## Deep Neural Network (DNN)

In [None]:
# Split the raw features first so the imputation/scaling/encoding statistics
# are learned from training rows only (fitting on all rows would leak
# information from the test split into the model inputs).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline. A clone is used so the transformer already fitted
# inside the earlier classifier pipeline is not refitted here.
preprocessing_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor))])

# Fit on the training rows, then apply the fitted transform to the test rows
X_train = preprocessing_pipeline.fit_transform(X_train_raw)
X_test = preprocessing_pipeline.transform(X_test_raw)

In [None]:
class DNNClassifier(nn.Module):
    """Feed-forward classifier with two ReLU hidden layers.

    ``forward`` returns unnormalised class scores (logits). This is the input
    that ``nn.CrossEntropyLoss`` expects, because that loss applies
    log-softmax itself; applying softmax inside ``forward`` as well would
    normalise twice and flatten the gradients. Use ``predict_proba`` when
    class probabilities are needed. The arg-max class is the same for logits
    and probabilities.

    Parameters:
        input_size (int): Number of input features.
        hidden_size_1 (int): Width of the first hidden layer.
        hidden_size_2 (int): Width of the second hidden layer.
        output_size (int): Number of classes.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2, output_size):
        super(DNNClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size_1)
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.fc3 = nn.Linear(hidden_size_2, output_size)
        self.relu = nn.ReLU()
        # Only used by predict_proba(); deliberately not applied in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape (batch, output_size) for a (batch, input_size) input."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

    def predict_proba(self, x):
        """Return class probabilities (each row sums to 1) for the given inputs."""
        return self.softmax(self.forward(x))

# Seed torch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# The preprocessor can return a scipy sparse matrix (one-hot columns); tensors
# need a dense array, so densify when a .toarray() method is available.
X_train_dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
X_test_dense = X_test.toarray() if hasattr(X_test, "toarray") else X_test

# Convert them to PyTorch tensors and create DataLoader objects.
# Features are float32; labels are cast to int64 as required by CrossEntropyLoss.
train_dataset = TensorDataset(
    torch.as_tensor(X_train_dense, dtype=torch.float32),
    torch.as_tensor(np.asarray(y_train), dtype=torch.long),
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = TensorDataset(
    torch.as_tensor(X_test_dense, dtype=torch.float32),
    torch.as_tensor(np.asarray(y_test), dtype=torch.long),
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define hyperparameters
input_size = X_train_dense.shape[1]  # Number of input features
hidden_size_1 = 64  # Number of neurons in the first hidden layer
hidden_size_2 = 32  # Number of neurons in the second hidden layer
# Number of classes: 10 (0 to 9 floors) by default, widened when a larger label
# is present so CrossEntropyLoss never receives an out-of-range target.
highest_label = int(max(np.max(np.asarray(y_train)), np.max(np.asarray(y_test))))
output_size = max(10, highest_label + 1)

# Instantiate the DNN model
model = DNNClassifier(input_size, hidden_size_1, hidden_size_2, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses over the epoch
    samples_seen = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size.
        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Report the mean loss over the whole epoch rather than the last batch only.
    epoch_loss = running_loss / max(samples_seen, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}")

## Evaluation

In [None]:
# Evaluate the model on the test set
model.eval()
correct_test = 0
total_test = 0
true_labels = []
predicted_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Predicted class = index of the highest score in each row.
        _, predicted = torch.max(outputs, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()
        true_labels.extend(labels.tolist())
        predicted_labels.extend(predicted.tolist())

testing_accuracy = correct_test / total_test
print(f"Test Accuracy: {testing_accuracy:.4f}")

# Detailed metrics, confusion matrix and heatmap: reuse the shared reporting
# helper instead of repeating its body here.
print_classification_report(true_labels, predicted_labels)