In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Raw inputs, read relative to the notebook's directory.
# low_memory=False makes pandas parse each file in a single pass instead of in
# chunks, so column dtypes are inferred consistently over the whole file.
building_damage_assessment = pd.read_csv(
    "../csv_files/csv_building_damage_assessment.csv",
    low_memory=False,
)
name_mapping = pd.read_csv(
    "../csv_files/ward_vdcmun_district_name_mapping.csv",
    low_memory=False,
)


In [4]:
def getNumUniqueValues(df, col):
    """Return how many distinct values the selection ``df[col]`` holds.

    Args:
        df: DataFrame to inspect.
        col: Label used to index ``df``.

    Returns:
        Whatever ``nunique()`` yields for the selection; with pandas' default
        settings missing values are not counted.
    """
    selected = df[col]
    return selected.nunique()

In [5]:
# Compare how many distinct districts each table covers.
# A notebook cell only renders its last expression, so the first count is
# printed explicitly instead of being computed and silently discarded.
print("Unique district_id values in name_mapping:", getNumUniqueValues(name_mapping, 'district_id'))
getNumUniqueValues(building_damage_assessment, 'district_id')

11

In [6]:
# Lookup table: district_id -> district_name.
# If an id occurs on several rows of name_mapping, the dictionary still ends
# up with a single entry for it.
district_names_by_id = name_mapping.set_index('district_id')['district_name']
district_id_name_dict = district_names_by_id.to_dict()

In [7]:
# Every row whose vdcmun_id appears more than once (all occurrences are kept,
# which makes this handy for eyeballing the repeats).
is_repeated_vdcmun = name_mapping.duplicated(subset='vdcmun_id', keep=False)
duplicates = name_mapping[is_repeated_vdcmun]

# Reduce the mapping to one row per vdcmun_id carrying only the two name
# columns. NOTE: this rebinds name_mapping, so from here on it no longer has
# the other columns of the CSV (e.g. district_id).
name_mapping = (
    name_mapping
    .drop_duplicates(subset='vdcmun_id')
    .groupby('vdcmun_id')
    .agg({'vdcmun_name': 'first', 'district_name': 'first'})
    .reset_index()
)

# vdcmun_id -> {'vdcmun_name': ..., 'district_name': ...}
names_by_vdcmun = name_mapping.set_index('vdcmun_id')[['vdcmun_name', 'district_name']]
vdcmun_id_name_dict = names_by_vdcmun.to_dict('index')


In [8]:
# Work on the first 54 columns of the assessment table only.
# .copy() detaches the selection from building_damage_assessment: new columns
# are assigned to main_df_columns afterwards, and doing that on a bare slice
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous which frame
# is actually modified.
main_df_columns = building_damage_assessment.iloc[:, :54].copy()

In [9]:
# Map vdcmun_id to vdcmun_name and district_name.
# The lookups index the dictionary directly, so an id that is absent from
# vdcmun_id_name_dict raises KeyError instead of silently producing NaN.
main_df_columns['vdcmun_name'] = main_df_columns['vdcmun_id'].map(lambda x: vdcmun_id_name_dict[x]['vdcmun_name'])
main_df_columns['district_name'] = main_df_columns['vdcmun_id'].map(lambda x: vdcmun_id_name_dict[x]['district_name'])

# Reorder columns so each name column sits immediately before its id column.
# The name columns are taken out of the list *before* the anchor positions are
# looked up. Computing the anchor index first and popping afterwards shifts
# the anchor whenever the name column already precedes it, which made the
# order flip every time this cell was re-run.
name_columns = ('vdcmun_name', 'district_name')
cols = [c for c in main_df_columns.columns if c not in name_columns]
cols.insert(cols.index('vdcmun_id'), 'vdcmun_name')
cols.insert(cols.index('district_id'), 'district_name')
main_df_columns = main_df_columns[cols]

In [10]:
def name_unique_values(df, col_number):
    """Print the label of a column chosen by position and return its distinct values.

    Args:
        df: DataFrame to inspect.
        col_number: Zero-based position of the column.

    Returns:
        The result of ``unique()`` on that column.
    """
    column_label = df.columns[col_number]
    print("Name of the column: ", column_label)
    column_values = df.iloc[:, col_number]
    return column_values.unique()

In [11]:
# Feature selection: keep every candidate column whose association with the
# target, measured by Cramér's V, exceeds 0.2.
target_column = 'damage_grade'
cramerV_list = []

# The first six columns are the identifier / name columns and are always kept.
# Every later column except the target is a candidate. Selecting candidates by
# name (instead of a hard-coded positional range) keeps the loop valid if the
# frame's width changes and guarantees the target is never tested against
# itself and then appended a second time.
id_columns = list(main_df_columns.columns[:6])
candidate_columns = [
    column for column in main_df_columns.columns[6:] if column != target_column
]
selected_columns = []

for entry_column in candidate_columns:

    # Check for non-NaN and sufficient unique values
    if main_df_columns[entry_column].nunique() < 2:
        print(f"Skipping column {entry_column} due to insufficient unique values.")
        continue

    # Create contingency table
    contingency_table = pd.crosstab(main_df_columns[entry_column], main_df_columns[target_column])

    # Skip if the contingency table is empty
    if contingency_table.size == 0:
        print(f"Skipping column {entry_column} due to an empty contingency table.")
        continue

    # Cramér's V divides by min(rows, cols) - 1; a table with a single row or
    # a single column would make that zero, so such columns are skipped.
    smaller_dimension = min(contingency_table.shape) - 1
    if smaller_dimension < 1:
        print(f"Skipping column {entry_column} due to a degenerate contingency table.")
        continue

    # Perform chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Calculate Cramér's V
    n = contingency_table.sum().sum()  # Total number of observations
    cramers_v = np.sqrt(chi2 / (n * smaller_dimension))

    if cramers_v > 0.2:
        cramerV_list.append(cramers_v)
        print("Cramér's V:", cramers_v)
        print("Column name is:", entry_column)
        selected_columns.append(entry_column)

# Assemble the result once: identifiers, selected features, then the target.
# A single selection avoids re-copying the growing frame on every hit.
processed_df = main_df_columns[id_columns + selected_columns + [target_column]].copy()


Cramér's V: 0.5618371122498683
Column name is: damage_overall_collapse
Cramér's V: 0.2851951023763733
Column name is: damage_overall_leaning
Cramér's V: 0.25243262743853423
Column name is: damage_foundation_severe
Cramér's V: 0.22690109928414273
Column name is: damage_roof_severe
Cramér's V: 0.2520240002737159
Column name is: damage_corner_separation_severe
Cramér's V: 0.23537221388176963
Column name is: damage_diagonal_cracking_severe
Cramér's V: 0.24912822925831293
Column name is: damage_in_plane_failure_severe
Cramér's V: 0.23494034724518376
Column name is: damage_out_of_plane_failure_severe
Cramér's V: 0.21416783899299252
Column name is: damage_out_of_plane_failure_walls_ncfr_severe
Cramér's V: 0.22644795396018566
Column name is: damage_delamination_failure_severe
Cramér's V: 0.34345420751380434
Column name is: damage_column_failure_severe
Cramér's V: 0.45500037465583076
Column name is: damage_beam_failure_severe
Cramér's V: 0.3362346854327424
Column name is: damage_infill_partitio

In [12]:
# Independent working copies of the selected-feature frame, so each later use
# can modify its own copy without touching processed_df.
clustering_df = processed_df.copy()
damage_grade_df = processed_df.copy()

# Kept as the cell's last expression so the selected column names are actually
# rendered; as the first statement it had no visible effect.
processed_df.columns

In [13]:
# Start the modelling frame from its own copy so damage_grade_df stays intact.
df_damage = damage_grade_df.copy(deep=True)

def drop_col_from_df(df, col):
    """Return a new DataFrame without the column(s) named by ``col``.

    Args:
        df: Source DataFrame; it is not modified.
        col: A column label, or a list of labels, to remove.

    Returns:
        The DataFrame produced by ``df.drop`` along the column axis.
    """
    return df.drop(columns=col)

# Remove the identifier / location columns in a single call. Each call to
# drop_col_from_df builds a new frame, so one call with the full list avoids
# five intermediate copies of a large table.
df_damage = drop_col_from_df(
    df_damage,
    [
        'building_id',
        'district_name',
        'district_id',
        'vdcmun_name',
        'vdcmun_id',
        'ward_id',
    ],
)

In [14]:
# Rows without a damage_grade have no label to learn from or score against,
# so they are removed.
df_damage = df_damage.dropna(axis='index', how='any', subset=['damage_grade'])

In [15]:
# Label-encode every column of df_damage (features and target alike). Each
# fitted encoder is stored under its column name so the integer codes can be
# translated back to the original values later.
label_encoders = {}
for col in df_damage.columns:
    le = label_encoders[col] = LabelEncoder()
    df_damage[col] = le.fit_transform(df_damage[col])

# Separate the target from the features.
y = df_damage['damage_grade']
X = df_damage.drop(columns='damage_grade')

# 80/20 hold-out split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 3: Initialize the KNN model
print("Step 3: Initializing KNN model...")
# n_jobs=-1 runs the neighbour search on all available CPU cores. Prediction
# is the expensive part of KNN because every test row is compared against the
# training set, and parallelising it does not change the predictions.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)  # You can adjust n_neighbors as needed
print("KNN model initialized.\n")

# Step 4: Fit the model to the training data
print("Step 4: Fitting the KNN model...")
knn_model.fit(X_train, y_train)
print("Model fitting completed.\n")

# Step 5: Make predictions on the test set
print("Step 5: Making predictions on the test set...")
y_pred_knn = knn_model.predict(X_test)
print("Predictions completed.\n")

# Step 6: Evaluate the model
print("Step 6: Evaluating the model...")
accuracy_knn = accuracy_score(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)
print(f"Accuracy on Test Set (KNN): {accuracy_knn}")
print("Classification Report (KNN):\n", report_knn)


Step 3: Initializing KNN model...
KNN model initialized.

Step 4: Fitting the KNN model...
Model fitting completed.

Step 5: Making predictions on the test set...
