In [9]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost
from sklearn.metrics import accuracy_score





In [2]:
df = pd.read_csv('data\Damage data.csv')
df.drop('Date', axis=1, inplace=True)
df = df.fillna(value={'Pillars':'None','Damage':'None'})

In [3]:
repair_bins = [-float('inf'),1, 20000, 40000, 60000, 80000, 100000, float('inf')]
value_bins = [0, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, float('inf')]
height_bins = [0, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435,float('inf')]

df['Estimated Repair value bins'] = pd.cut(df['Estimated Repair value'], bins=repair_bins, labels=False)
df['Estimated value bins'] = pd.cut(df['Estimated value'], bins=value_bins, labels=False)
df['Flood Height bins'] = pd.cut(df['Flood Height'], bins=height_bins, labels=False)

In [4]:
X_labels = ['Building Age', 'Estimated value bins', 'Floors', 'Building Height', 'Walls', 'Roof', 'Pillars', 'Flood Height']
y_label = ['Estimated Repair value bins']
categorical_cols = ['Walls', 'Roof', 'Pillars']

X = df[X_labels]
y = df[y_label]

In [5]:
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(df[categorical_cols])

encoded_cols = encoder.get_feature_names_out(categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded.toarray(), columns=encoded_cols)

X_numerical = X.drop(columns=categorical_cols)
X_final_df = pd.concat([X_encoded_df, X_numerical], axis=1)
X_final = X_final_df.values
y = np.squeeze(y)

# X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

stratSplit = StratifiedShuffleSplit(test_size=0.2, random_state=42)
for i, (train_index, test_index) in enumerate(stratSplit.split(X_final, y)):
    X_train, X_test = X_final[train_index], X_final[test_index] 
    y_train, y_test = y[train_index], y[test_index]



In [6]:
X_test_df = pd.DataFrame(X_test,columns=X_final_df.columns)
X_train_df = pd.DataFrame(X_train,columns=X_final_df.columns)

In [7]:
class_names=['0','1 - 20,000','20,000 - 40,000','40,000 - 60,000','60,000 - 80,000','80,000 - 100,000','100,000+' ]

In [10]:

xg_model = xgboost.XGBClassifier().fit(X_train_df, y_train.values)

xg_pred = xg_model.predict(X_test)

print('XGBoost accuracy:', round(accuracy_score(y_test, xg_pred), 3))

cm_xg = confusion_matrix(y_test, xg_pred)
print(cm_xg)



XGBoost accuracy: 0.864
[[ 2  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  0]
 [ 0  2  2  0  0  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  1]]


In [11]:
joblib.dump(encoder, 'encoder.joblib')
# joblib.dump(xg_model_2, 'xg_model.joblib')


['encoder.joblib']

# Test


In [23]:
categorical_cols = ['Walls', 'Roof', 'Pillars']

def bin_estimated_value(estimated_value):
    value_bins = [0, 2000000, 4000000, 6000000, 8000000, 10000000,
                  12000000, 14000000, 16000000, 18000000, 20000000, float('inf')]
    return pd.cut([estimated_value], bins=value_bins).codes[0]


def get_estimated_value_range(value):
    repair_bins = [0, 20000, 40000, 60000,
                   80000, 100000]
    if value == 0:
        return "No damage"
    elif value == len(repair_bins):
        return f"{repair_bins[value-1]} or more"
    else:
        return f"{repair_bins[value-1]} - {repair_bins[value]}"


def bin_flood_height(height):
    height_bins = [0, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
                   240, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, float('inf')]
    return pd.cut([height], bins=height_bins).codes[0]


input_data = {
        "Building Age": 5,
        "Estimated value": 3000000,
        "Floors":1,
        "Building Height":10,
        "Walls":"Block",
        "Roof":"Asbestos",
        "Pillars":"Concrete",
        "lat":6.9352056149376455,
        "lon":79.98591062850475

    }

flood_height = 53
damage_encoder = joblib.load('model/encoder.joblib')


In [24]:
input_df = pd.DataFrame({
        'Building Age': input_data['Building Age'],
        'Estimated value bins': bin_estimated_value(input_data['Estimated value']),
        'Floors': input_data['Floors'],
        'Building Height': input_data['Building Height'],
        'Walls': input_data['Walls'],
        'Roof': input_data['Roof'],
        'Pillars': input_data['Pillars'],
        'Flood Height bins': bin_flood_height(flood_height)
    }, index=[0])
    # Encode categorical columns in the input data


In [25]:
encoded_data = damage_encoder.transform(input_df[categorical_cols])
encoded_cols = damage_encoder.get_feature_names_out(categorical_cols)

In [27]:
encoded_data_df = pd.DataFrame(
        encoded_data.toarray(), columns=encoded_cols)

data_numerical = input_df.drop(columns=categorical_cols)
data_final = pd.concat([encoded_data_df, data_numerical], axis=1)

In [32]:
predictions = xg_model.predict(data_final.values)

final_predictions = get_estimated_value_range(predictions[0])

In [33]:
final_predictions

'No damage'