## Import the libraries

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import joblib
from scipy import stats
import re

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.feature_extraction.text import CountVectorizer

from lightgbm import LGBMClassifier

from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, BatchNormalization, Reshape, Concatenate
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.models import Model
import tensorflow as tf
from keras import backend as K
import warnings
warnings.filterwarnings("ignore")

## Import the data

In [36]:
# Root folder for the pickled test data and the fitted artifacts loaded in the cells below.
# Paths are built by plain string concatenation (filepath + '...'), so the trailing '/' is required.
filepath = '/notebooks/EQ Damage Prediction Project/'

In [70]:
# The test features and labels are stored together in one pickle; load both and preview two rows.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
test_data_path = filepath + 'test_data/test_data.pkl'
X_test, y_test = joblib.load(test_data_path)
X_test.head(2)

Unnamed: 0,district_id,vdcmun_id,ward_id,count_floors_pre_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,land_surface_condition,foundation_type,roof_type,...,has_asset_cable_pre_eq,has_asset_computer_pre_eq,has_asset_internet_pre_eq,has_asset_telephone_pre_eq,has_asset_mobile_phone_pre_eq,has_asset_fridge_pre_eq,has_asset_motorcycle_pre_eq,has_asset_four_wheeler_family_use_pre_eq,has_asset_four_wheeler_commercial_use_pre_eq,has_asset_none_pre_eq
0,24,2409,240908,2,30,368,21,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,...,0,0,0,0,1,0,0,0,0,0
1,30,3009,300902,3,25,300,30,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Heavy roof,...,0,0,0,0,1,0,0,0,0,0


## Predict Function

In [71]:
def predict(test_df):
    '''Predict the severity of damage for every row of a raw input dataframe.

    The input is transformed on a private copy, so the dataframe passed by the
    caller is left untouched and the function can be called repeatedly on the
    same object. All fitted artifacts (feature lists, Box-Cox lambdas,
    tokenizers, embedding matrices, scaler and model) are loaded with joblib
    from locations under the notebook-level `filepath`.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw feature rows. Must contain every column named in 'num_feat.pkl'
        and 'cat_feat.pkl'.

    Returns
    -------
    The output of `model.predict` for the loaded LightGBM model: one predicted
    label per input row, in input row order.
    '''

    def preprocess_text(text):
        ''' This function preprocesses the categorical text feature '''

        text = str(text).lower()
        text = re.sub(r'\.', '', text)       # Drop '.' characters
        text = re.sub(r' |/|-', '_', text)   # Replace space or '/' or '-' chars by '_'
        return text

    def get_embedded_data(embedding_matrix, test_data, num_features):
        ''' Replace each tokenized categorical column by its embedding vectors
        and append the numerical columns `num_features` to the right. '''

        cat_embed_names = joblib.load(filepath+'cat_embed_names.pkl')
        # Look up, per categorical feature, the embedding row of every token id.
        embedded_blocks = [embedding_matrix[feat][test_data[feat]] for feat in embedding_matrix]

        embedded = pd.DataFrame(np.concatenate(embedded_blocks, axis=1), columns=cat_embed_names)
        # `embedded` has a fresh 0..n-1 index; give the numerical part the same
        # index so that concat(axis=1) pairs rows by position instead of
        # aligning on the caller's labels (which would introduce NaN rows).
        numeric_part = test_data[num_features].reset_index(drop=True)
        return pd.concat((embedded, numeric_part), axis=1)

    # Work on a copy with a positional index: the caller's dataframe must not
    # be overwritten with transformed values.
    test_df = test_df.copy().reset_index(drop=True)

    # Preprocessing numerical features
    int_dtypes = joblib.load(filepath+'num_feat.pkl')
    test_df[int_dtypes] = test_df[int_dtypes].astype(int)
    boxcox_fit_lambdas = joblib.load(filepath+'boxcox_fit_lambdas.pkl')

    num_preprocessing_features = ['age_building','plinth_area_sq_ft','height_ft_pre_eq',
                                'age_household_head','size_household']
    for feature in num_preprocessing_features:
        # +1 keeps the values strictly positive, as Box-Cox requires; the
        # lambda stored for this feature is reused rather than re-estimated.
        test_df[feature] = stats.boxcox(test_df[feature]+1, boxcox_fit_lambdas['lambda_'+feature])

    # Preprocessing categorical features
    object_dtypes = joblib.load(filepath+'cat_feat.pkl')
    for feature in object_dtypes:
        test_df[feature] = test_df[feature].apply(preprocess_text)
        tokenizer = joblib.load(filepath+'tokenizers/'+feature+'_tokenizer.pkl')
        # Each cleaned category is expected to map to exactly one token id,
        # so the list of one-element sequences flattens to one id per row.
        test_df[feature] = np.array(tokenizer.texts_to_sequences(test_df[feature])).ravel()

    # Embedding the categorical features
    embedding_matrix = joblib.load(filepath+'oversamp/embedding_matrix.pkl')
    test_embed = get_embedded_data(embedding_matrix, test_df, int_dtypes)

    # Standard Scaling
    scaler = joblib.load(filepath+'oversamp/standard_scaler.pkl')
    test_embed_std = scaler.transform(test_embed)
    test_embed_std = pd.DataFrame(test_embed_std, columns=test_embed.columns)

    # Load best model and predict (`filepath` already ends with '/')
    model = joblib.load(filepath+'models/LGB_model.pkl')
    predictions = model.predict(test_embed_std)

    return predictions

In [72]:
# Hand predict() a copy so that X_test keeps its raw values and this cell can be re-run safely.
pred = predict(X_test.copy())
pd.DataFrame(pred).value_counts()

2    74541
0    24375
1    13153
dtype: int64

## Score Function

In [73]:
# Reload the test split from disk so that scoring starts from the raw, untransformed features.
X_test, y_test = joblib.load(filepath+'test_data/test_data.pkl')

In [74]:
def f1_micro(test_df, target, print_classification_report=True):
    '''Predict the severity of damage and return the micro-averaged F1 score.

    Predictions come from `predict`, so scoring always uses exactly the same
    preprocessing and model as inference. A copy of `test_df` is passed on,
    which leaves the caller's dataframe unchanged.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw feature rows, in the format accepted by `predict`.
    target : pandas.Series
        True labels as the strings 'Mild', 'Moderate' or 'Severe', one per
        row of `test_df`.
    print_classification_report : bool, default True
        When True, also print scikit-learn's per-class classification report.

    Returns
    -------
    float
        Micro-averaged F1 score of the predictions against `target`.

    Raises
    ------
    ValueError
        If `target` contains a value outside the three known labels.
    '''

    # Reuse the inference pipeline instead of duplicating it here.
    predictions = predict(test_df.copy())

    # Scoring: encode the string labels with the integer codes the model predicts.
    target_mapping = {'Mild': 0, 'Moderate': 1, 'Severe': 2}
    encoded_target = target.map(target_mapping)
    if encoded_target.isna().any():
        # Unmapped labels become NaN; fail here with a clear message rather
        # than deep inside scikit-learn.
        unknown = sorted(set(map(str, target[encoded_target.isna()])))
        raise ValueError('target contains labels outside %s: %s'
                         % (list(target_mapping), unknown))
    score = f1_score(encoded_target, predictions, average='micro')

    if print_classification_report:
        print('\n\033[1mClassification report:\033[0m')
        print(classification_report(encoded_target, predictions))

    return score

In [75]:
# Score on a copy so that X_test keeps its raw values and this cell can be re-run safely.
f1_micro(X_test.copy(), y_test, print_classification_report=True)


[1mClassification report:[0m
              precision    recall  f1-score   support

           0       0.73      0.76      0.74     23584
           1       0.52      0.35      0.42     19825
           2       0.83      0.91      0.87     68660

    accuracy                           0.78    112069
   macro avg       0.70      0.67      0.68    112069
weighted avg       0.76      0.78      0.76    112069



0.7750760692073633