In [73]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Feature engineering
from sklearn.preprocessing import PolynomialFeatures

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
# Read both splits and immediately drop the column that is not used
# afterwards: "Unnamed: 0" from the training file, "id" from the test file.
train = pd.read_csv("Dataset/train.csv").drop(columns="Unnamed: 0")
test = pd.read_csv("Dataset/test.csv").drop(columns="id")

In [75]:
# Function to calculate missing values by column
def missing_values_table(df, info=True):
    """Summarise the missing values of each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    info : bool, default True
        When true, print how many columns the frame has and how many of
        them contain at least one missing value.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with the columns 'Missing Values' (count) and
        '% of Total Values' (percentage, rounded to one decimal), sorted by
        percentage in descending order.
    """
    # Count the nulls once and reuse the result for the percentages.
    mis_val = df.isnull().sum()

    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        # An empty frame would give 0 / 0 = NaN here, and NaN compares as
        # "not zero", which would report every column as having gaps.
        mis_val_percent = mis_val.astype(float)

    # Make a table with the results, naming the two columns directly.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that really have missing values (filter on the
    # count, not the percentage), sorted by percentage descending.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    if info:
        # Print some summary information
        print(
            f"Your selected dataframe has {df.shape[1]} columns.\n"
            f"There are {mis_val_table_ren_columns.shape[0]}"
            " columns that have missing values."
        )

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [76]:
# Report the count and percentage of missing values per column of the training set.
missing_values_table(train)

Your selected dataframe has 24 columns.
There are 22 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
technical_solution_proposed,676014,93.5
type_of_roof,421208,58.3
plinth_area (ft^2),421208,58.3
type_of_ground_floor,332806,46.0
floors_before_eq (total),332806,46.0
height_before_eq (ft),332806,46.0
position,312006,43.2
type_of_other_floor,301606,41.7
building_plan_configuration,301606,41.7
land_surface_condition,301606,41.7


In [77]:
# Report the count and percentage of missing values per column of the test set.
missing_values_table(test)

Your selected dataframe has 23 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [78]:
# Free-text floor descriptions, grouped by the floor count they stand for.
# Labels are matched exactly, so case and leading spaces are significant.
floor_labels_by_count = {
    1: [
        '1st Story', 'floor one', 'Has 1 floor', 'one story', '1',
        'Floor 1', 'floor 1st', 'one', 'Just 1 floor',
    ],
    2: [
        'floor two', 'Two Floor', 'two', ' just 2 floor',
        'There is 2 Floor/Story', 'Floor 2', '2', 'two story',
        'floor second', '2 floor',
    ],
    3: [
        'Floor 3', 'Three floor', 'floor third', '3.00', ' has 3 Floor',
        'three Story', 'Floor-three', 'Three',
    ],
    4: [
        'four Floor', ' has Four fl', 'Fl four', '4', 'Floor 4',
        'floor four', 'Four Story', 'Four',
    ],
    5: [
        'fifth', 'Fl Five', '5', 'Floor Fifth', 'five Floor',
        ' Has Five fl', 'Has Five fl',
    ],
    6: ['6'],
    7: ['7'],
    8: ['8'],
    9: ['9'],
}

# Flatten the groups into the label -> floor-count lookup.
mapping = {
    label: floor_count
    for floor_count, labels in floor_labels_by_count.items()
    for label in labels
}

# Convert the text values to numbers in both splits. Series.map yields NaN
# for any value that is not a key of the lookup.
floors_col = "floors_before_eq (total)"
train[floors_col] = train[floors_col].map(mapping)
test[floors_col] = test[floors_col].map(mapping)

In [79]:
def _parse_plinth_area(raw):
    """Convert plinth-area text of the form '<number> ft^2' to nullable integers.

    The open-ended label 'More than 1000 ft^2' is encoded as 1005.
    Missing entries stay missing (<NA>) in the returned Int64 series.
    """
    # Substitute a *string* for the open-ended label. Replacing it with the
    # integer 1005 first would leave a non-string element in the column, and
    # the `.str` accessor returns NaN for non-string elements, which would
    # silently turn every such row into a missing value.
    as_text = raw.replace('More than 1000 ft^2', '1005 ft^2')
    return (
        as_text.str.replace(' ft^2', '', regex=False)
        .astype(float)
        .astype('Int64')
    )


train["plinth_area (ft^2)"] = _parse_plinth_area(train["plinth_area (ft^2)"])
test["plinth_area (ft^2)"] = _parse_plinth_area(test["plinth_area (ft^2)"])

In [80]:
# Replace the literal string 'None' with 0, then store the column with
# pandas' nullable Float32 dtype, in both splits.
for frame in (train, test):
    frame["no_family_residing"] = (
        frame["no_family_residing"].replace('None', 0).astype('Float32')
    )

In [81]:
# Cast these three columns to pandas' nullable Int16 dtype in both splits.
for column in ("type_of_reinforcement_concrete", "wall_binding", "wall_material"):
    train[column] = train[column].astype('Int16')
    test[column] = test[column].astype('Int16')

In [82]:
# Partition the training columns by dtype: object columns are categorical,
# everything else is numerical.
categorical, numerical = [], []
for column in train.columns:
    bucket = categorical if train.dtypes[column] == 'object' else numerical
    bucket.append(column)

# These columns have a non-object dtype but are treated as categorical,
# so move them from the numerical list to the categorical one.
recoded_as_categorical = [
    "wall_binding",
    "wall_material",
    "type_of_reinforcement_concrete",
]
for column in recoded_as_categorical:
    numerical.remove(column)
categorical.extend(recoded_as_categorical)

In [83]:
# Preview the first rows of the training columns listed in `numerical`.
train[numerical].head()

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),has_secondary_use,no_family_residing,damage_grade
0,2.0,1.0,256.0,22.0,0.0,1.0,1.0
1,3.0,3.0,985.0,18.0,0.0,1.0,5.0
2,2.0,7.0,,14.0,0.0,1.0,5.0
3,2.0,18.0,185.0,15.0,0.0,1.0,4.0
4,2.0,22.0,290.0,17.0,0.0,1.0,1.0


In [91]:
# Encode "flexible_superstructure" as integer labels. The encoder learns its
# classes from the training split only and reuses them for the test split.
encoded_col = "flexible_superstructure"
label_encoder = LabelEncoder()

train[encoded_col] = label_encoder.fit_transform(train[encoded_col])
test[encoded_col] = label_encoder.transform(test[encoded_col])

In [92]:
# One-hot encode the columns listed in `categorical`, separately for each split.
train, test = [
    pd.get_dummies(frame, columns=categorical) for frame in (train, test)
]

In [93]:
# Set the target aside: the inner alignment keeps only columns present in
# both frames, so "damage_grade" is lost from train if test does not have it.
y = train["damage_grade"]
train, test = train.align(test, join='inner', axis=1)

# Put the target back on the aligned training frame.
train = train.assign(damage_grade=y)

for label, frame in (('Training shape: ', train), ('Testing shape: ', test)):
    print(label, frame.shape)

Training shape:  (722815, 138)
Testing shape:  (242082, 137)


In [95]:
# Persist the encoded splits as CSV files, without writing the index.
for frame, path in (
    (train, "Dataset/train_le_ohe.csv"),
    (test, "Dataset/test_le_ohe.csv"),
):
    frame.to_csv(path, index=False)