In [1]:
# Overview
# Step 0: Combine the train set and the competition test set
# Step 1: Drop the geo_level_2_id and geo_level_3_id features
# Step 2: Min-max normalize the numeric features
# Step 3: One-hot encode the categorical features
# Step 4: Separate the train set and the competition test set
#         Save and drop the feature: building id
# Step 5: Split the train data into train and test sets

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [3]:
# Load the three raw competition files, in this order: training features,
# training labels, and the unlabeled competition test features.
data_keep, label_keep, test_values_keep = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_values.csv',
        'data/train_labels.csv',
        'data/test_values.csv',
    )
)

### Step 0: Combine train set and competition test set

In [4]:
# Stack the training rows and the competition test rows into a single frame
# indexed by building_id, so the preprocessing steps are applied to both at once.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for two frames it produces the same row-wise stack.
full_data_keep = pd.concat([data_keep, test_values_keep]).set_index(keys='building_id')

### Step 1: Drop geo_level_2_id and geo_level_3_id

In [5]:
# Columns removed from the feature set altogether.
to_drop = [
    'geo_level_3_id',
    'geo_level_2_id',
]

# Categorical columns that get one-hot encoded.
to_enc = [
    'geo_level_1_id',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'plan_configuration',
    'legal_ownership_status',
    'land_surface_condition',
    'position',
]

# Numeric columns that get min-max scaled.
num_col = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

In [6]:
# Remove the columns listed in to_drop from the combined frame.
full_data_keep = full_data_keep.drop(columns=to_drop)

### Step 2: Min-max scale both datasets (Normalization)

#### Purpose of min-max scaling: prevent numeric features with large magnitudes from dominating model training

In [7]:
# Min-max scale the numeric columns of the combined (train + competition test)
# frame. The scaler is fitted on the combined rows, so both subsets share the
# same per-column minimum and maximum.
scaler = MinMaxScaler()

# Cast to float first so the scaled values are stored in float columns.
full_data_keep[num_col] = full_data_keep[num_col].astype('float')
scaled_numeric = scaler.fit_transform(full_data_keep[num_col])
full_data_keep[num_col] = scaled_numeric

### Step 3: One hot encode the categorical features and preserve building id from test data

In [8]:
# Step 2: One hot encode the categorical features
full_data_keep = pd.get_dummies(full_data_keep, prefix=to_enc, columns=to_enc, dtype='bool')

### Step 4: Separate train set and competition test set. Save and remove feature: building id

In [9]:
# Remember which building ids came from the training file and which from the
# competition test file; they are used to pull the two subsets back out of
# the combined frame.
train_building_id, test_building_id = (
    data_keep['building_id'],
    test_values_keep['building_id'],
)

In [10]:
# Look up each file's building ids in the building_id index of the processed
# frame to recover the training rows and the competition test rows.
data_keep, test_values_keep = (
    full_data_keep.loc[train_building_id],
    full_data_keep.loc[test_building_id],
)

In [11]:
# The combined frame is no longer needed once both subsets are extracted.
del full_data_keep

In [12]:
# Discard the building_id index on both subsets in favour of a fresh
# positional index (drop=True means the old index is not kept as a column).
data_keep = data_keep.reset_index(drop=True)
test_values_keep = test_values_keep.reset_index(drop=True)

# Keep only the target column of the labels frame.
# NOTE(review): the labels are later paired with data_keep by row position,
# which assumes train_labels.csv is in the same row order as
# train_values.csv - confirm.
label_keep = label_keep['damage_grade']

### Step 5: Split train dataset into train, test
#### The test set is used to check how well the learning model generalises to data it has not seen before

In [13]:
# Hold out a fraction of the labeled rows as a local test set; change
# TEST_SIZE to adjust the proportion.
TEST_SIZE = 0.2

# Fixed seed for the shuffle performed by train_test_split. Without it every
# run draws a different partition, so results cannot be reproduced after a
# kernel restart.
RANDOM_STATE = 42

train_x_keep, test_x_keep, train_y_keep, test_y_keep = train_test_split(
    data_keep,
    label_keep,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)