In [None]:
# Overview
# Step 0: Combine train set and competition test set
# Step 1: Drop unimportant data
# Step 2: Normalization
# Step 3: One-hot encode the categorical features
# Step 4: Separate train set and competition test set
#         Save and drop feature: building id
# Step 5: Split the data into train, test data
# Step 6: Copy and oversample train dataset

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [None]:
# Read the three CSV inputs into DataFrames, in this order:
# train values -> data, train labels -> label, test values -> test_values.
data, label, test_values = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_values.csv',
        'data/train_labels.csv',
        'data/test_values.csv',
    )
]

### Step 0: Combine train set and competition test set

In [None]:
# Stack the train rows on top of the competition test rows so that every
# preprocessing step below is applied to both sets in exactly the same way,
# then index the combined frame by building_id so the two sets can be
# separated again later.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; concat with its defaults produces the same frame.
full_data = pd.concat([data, test_values]).set_index(keys='building_id')

### Step 1: Drop unimportant data

#### Purpose for dropping:
geo_level_2_id : Reduce computational cost

geo_level_3_id : Reduce computational cost

has_superstructure_adobe_mud : Unimportant feature 

has_superstructure_mud_mortar_brick : Unimportant feature 

has_superstructure_stone_flag: Unimportant feature

has_superstructure_timber : Unimportant feature

has_superstructure_bamboo : Unimportant feature

has_superstructure_other : Unimportant feature

has_secondary_use : Information of this feature is already represented by other has_secondary_use_xxx features

land_surface_condition : Unimportant feature

position : Unimportant feature

count_floors_pre_eq: Highly correlated with height_percentage

In [None]:
# Columns removed from the combined frame before any further processing.
to_drop = [
    'geo_level_2_id',
    'geo_level_3_id',
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_stone_flag',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_other',
    'has_secondary_use',
    'land_surface_condition',
    'position',
    'count_floors_pre_eq',
]

# Columns that are one-hot encoded.
to_enc = [
    'geo_level_1_id',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'plan_configuration',
    'legal_ownership_status',
]

# Numeric columns that are min-max scaled.
num_col = [
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

In [None]:
# Remove every column listed in to_drop from the combined frame.
full_data = full_data.drop(columns=to_drop)

### Step 2: Min-max scale both datasets (Normalization)

#### Purpose of min-max scaling: reduce the weight of numeric features (features with large magnitudes) in model training

In [None]:
# Min-max scale the numeric columns (num_col) of the combined frame, so the
# train rows and the competition test rows are scaled with the same fitted scaler.
# NOTE(review): the scaler is fitted on full_data before the train/test split
# further down, so held-out rows contribute to the fitted min/max -- confirm
# this is intended.
scaler = MinMaxScaler()
# Cast to float first so the columns are float-typed before the scaled values
# are written back.
full_data[num_col] = full_data[num_col].astype('float')
full_data[num_col] = scaler.fit_transform(full_data[num_col])

### Step 3: One hot encode the categorical features and preserve building id from test data

In [None]:
# Step 3: One-hot encode the categorical columns listed in to_enc.
# Each new boolean column is prefixed with the name of the column it came from.
full_data = pd.get_dummies(
    full_data,
    columns=to_enc,
    prefix=to_enc,
    dtype='bool',
)

### Step 4: Separate train set and competition test set. Save and remove feature: building id

In [None]:
# Save the building_id column of each raw frame. These ids are the index
# labels used to pull the train rows and the competition test rows back out
# of full_data.
train_building_id = data.loc[:, 'building_id']
test_building_id = test_values.loc[:, 'building_id']

In [None]:
# Select the preprocessed rows of each set from the combined frame by
# building_id label. Note that this rebinds data and test_values from the raw
# frames to their preprocessed counterparts.
data = full_data.loc[train_building_id, :]
test_values = full_data.loc[test_building_id, :]

In [None]:
# Discard the building_id index of both frames so building_id is no longer
# carried along as part of the data (drop=True does not re-insert it as a column).
data = data.reset_index(drop=True)
test_values = test_values.reset_index(drop=True)

# Keep only the target column of the labels.
# NOTE(review): label is presumably in the same row order as the train
# values file, since the two are paired by position below -- verify.
label = label['damage_grade']

### Step 5: Split train dataset into train, test
#### The test set is used to check how well the learning model generalises to data it has not seen before

In [None]:
# Hold out 20% of the labelled rows (test_size=0.2) as a local test set.
# random_state is fixed so that Restart & Run All reproduces the same split;
# the value 0 matches the seed used by the oversampler in Step 6.
# NOTE(review): the classes are imbalanced enough to need oversampling, so
# consider stratify=label to keep class proportions equal in both parts.
train_x, test_x, train_y, test_y = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=0,
)

### Step 6: Copy and oversample train dataset

In [None]:
# Randomly oversample the training split only (seeded for reproducibility).
ros = RandomOverSampler(random_state=0)
train_x_over, train_y_over = ros.fit_resample(train_x, train_y)

# The held-out split is not resampled; the "_over" names are plain copies of
# it, so the oversampled training data has a matching pair of test variables.
test_x_over, test_y_over = test_x.copy(), test_y.copy()