### Resources:
PCA - https://www.oreilly.com/library/view/hands-on-unsupervised-learning/9781492035633/ch04.html

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

# Anomaly Detection 
    multivariate outliers (across many features)
- DBSCAN
- Isolation Forest
- PCA

## Load in data

In [2]:
train_df = pd.read_csv('P:\\ds-moni\\Competitions\\nepal_data\\train_values.csv')
train_labels = pd.read_csv('P:\\ds-moni\\Competitions\\nepal_data\\train_labels.csv')

In [3]:
train_df.set_index('building_id', inplace=True)

## Data Prep

**Feature engineering**

In [4]:
train_df['geo_level_1_id'] = pd.qcut(train_df['geo_level_1_id'], q=[0,0.25,0.5,0.75,1], labels=['1st_quartile','2nd_quartile','3rd_quartile','4th_quartile'])
train_df['geo_level_2_id'] = pd.qcut(train_df['geo_level_2_id'], q=[0,0.25,0.5,0.75,1], labels=['1st_quartile','2nd_quartile','3rd_quartile','4th_quartile'])
train_df['geo_level_3_id'] = pd.qcut(train_df['geo_level_3_id'], q=[0,0.25,0.5,0.75,1], labels=['1st_quartile','2nd_quartile','3rd_quartile','4th_quartile'])

**One Hote Encoding**

In [5]:
# a data frame with categoricals
numerical_vars = ['count_floors_pre_eq','age','area_percentage','height_percentage','count_families']
cat_df = train_df[[x for x in train_df.columns if x not in numerical_vars and 'has_superstructure_' not in x and 'has_secondary_use' not in x]].copy()
cat_df.head(2)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,legal_ownership_status
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
802906,1st_quartile,2nd_quartile,4th_quartile,t,r,n,f,q,t,d,v
28830,2nd_quartile,3rd_quartile,1st_quartile,o,r,n,x,q,s,d,v


In [6]:
# one hot encoding
cat_ohe = pd.get_dummies(cat_df)

In [7]:
# merge one hote encoded with numerical features 
train_df1 = train_df.drop(cat_df.columns, axis=1)
df = pd.concat([train_df1, cat_ohe],axis=1)

**Scaling**

In [102]:
scaler = MinMaxScaler() 
df_scaled = scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns = df.columns)
df.head(2)

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,0.125,0.030151,0.050505,0.1,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.125,0.01005,0.070707,0.166667,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## DBSCAN
https://towardsdatascience.com/5-ways-to-detect-outliers-that-every-data-scientist-should-know-python-code-70a54335a623

In [None]:
outlier_detection = DBSCAN(
 eps = .2, 
 metric="euclidean", 
 min_samples = 5,
 n_jobs = -1)

clusters = outlier_detection.fit_predict(df)

In [None]:
clusters

## Isolation Forest