In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_top_corr(df, rate='', threshold=0.15):
    """Rank the columns of ``df`` by absolute correlation with ``rate``.

    Rows that contain a missing value in *any* column are dropped first, then
    the correlation matrix (``DataFrame.corr`` defaults) is computed over the
    numeric and boolean columns only.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Non-numeric columns are ignored for the correlation
        itself, but their missing values still cause rows to be dropped.
    rate : str
        Name of the numeric column every other column is correlated against.
    threshold : float, default 0.15
        Minimum absolute correlation a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'features'`` (column name) and ``'correlation'``
        (absolute value, so the sign of the relationship is discarded),
        sorted from strongest to weakest. ``rate`` itself is part of the
        ranking. Columns whose correlation is NaN are left out.

    Raises
    ------
    KeyError
        If ``rate`` is not a numeric column of ``df``.
    """
    complete_rows = df.dropna()

    # Restrict to numeric/bool columns explicitly: recent pandas versions no
    # longer drop non-numeric columns silently inside ``corr`` and raise.
    numeric = complete_rows.select_dtypes(include=['number', 'bool'])
    if rate not in numeric.columns:
        raise KeyError(
            f"rate={rate!r} is not a numeric column of df; "
            f"available: {list(numeric.columns)}"
        )

    strength = numeric.corr()[rate].abs().sort_values(ascending=False)

    out_df = pd.DataFrame({
        'features': list(strength.index),
        'correlation': strength.to_numpy(),
    })
    # NaN correlations compare False here, so they are filtered out as well.
    return out_df[out_df['correlation'] >= threshold]

In [5]:
# Keep only the sub-district key and the rainfall total, exposing the
# latter under the shorter name 'sum_rain'.
rain_df = (
    pd.read_csv('data/rainfall_tumbol.csv')
    .loc[:, ['tumbol_ID', 'sum_rain_y']]
    .rename(columns={'sum_rain_y': 'sum_rain'})
)
rain_df

Unnamed: 0,tumbol_ID,sum_rain
0,100101.0,2532.82
1,100102.0,2532.82
2,100103.0,2532.82
3,100104.0,2532.82
4,100105.0,2532.82
...,...,...
7420,961203.0,3265.80
7421,961204.0,4163.25
7422,961301.0,2926.21
7423,961302.0,2863.12


In [9]:
# Load the key plus the five poverty-rate columns, then attach rainfall.
# A left join keeps every training row; rows without a rainfall match get
# NaN in 'sum_rain'.
train_df = pd.merge(
    pd.read_csv('data/dist_train.csv')[[
        'tumbol_ID',
        'poverty_rate_living',
        'poverty_rate_health',
        'poverty_rate_education',
        'poverty_rate_income',
        'poverty_rate_accessibility',
    ]],
    rain_df,
    how='left',
    on='tumbol_ID',
)
train_df

Unnamed: 0,tumbol_ID,poverty_rate_living,poverty_rate_health,poverty_rate_education,poverty_rate_income,poverty_rate_accessibility,sum_rain
0,110113,0.290314,2.639219,0.211138,5.991027,0.000000,2414.99
1,110115,0.070353,0.689461,0.893485,0.506543,0.000000,2414.99
2,110117,0.019508,0.045520,0.253609,0.026011,0.013006,2414.99
3,110151,0.897994,9.800758,0.259576,1.262803,0.000000,
4,110155,0.737004,1.182570,0.033628,1.527252,0.002802,
...,...,...,...,...,...,...,...
4546,961105,1.236233,1.326141,7.687121,9.103169,0.000000,2863.12
4547,961201,12.716118,7.222532,18.837144,10.596765,0.000000,4181.46
4548,961202,0.044823,0.448229,15.105334,6.260272,0.000000,3668.46
4549,961302,4.445484,4.220870,26.607394,11.371081,0.065512,2863.12


In [11]:
# threshold=0 disables the cut-off so every column appears in the ranking.
# NOTE(review): tumbol_ID is ranked too because it is numeric; presumably it
# is an identifier rather than a feature - confirm before interpreting it.
get_top_corr(
    df=train_df,
    rate='poverty_rate_living',
    threshold=0,
)

Unnamed: 0,features,correlation
0,poverty_rate_living,1.0
1,poverty_rate_education,0.405841
2,poverty_rate_health,0.40507
3,poverty_rate_income,0.384615
4,poverty_rate_accessibility,0.215811
5,tumbol_ID,0.153281
6,sum_rain,0.054908


In [18]:
# Reorder the columns: key first, then rainfall, then the poverty rates.
train_df = train_df.loc[:, [
    'tumbol_ID',
    'sum_rain',
    'poverty_rate_living',
    'poverty_rate_health',
    'poverty_rate_education',
    'poverty_rate_income',
    'poverty_rate_accessibility',
]]
train_df

Unnamed: 0,tumbol_ID,sum_rain,poverty_rate_living,poverty_rate_health,poverty_rate_education,poverty_rate_income,poverty_rate_accessibility
0,110113,2414.99,0.290314,2.639219,0.211138,5.991027,0.000000
1,110115,2414.99,0.070353,0.689461,0.893485,0.506543,0.000000
2,110117,2414.99,0.019508,0.045520,0.253609,0.026011,0.013006
3,110151,,0.897994,9.800758,0.259576,1.262803,0.000000
4,110155,,0.737004,1.182570,0.033628,1.527252,0.002802
...,...,...,...,...,...,...,...
4546,961105,2863.12,1.236233,1.326141,7.687121,9.103169,0.000000
4547,961201,4181.46,12.716118,7.222532,18.837144,10.596765,0.000000
4548,961202,3668.46,0.044823,0.448229,15.105334,6.260272,0.000000
4549,961302,2863.12,4.445484,4.220870,26.607394,11.371081,0.065512


In [19]:
# Write the merged table to disk; index=False leaves the row index out of
# the file.
train_df.to_csv(
    path_or_buf='rain_train.csv',
    index=False,
)