In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.mixture import GaussianMixture

# Get Data

In [3]:
# Read the training CSV, using its first column as the row index, then preview the top rows.
train = pd.read_csv("Data/train_w_nan.csv", index_col=0)
train.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f111,f112,f113,f114,f115,f116,f117,f118,claim,count_na
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1,5
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1,2
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,3959.204669,0.23049,1,8


In [4]:
# Dimensions of the training frame as (rows, columns).
train.shape

(957919, 120)

In [5]:
# Read the test CSV, using its first column as the row index, then preview the top rows.
test = pd.read_csv("Data/test.csv", index_col=0)
test.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,6221.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,934.43,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,131.81,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,-26.473,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,5999.4,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


In [6]:
# Dimensions of the test frame as (rows, columns).
test.shape

(493474, 118)

# Check whether each feature is multimodal

In [11]:
# For every feature column, fit one-dimensional Gaussian mixture models with
# 1..MAX_COMPONENTS components (rows with a missing value in that column are
# dropped) and record the AIC and BIC of each fit. The component count with
# the lowest AIC is stored as the feature's "number of modes".
#
# NOTE(review): the selected count is capped at MAX_COMPONENTS, so a result
# equal to MAX_COMPONENTS only means the AIC was still lowest at the edge of
# the search range, not that the feature has exactly that many modes.
N_FEATURES = 118      # feature columns are named f1 .. f118
MAX_COMPONENTS = 5    # largest mixture size tried per feature
GMM_SEED = 0          # fixed seed so the fits (and thus AIC/BIC) are repeatable

n_modes_per_feature = dict()
aics_per_feature = dict()
bics_per_feature = dict()

for i in range(1, N_FEATURES + 1):

    feature_name = f"f{i}"

    # Double brackets keep X two-dimensional (n_samples, 1), as the estimator expects.
    X = train[[feature_name]].dropna()

    aics = []
    bics = []

    for n_components in range(1, MAX_COMPONENTS + 1):

        # Without a fixed random_state the initialisation differs between
        # runs, so the scores and the selected component count would too.
        gmm = GaussianMixture(n_components=n_components, random_state=GMM_SEED)
        gmm.fit(X)

        aics.append(gmm.aic(X))
        bics.append(gmm.bic(X))

    # argmin is 0-based while component counts start at 1; cast to a plain int
    # so the dictionary holds ordinary Python integers.
    ideal_n_components = int(np.argmin(aics)) + 1

    n_modes_per_feature[feature_name] = ideal_n_components
    aics_per_feature[feature_name] = aics
    bics_per_feature[feature_name] = bics

In [13]:
# AIC values recorded for feature f1, ordered by candidate component count (1 through 5).
aics_per_feature['f1']

[-3300246.3613038464,
 -3297146.751896018,
 -3430299.544208436,
 -3434078.5853733923,
 -3477334.4291125475]

In [14]:
# BIC values recorded for feature f1, ordered by candidate component count (1 through 5).
bics_per_feature['f1']

[-3300222.8162668417,
 -3297087.889303507,
 -3430205.3640604177,
 -3433949.087669868,
 -3477169.6138535165]

In [12]:
# Per feature, the component count (1 through 5) whose mixture fit had the lowest AIC.
n_modes_per_feature

{'f1': 5,
 'f2': 5,
 'f3': 5,
 'f4': 5,
 'f5': 5,
 'f6': 4,
 'f7': 5,
 'f8': 5,
 'f9': 5,
 'f10': 5,
 'f11': 5,
 'f12': 5,
 'f13': 3,
 'f14': 5,
 'f15': 5,
 'f16': 5,
 'f17': 5,
 'f18': 5,
 'f19': 5,
 'f20': 4,
 'f21': 5,
 'f22': 4,
 'f23': 4,
 'f24': 5,
 'f25': 5,
 'f26': 5,
 'f27': 5,
 'f28': 5,
 'f29': 5,
 'f30': 5,
 'f31': 5,
 'f32': 4,
 'f33': 4,
 'f34': 5,
 'f35': 5,
 'f36': 5,
 'f37': 5,
 'f38': 5,
 'f39': 5,
 'f40': 5,
 'f41': 5,
 'f42': 5,
 'f43': 3,
 'f44': 5,
 'f45': 5,
 'f46': 5,
 'f47': 5,
 'f48': 4,
 'f49': 5,
 'f50': 5,
 'f51': 4,
 'f52': 5,
 'f53': 5,
 'f54': 5,
 'f55': 4,
 'f56': 5,
 'f57': 5,
 'f58': 4,
 'f59': 5,
 'f60': 5,
 'f61': 5,
 'f62': 5,
 'f63': 5,
 'f64': 5,
 'f65': 5,
 'f66': 4,
 'f67': 5,
 'f68': 5,
 'f69': 5,
 'f70': 5,
 'f71': 5,
 'f72': 5,
 'f73': 5,
 'f74': 5,
 'f75': 5,
 'f76': 5,
 'f77': 5,
 'f78': 5,
 'f79': 4,
 'f80': 4,
 'f81': 5,
 'f82': 5,
 'f83': 5,
 'f84': 5,
 'f85': 5,
 'f86': 5,
 'f87': 5,
 'f88': 5,
 'f89': 5,
 'f90': 4,
 'f91': 5,
 'f92': 