In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" style to plots drawn in this notebook.
sns.set(style="whitegrid")
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings("ignore")

In [3]:
import featuretools as ft

In [2]:
# Spellings treated as missing values in all three input files.
na_tokens = ['NAN','NA','NaN','na','nan']

# Load the train split, the test split and the combined feature table.
train_df, test_df, X_all = (
    pd.read_csv(csv_path, na_values=na_tokens)
    for csv_path in ('input/train.csv', 'input/test.csv', 'input/X_all.csv')
)

In [4]:
# Create the featuretools EntitySet (id "hospitals"); entities are attached to it in the following cells.
es = ft.EntitySet(id="hospitals")

In [5]:
# Columns declared as categorical explicitly instead of relying on type inference.
categorical_columns = ("sido", "employee1", "employee2", "instkind", "ownerChange")
clients_variable_types = {
    column: ft.variable_types.Categorical for column in categorical_columns
}

# Register X_all as the "clients" entity, indexed by inst_id.
es = es.entity_from_dataframe(
    entity_id="clients",
    dataframe=X_all,
    index="inst_id",
    variable_types=clients_variable_types,
)

In [6]:
# Split each categorical key of "clients" out into its own parent entity:
# (new entity id, key column in "clients").
parent_entities = (
    ("city", "sido"),
    ("kind", "instkind"),
    ("change", "ownerChange"),
)
for parent_id, key_column in parent_entities:
    es = es.normalize_entity(base_entity_id="clients",
                             new_entity_id=parent_id,
                             index=key_column)

In [7]:
# Display the EntitySet summary: its entities and the relationships between them.
es

Entityset: hospitals
  Entities:
    change [Rows: 3, Columns: 1]
    kind [Rows: 8, Columns: 1]
    clients [Rows: 428, Columns: 57]
    city [Rows: 17, Columns: 1]
  Relationships:
    clients.sido -> city.sido
    clients.instkind -> kind.instkind
    clients.ownerChange -> change.ownerChange

In [8]:
# Primitives handed to deep feature synthesis.
aggregations = ["count", "mean", "sum", "std", "median", "mode"]
transforms = ["month","diff","week"]
# NOTE(review): "month"/"week" presumably only apply to datetime-typed columns and
# "diff" depends on row order -- confirm these transforms are intended here.

# Build features for every row of "clients", stacking primitives up to depth 2.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="clients",
    agg_primitives=aggregations,
    trans_primitives=transforms,
    max_depth=2,
)

In [9]:
# Compare the raw table's shape with the synthesized feature matrix's shape.
print('Original {} Featured {}'.format(X_all.shape, feature_matrix.shape))

Original (428, 57) Featured (428, 822)


In [10]:
# Preview the first rows of the synthesized feature matrix (indexed by inst_id).
feature_matrix.head()

Unnamed: 0_level_0,noi1,tanAsset1,debt2,netAsset1,profit1,longLoan1,longLoan2,noi2,receivableL2,nonCAsset1,...,kind.STD(clients.profit2),city.STD(clients.longLoan1),change.MEDIAN(clients.revenue1),kind.STD(clients.salescost2),city.MEDIAN(clients.salescost1),kind.SUM(clients.sga1),city.SUM(clients.OnonCAsset2),kind.SUM(clients.quickAsset2),kind.SUM(clients.salescost1),change.MEDIAN(clients.shortLoan1)
inst_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15652440.0,2360684000.0,758993700.0,2844460000.0,225169678.0,351000000.0,390000000.0,16194680.0,0.0,2514586000.0,...,580607700.0,4922569000.0,6303899000.0,621251100.0,108424820.0,1112561000000.0,2742782000.0,309962800000.0,68054390000.0,517485993.0
2,4314093000.0,46299310000.0,55406430000.0,6295769000.0,503669196.0,6319097000.0,4690000000.0,4076448000.0,0.0,48072820000.0,...,2450469000.0,8311996000.0,6303899000.0,18037270000.0,154633402.0,1885607000000.0,16341690000.0,531573800000.0,507880400000.0,517485993.0
3,,,,,,,,,,,...,2450469000.0,3068164000.0,6303899000.0,18037270000.0,367242573.0,1885607000000.0,5939987000.0,531573800000.0,507880400000.0,517485993.0
4,76156.0,120481000.0,0.0,300508800.0,41864754.0,0.0,0.0,0.0,0.0,120481000.0,...,580607700.0,7233080000.0,6303899000.0,621251100.0,277802982.0,1112561000000.0,19583620000.0,309962800000.0,68054390000.0,517485993.0
5,68710240.0,66786480000.0,67308380000.0,14704470000.0,452555746.0,17200000000.0,18300000000.0,191802200.0,0.0,75110100000.0,...,1484833000.0,10032840000.0,6303899000.0,2863205000.0,0.0,1372868000000.0,7157730000.0,379923800000.0,207512400000.0,517485993.0


In [11]:
# List the feature definitions returned by ft.dfs alongside the feature matrix.
feature_defs

[<Feature: noi1>,
 <Feature: tanAsset1>,
 <Feature: debt2>,
 <Feature: netAsset1>,
 <Feature: profit1>,
 <Feature: longLoan1>,
 <Feature: longLoan2>,
 <Feature: noi2>,
 <Feature: receivableL2>,
 <Feature: nonCAsset1>,
 <Feature: liquidLiabilities1>,
 <Feature: liquidAsset1>,
 <Feature: inventoryAsset1>,
 <Feature: ownerChange>,
 <Feature: NCLiabilities2>,
 <Feature: liquidAsset2>,
 <Feature: salary2>,
 <Feature: noe2>,
 <Feature: quickAsset1>,
 <Feature: shortLoan1>,
 <Feature: interest1>,
 <Feature: employee1>,
 <Feature: revenue1>,
 <Feature: salescost1>,
 <Feature: shortLoan2>,
 <Feature: NCLiabilities1>,
 <Feature: ctax1>,
 <Feature: receivableL1>,
 <Feature: profit2>,
 <Feature: nonCAsset2>,
 <Feature: surplus2>,
 <Feature: instkind>,
 <Feature: salary1>,
 <Feature: revenue2>,
 <Feature: tanAsset2>,
 <Feature: netAsset2>,
 <Feature: sido>,
 <Feature: salescost2>,
 <Feature: bedCount>,
 <Feature: interest2>,
 <Feature: sga2>,
 <Feature: sga1>,
 <Feature: noe1>,
 <Feature: employee2

In [12]:
# Encode the categorical features into indicator columns; returns the encoded
# matrix together with the matching list of encoded feature definitions.
feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)

In [13]:
# Report how the column count grows: raw table -> DFS features -> encoded categoricals.
print('Original {} Featured {} Featured Categorical {}'.format(
    X_all.shape, feature_matrix.shape, feature_matrix_enc.shape))

Original (428, 57) Featured (428, 822) Featured Categorical (428, 930)


In [14]:
# List the feature definitions after categorical encoding.
features_enc

[<Feature: noi1>,
 <Feature: tanAsset1>,
 <Feature: debt2>,
 <Feature: netAsset1>,
 <Feature: profit1>,
 <Feature: longLoan1>,
 <Feature: longLoan2>,
 <Feature: noi2>,
 <Feature: receivableL2>,
 <Feature: nonCAsset1>,
 <Feature: liquidLiabilities1>,
 <Feature: liquidAsset1>,
 <Feature: inventoryAsset1>,
 <Feature: ownerChange = same>,
 <Feature: ownerChange = change>,
 <Feature: ownerChange = unknown>,
 <Feature: NCLiabilities2>,
 <Feature: liquidAsset2>,
 <Feature: salary2>,
 <Feature: noe2>,
 <Feature: quickAsset1>,
 <Feature: shortLoan1>,
 <Feature: interest1>,
 <Feature: employee1 = 79.0>,
 <Feature: employee1 = 73.0>,
 <Feature: employee1 = 54.0>,
 <Feature: employee1 = 62.0>,
 <Feature: employee1 = 60.0>,
 <Feature: employee1 = 50.0>,
 <Feature: employee1 = 42.0>,
 <Feature: employee1 = 77.0>,
 <Feature: employee1 = 75.0>,
 <Feature: employee1 = 67.0>,
 <Feature: employee1 = unknown>,
 <Feature: revenue1>,
 <Feature: salescost1>,
 <Feature: shortLoan2>,
 <Feature: NCLiabilities1>

In [16]:
from category_encoders import *

In [17]:
# Partition the raw X_all columns by dtype: 64-bit numeric vs. object columns.
numeric_part = X_all.select_dtypes(include=['int64','float64'])
object_part = X_all.select_dtypes(include=['object'])
cols_num = numeric_part.columns.tolist()
cols_cat = object_part.columns.tolist()

In [19]:
# Recode the target label in place: 'open' -> 0, ' close' -> 1.
# Uses a single .loc assignment per label. The previous chained form
# (train_df['OC'][mask] = value) assigns through an intermediate Series, which
# pandas does not guarantee to write back to train_df.
# The leading space in ' close' is deliberate; it is kept exactly as the
# original comparison had it.
is_open = train_df['OC'] == 'open'
is_closed = train_df['OC'] == ' close'
train_df.loc[is_open, 'OC'] = 0
train_df.loc[is_closed, 'OC'] = 1

In [20]:
# Show how many training rows fall into each recoded target class.
train_df['OC'].value_counts()

0    286
1     15
Name: OC, dtype: int64

In [24]:
# Split the encoded feature matrix by position: the first len(train_df) rows go
# to X_train, the remainder to X_test.
# NOTE(review): assumes feature_matrix_enc rows are ordered train-first, then test -- confirm.
n_train = train_df.shape[0]
X_train = feature_matrix_enc.iloc[:n_train, :]
X_test = feature_matrix_enc.iloc[n_train:, :]

In [21]:
# Fit a target encoder for the object-typed columns, using the training rows
# and the recoded OC label as the target.
target_encoder = TargetEncoder(cols=cols_cat)
enc = target_encoder.fit(train_df[cols_cat], train_df['OC'])

In [22]:
# Apply the fitted encoder to the categorical columns of both splits.
train_categoricals = train_df[cols_cat]
test_categoricals = test_df[cols_cat]
X_te = enc.transform(train_categoricals)
X_test_te = enc.transform(test_categoricals)

In [25]:
# Names for the target-encoded columns: the original column name plus "_enc".
cols_cat_2 = [name + '_enc' for name in cols_cat]
# Full column list for the stacked matrix: DFS feature columns first, encoded columns last.
cols = np.concatenate([X_train.columns.values, cols_cat_2], axis=0)

In [26]:
# Stack the DFS features and the target-encoded columns side by side, row by row
# (positional: the DataFrame indexes are ignored once converted to arrays).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and works on all pandas versions.
# NOTE(review): X_test is rebound from a DataFrame to an ndarray here, so this
# cell is not safe to re-run on its own without re-running the train/test split first.
X = np.concatenate([X_train.values, X_te.values], axis=1)
X_test = np.concatenate([X_test.values, X_test_te.values], axis=1)

In [27]:
# Wrap the stacked arrays back into DataFrames that share the same column labels.
X = pd.DataFrame(X, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)

In [28]:
# Preview the final training matrix (DFS features followed by the *_enc columns).
X.head()

Unnamed: 0,noi1,tanAsset1,debt2,netAsset1,profit1,longLoan1,longLoan2,noi2,receivableL2,nonCAsset1,...,city.SUM(clients.OnonCAsset2),kind.SUM(clients.quickAsset2),kind.SUM(clients.salescost1),change.MEDIAN(clients.shortLoan1),sido_enc,openDate_enc,instkind_enc,employee1_enc,employee2_enc,ownerChange_enc
0,15652440.0,2360684000.0,758993700.0,2844460000.0,225169678.0,351000000.0,390000000.0,16194680.0,0.0,2514586000.0,...,2742782000.0,309962800000.0,68054390000.0,517485993.0,0.04347826,0.049834,0.034722,0.000896,0.013402,0.020243
1,4314093000.0,46299310000.0,55406430000.0,6295769000.0,503669196.0,6319097000.0,4690000000.0,4076448000.0,0.0,48072820000.0,...,16341690000.0,531573800000.0,507880400000.0,517485993.0,8.852277000000001e-17,0.049834,0.027027,0.049834,0.049834,0.020243
2,,,,,,,,,,,...,5939987000.0,531573800000.0,507880400000.0,517485993.0,0.1020408,0.049834,0.034722,0.049834,0.000896,0.020243
3,76156.0,120481000.0,0.0,300508800.0,41864754.0,0.0,0.0,0.0,0.0,120481000.0,...,19583620000.0,309962800000.0,68054390000.0,517485993.0,6.149231e-06,0.049834,0.027027,0.049834,0.049834,0.020243
4,68710240.0,66786480000.0,67308380000.0,14704470000.0,452555746.0,17200000000.0,18300000000.0,191802200.0,0.0,75110100000.0,...,7157730000.0,379923800000.0,207512400000.0,517485993.0,8.852277000000001e-17,0.049834,0.027027,0.049834,0.049834,0.020243


In [29]:
# Preview the final test matrix; it carries the same columns as X.
X_test.head()

Unnamed: 0,noi1,tanAsset1,debt2,netAsset1,profit1,longLoan1,longLoan2,noi2,receivableL2,nonCAsset1,...,city.SUM(clients.OnonCAsset2),kind.SUM(clients.quickAsset2),kind.SUM(clients.salescost1),change.MEDIAN(clients.shortLoan1),sido_enc,openDate_enc,instkind_enc,employee1_enc,employee2_enc,ownerChange_enc
0,316745396.0,9940461000.0,10328180000.0,5081292000.0,432511000.0,7440000000.0,7512550000.0,48705671.0,0.0,10291480000.0,...,5939987000.0,379923800000.0,207512400000.0,517485993.0,6e-06,0.049834,0.027027,0.049834,0.049834,0.020243
1,18142753.0,3555309000.0,1266739000.0,5288964000.0,241432600.0,0.0,0.0,18171830.0,0.0,3595463000.0,...,5939987000.0,309962800000.0,68054390000.0,517485993.0,0.049834,0.049834,0.056818,0.049834,0.049834,0.020243
2,80103616.0,0.0,0.0,0.0,891230.0,0.0,0.0,14024602.0,0.0,0.0,...,2587327000.0,309962800000.0,68054390000.0,517485993.0,0.142857,0.049834,0.027027,0.049834,0.049834,0.049834
3,423628901.0,12656390000.0,15632170000.0,15541790000.0,1692204000.0,0.0,0.0,483447584.0,0.0,13076230000.0,...,2587327000.0,309962800000.0,68054390000.0,,0.038462,0.049834,0.027027,0.049834,0.049834,0.020243
4,823222665.0,4734251000.0,4494217000.0,2240287000.0,35482330.0,1594400000.0,1740600000.0,404832240.0,0.0,5247103000.0,...,5939987000.0,379923800000.0,207512400000.0,517485993.0,0.111111,0.049834,0.027027,0.049834,0.049834,0.020243


In [30]:
# Persist both engineered matrices as CSV, without writing the row index.
for frame, csv_path in ((X, 'input/X.csv'), (X_test, 'input/X_test.csv')):
    frame.to_csv(csv_path, index=False)