In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras import backend as K
import tensorflow as tf
from category_encoders.binary import BinaryEncoder
import category_encoders as ce

Using TensorFlow backend.


In [2]:
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 8} ) 
sess = tf.Session(config=config) 
K.set_session(sess)

In [4]:
# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

In [5]:
# load dataset
dataframe = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]

In [6]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [7]:
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [8]:
url = "unit-3/train_features.csv"
df = pd.read_csv(url, parse_dates=['date_recorded'],index_col='id' )

In [9]:
df['timestamp'] = df.date_recorded.apply(lambda d: d.timestamp()/ 10 ** 9)
df_ = df.drop(['date_recorded', 'region_code', 'district_code', 'region'], inplace = False, axis=1)
# df_ = df.drop(['date_recorded', 'district_code', 'region'], inplace = False, axis=1)
df_['region_district'] = df.apply(lambda row: f'{row.region}_{row.district_code}' , axis=1)
# df_ = df_.apply(lambda x: x.astype(str).str.lower())
train_input_columns = list(df_.columns)
train_numeric_columns = df_.select_dtypes(exclude=['object']).columns
df_.head()

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,timestamp,region_district
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,...,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1.300061,Iringa_5
8776,0.0,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,...,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.362528,Mara_2
34310,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,...,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1.36175,Manyara_4
67743,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,...,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1.359331,Mtwara_63
19728,0.0,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,...,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.310515,Kagera_1


In [10]:
dc = 'status_group'

In [11]:
yurl = 'unit-3/train_labels.csv'
dfy = pd.read_csv(yurl, index_col='id' )
dfy.shape
y = dfy[dc]

In [12]:
encoder.fit(y)
encoded_Y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [13]:
oc = df_.select_dtypes(include=['object']).columns
oc

Index(['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'lga', 'ward',
       'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name',
       'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quality_group', 'quantity',
       'quantity_group', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group', 'region_district'],
      dtype='object')

In [14]:
hot = []
binary = []
for o in oc:
    if df_[o].unique().shape[0] > 127:
        print(df_[o].unique().shape[0], o)
        binary.append(o)
    else:
        hot.append(o)

1898 funder
2146 installer
37400 wpt_name
19288 subvillage
2092 ward
2697 scheme_name
132 region_district


In [15]:
test_url = "unit-3/test_features.csv"
df = pd.read_csv(test_url, parse_dates=['date_recorded'],index_col='id' )
df['timestamp'] = df.date_recorded.apply(lambda d: d.timestamp()/ 10 ** 9)
dft = df.drop(['date_recorded', 'region_code', 'district_code', 'region'], inplace = False, axis=1)
#dft = df.drop(['date_recorded', 'district_code', 'region'], inplace = False, axis=1)
dft['region_district'] = df.apply(lambda row: f'{row.region}_{row.district_code}' , axis=1)
# dft = dft.apply(lambda x: x.astype(str).str.lower())
test_input_columns = list(dft.columns)
# dft[dc] = ['fuctional'] * dft.shape[0]
dft.head()

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,timestamp,region_district
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50785,0.0,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,...,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other,1.359936,Manyara_3
51630,0.0,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,...,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,1.359936,Arusha_2
17168,0.0,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,...,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other,1.359677,Singida_2
45559,0.0,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,...,good,dry,dry,shallow well,shallow well,groundwater,other,other,1.358813,Lindi_43
49871,500.0,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,...,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1.364342,Ruvuma_3


In [16]:
test__ = list(dft.columns)
for c in train_input_columns:
    if c not in test_input_columns:
        print(f'{c} not in test')
    else:
        test__.remove(c)
print(test__)
        

[]


In [17]:
encoders = Pipeline([
#                 ('vect', tfidf),
                ('binary', BinaryEncoder(cols=binary)),
                ('onehot', ce.OneHotEncoder(use_cat_names=True,cols=hot))
            ])
df_l = df_.shape[0]
both = pd.concat([df_,dft])
print(df_l)
both.head()

59400


Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,timestamp,region_district
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,...,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1.300061,Iringa_5
8776,0.0,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,...,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.362528,Mara_2
34310,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,...,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1.36175,Manyara_4
67743,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,...,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1.359331,Mtwara_63
19728,0.0,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,...,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.310515,Kagera_1


In [18]:
both_  = encoders.fit_transform(both)

In [19]:
df__  = both_.iloc[0:df_l]
dft_ = both_.iloc[df_l:]
df_l, both_.shape, df__.shape, dft_.shape

(59400, (73758, 404), (59400, 404), (14358, 404))

In [20]:
def pump_baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(8, input_dim=404, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [22]:
estimator = KerasClassifier(build_fn=pump_baseline_model, epochs=6000, batch_size=5, verbose=0)

In [23]:
# results = cross_val_score(estimator, df__, dummy_y, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
model = pump_baseline_model()
model.fit(x=df__, y=dummy_y,epochs=6000)

Epoch 1/6000
Epoch 2/6000
Epoch 3/6000
Epoch 4/6000
Epoch 5/6000
Epoch 6/6000
Epoch 7/6000
Epoch 8/6000
Epoch 9/6000
Epoch 10/6000
Epoch 11/6000
Epoch 12/6000
Epoch 13/6000
Epoch 14/6000
Epoch 15/6000
Epoch 16/6000
Epoch 17/6000
Epoch 18/6000
Epoch 19/6000
Epoch 20/6000
Epoch 21/6000
Epoch 22/6000
Epoch 23/6000
Epoch 24/6000
Epoch 25/6000
Epoch 26/6000
Epoch 27/6000
Epoch 28/6000
Epoch 29/6000
Epoch 30/6000
Epoch 31/6000
Epoch 32/6000
Epoch 33/6000
Epoch 34/6000
Epoch 35/6000
Epoch 36/6000
Epoch 37/6000
Epoch 38/6000
Epoch 39/6000
Epoch 40/6000
Epoch 41/6000
Epoch 42/6000
Epoch 43/6000
Epoch 44/6000
Epoch 45/6000
Epoch 46/6000
Epoch 47/6000
Epoch 48/6000
Epoch 49/6000
Epoch 50/6000
Epoch 51/6000
Epoch 52/6000
Epoch 53/6000
Epoch 54/6000
Epoch 55/6000
Epoch 56/6000
Epoch 57/6000
Epoch 58/6000
Epoch 59/6000
Epoch 60/6000
Epoch 61/6000
Epoch 62/6000
Epoch 63/6000
Epoch 64/6000
Epoch 65/6000
Epoch 66/6000
Epoch 67/6000
Epoch 68/6000
Epoch 69/6000
Epoch 70/6000
Epoch 71/6000
Epoch 72/6000
E

Epoch 81/6000
Epoch 82/6000
Epoch 83/6000
Epoch 84/6000
Epoch 85/6000
Epoch 86/6000
Epoch 87/6000
Epoch 88/6000
Epoch 89/6000
Epoch 90/6000
Epoch 91/6000
Epoch 92/6000
Epoch 93/6000
Epoch 94/6000
Epoch 95/6000
Epoch 96/6000
Epoch 97/6000
Epoch 98/6000
Epoch 99/6000
Epoch 100/6000
Epoch 101/6000
Epoch 102/6000
Epoch 103/6000
Epoch 104/6000
Epoch 105/6000
Epoch 106/6000
Epoch 107/6000
Epoch 108/6000
Epoch 109/6000
Epoch 110/6000
Epoch 111/6000
Epoch 112/6000
Epoch 113/6000
Epoch 114/6000
Epoch 115/6000
Epoch 116/6000
Epoch 117/6000
Epoch 118/6000
Epoch 119/6000
Epoch 120/6000
Epoch 121/6000
Epoch 122/6000
Epoch 123/6000
Epoch 124/6000
Epoch 125/6000
Epoch 126/6000
Epoch 127/6000
Epoch 128/6000
Epoch 129/6000
Epoch 130/6000
Epoch 131/6000
Epoch 132/6000
Epoch 133/6000
Epoch 134/6000
Epoch 135/6000
Epoch 136/6000
Epoch 137/6000
Epoch 138/6000
Epoch 139/6000
Epoch 140/6000
Epoch 141/6000
Epoch 142/6000
Epoch 143/6000
Epoch 144/6000
Epoch 145/6000
Epoch 146/6000
Epoch 147/6000
Epoch 148/6000

Epoch 160/6000
Epoch 161/6000
Epoch 162/6000
Epoch 163/6000
Epoch 164/6000
Epoch 165/6000
Epoch 166/6000
Epoch 167/6000
Epoch 168/6000
Epoch 169/6000
Epoch 170/6000
Epoch 171/6000
Epoch 172/6000
Epoch 173/6000
Epoch 174/6000
Epoch 175/6000
Epoch 176/6000
Epoch 177/6000
Epoch 178/6000
Epoch 179/6000
Epoch 180/6000
Epoch 181/6000
Epoch 182/6000
Epoch 183/6000
Epoch 184/6000
Epoch 185/6000
Epoch 186/6000
Epoch 187/6000
Epoch 188/6000
Epoch 189/6000
Epoch 190/6000
Epoch 191/6000
Epoch 192/6000
Epoch 193/6000
Epoch 194/6000
Epoch 195/6000
Epoch 196/6000
Epoch 197/6000
Epoch 198/6000
Epoch 199/6000
Epoch 200/6000
Epoch 201/6000
Epoch 202/6000
Epoch 203/6000
Epoch 204/6000
Epoch 205/6000
Epoch 206/6000
Epoch 207/6000
Epoch 208/6000
Epoch 209/6000
Epoch 210/6000
Epoch 211/6000
Epoch 212/6000
Epoch 213/6000
Epoch 214/6000
Epoch 215/6000
Epoch 216/6000
Epoch 217/6000
Epoch 218/6000
Epoch 219/6000
Epoch 220/6000
Epoch 221/6000
Epoch 222/6000
Epoch 223/6000
Epoch 224/6000
Epoch 225/6000
Epoch 226/

Epoch 238/6000
Epoch 239/6000
Epoch 240/6000
Epoch 241/6000
Epoch 242/6000
Epoch 243/6000
Epoch 244/6000
Epoch 245/6000
Epoch 246/6000
Epoch 247/6000
Epoch 248/6000
Epoch 249/6000
Epoch 250/6000
Epoch 251/6000
Epoch 252/6000
Epoch 253/6000
Epoch 254/6000
Epoch 255/6000
Epoch 256/6000
Epoch 257/6000
Epoch 258/6000
Epoch 259/6000
Epoch 260/6000
Epoch 261/6000
Epoch 262/6000
Epoch 263/6000
Epoch 264/6000
Epoch 265/6000
Epoch 266/6000
Epoch 267/6000
Epoch 268/6000
Epoch 269/6000
Epoch 270/6000
Epoch 271/6000
Epoch 272/6000
Epoch 273/6000
Epoch 274/6000
Epoch 275/6000
Epoch 276/6000
Epoch 277/6000
Epoch 278/6000
Epoch 279/6000
Epoch 280/6000
Epoch 281/6000
Epoch 282/6000
Epoch 283/6000
Epoch 284/6000
Epoch 285/6000
Epoch 286/6000
Epoch 287/6000
Epoch 288/6000
Epoch 289/6000
Epoch 290/6000
Epoch 291/6000
Epoch 292/6000
Epoch 293/6000
Epoch 294/6000
Epoch 295/6000
Epoch 296/6000
Epoch 297/6000
Epoch 298/6000
Epoch 299/6000
Epoch 300/6000
Epoch 301/6000
Epoch 302/6000
Epoch 303/6000
Epoch 304/

Epoch 316/6000
Epoch 317/6000
Epoch 318/6000
Epoch 319/6000
Epoch 320/6000
Epoch 321/6000
Epoch 322/6000
Epoch 323/6000
Epoch 324/6000
Epoch 325/6000
Epoch 326/6000
Epoch 327/6000
Epoch 328/6000
Epoch 329/6000
Epoch 330/6000
Epoch 331/6000
Epoch 332/6000
Epoch 333/6000
Epoch 334/6000
Epoch 335/6000
Epoch 336/6000
Epoch 337/6000
Epoch 338/6000
Epoch 339/6000
Epoch 340/6000
Epoch 341/6000
Epoch 342/6000
Epoch 343/6000
Epoch 344/6000
Epoch 345/6000
Epoch 346/6000
Epoch 347/6000
Epoch 348/6000
Epoch 349/6000
Epoch 350/6000
Epoch 351/6000
Epoch 352/6000
Epoch 353/6000
Epoch 354/6000
Epoch 355/6000
Epoch 356/6000
Epoch 357/6000
Epoch 358/6000
Epoch 359/6000
Epoch 360/6000

In [127]:
p = model.predict(dft_, batch_size=None, verbose=0, steps=None)
p

array([[0.22859693, 0.03930179, 0.73210126],
       [0.5831133 , 0.04652163, 0.37036508],
       [0.46959814, 0.21885042, 0.31155148],
       ...,
       [0.92457575, 0.0597909 , 0.01563349],
       [0.9147806 , 0.03263792, 0.05258147],
       [0.00662169, 0.00510542, 0.98827285]], dtype=float32)

In [128]:
p.shape

(14358, 3)

In [133]:
with open('testxgb.keras.csv', 'w') as f:
    f.write('id,status_group\n')
    for fu,i in zip(p, dft_.index):
        index = np.argmax(fu)
        d = 'non functional' if index == 2 else ('functional' if index == 0 else 'functional needs repair')
        f.write(f"{i},{d}")
        f.write('\n')

Your submission scored 0.75915