In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score as acc
from sklearn.preprocessing import StandardScaler as ss
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR
import category_encoders as ce
%matplotlib inline
import matplotlib.pyplot as plt

In [245]:
train_features = pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f')
test_features = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')
train_features.shape, train_labels.shape, test_features.shape, sample_submission.shape

((59400, 40), (59400, 2), (14358, 40), (14358, 2))

In [76]:
train_labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [139]:
train_labels['baseline'] = 'functional'

In [79]:
train_labels.shape

(59400, 3)

In [92]:
train_features.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [41]:
train_features['construction_year'].value_counts()

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [246]:
train_features['year'] = train_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
test_features['year'] = test_features['construction_year'].map(lambda x: np.nan if x == 0 else int(x))
train_year_mean = int(train_features['year'].mean(skipna=True))
test_year_mean = int(test_features['year'].mean(skipna=True))
train_features['year'] = train_features['year'].fillna(train_year_mean)
test_features['year'] = test_features['year'].fillna(test_year_mean)
train_features['year'] = train_features['year'].astype(int)
test_features['year'] = test_features['year'].astype(int)
train_features['year'] = pd.to_datetime(train_features['year'], format='%Y')
test_features['year'] = pd.to_datetime(test_features['year'], format='%Y')
train_features.shape, test_features.shape

((59400, 41), (14358, 41))

In [224]:
now = pd.Timestamp.now()
now

Timestamp('2019-06-03 22:45:19.901951')

In [247]:
train_features['age'] = (now - train_features['year']).dt.days
test_features['age'] = (now - test_features['year']).dt.days
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [43]:
train_features['population'].value_counts()

0       21381
1        7025
200      1940
150      1892
250      1681
300      1476
100      1146
50       1139
500      1009
350       986
120       916
400       775
60        706
30        626
40        552
80        533
450       499
20        462
600       438
230       388
75        289
1000      278
800       269
90        265
130       264
25        255
320       249
35        245
360       222
140       215
        ...  
8848        1
628         1
4520        1
468         1
693         1
725         1
789         1
821         1
5300        1
3127        1
2345        1
3031        1
886         1
392         1
424         1
2807        1
726         1
694         1
2569        1
4788        1
662         1
4660        1
406         1
1032        1
1160        1
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64

In [248]:
train_features['population'] = train_features['population'].map(lambda x: np.nan if x == 0 else x)
test_features['population'] = test_features['population'].map(lambda x: np.nan if x == 0 else x)
train_features['population'].fillna(train_features['population'].mean(skipna=True), inplace=True)
test_features['population'].fillna(test_features['population'].mean(skipna=True), inplace=True)
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [45]:
train_features['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [46]:
train_features['source_type'].value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

In [47]:
train_features['source_class'].value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

In [48]:
train_features['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [50]:
train_features['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [51]:
train_features['quality_group'].value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

In [53]:
train_features['extraction_type_class'].value_counts()

gravity         26780
handpump        16456
other            6430
submersible      6179
motorpump        2987
rope pump         451
wind-powered      117
Name: extraction_type_class, dtype: int64

In [81]:
train_features['date_recorded'].value_counts()

2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
2011-03-18    497
2011-03-19    466
2013-02-04    464
2013-01-29    459
2011-03-04    458
2013-02-14    444
2013-01-24    435
2011-03-05    434
2013-02-15    429
2013-03-15    428
2011-03-11    426
2013-01-30    421
2013-02-16    418
2011-03-23    417
2011-03-09    416
2013-01-18    409
2011-03-30    391
2013-02-26    391
2011-03-24    381
2013-03-19    381
2013-02-13    380
2013-01-23    379
2011-03-12    379
2011-03-03    378
2013-01-28    376
             ... 
2011-09-06      1
2004-07-01      1
2002-10-14      1
2011-08-30      1
2011-09-26      1
2004-01-07      1
2013-01-06      1
2004-04-01      1
2011-09-27      1
2011-09-18      1
2011-09-19      1
2004-03-06      1
2011-09-17      1
2011-09-12      1
2011-09-11      1
2011-09-09      1
2011-09-25      1
2011-09-01      1
2011-09-21      1
2011-09-15      1
2004-04-05      1
2011-09-16      1
2011-09-13      1
2011-09-14      1
2011-09-20

In [249]:
train_features['date_recorded'] = pd.to_datetime(train_features['date_recorded'])
test_features['date_recorded'] = pd.to_datetime(test_features['date_recorded'])
train_features['since_recording'] = (now - train_features['date_recorded']).dt.days
test_features['since_recording'] = (now - test_features['date_recorded']).dt.days
train_features.shape, test_features.shape

((59400, 43), (14358, 43))

In [201]:
train_features['since_recording'].head()

0    3003
1    2280
2    2289
3    2317
4    2882
Name: since_recording, dtype: int64

In [250]:
train_features = train_features.drop(columns='construction_year')
test_features = test_features.drop(columns='construction_year')
train_features.shape, test_features.shape

((59400, 42), (14358, 42))

In [254]:
train_features.select_dtypes('number').columns.drop('id').tolist()

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'age',
 'since_recording']

In [251]:
cat_features = ['extraction_type_group',
                'source_type',
                'quality_group',
                'extraction_type',
                'quantity_group',
                'management_group',
                'basin',
                'payment_type',
                'permit',
                'payment'
                ]
num_features = train_features.select_dtypes('number').columns.drop('id').tolist()
features = cat_features + num_features

train = train_features[features]
y_train = train_labels['status_group']

test = test_features[features]

X_train, X_val, y_train, y_val = tts(train,
                                     y_train,
                                     train_size=.8,
                                     test_size=.2,
                                     stratify=y_train,
                                     random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, test.shape)

encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_enc = encoder.fit_transform(X_train)
X_val_enc = encoder.transform(X_val)
X_test_features = encoder.transform(test)
print(X_train_enc.shape, X_val_enc.shape, test_features.shape)
scaler = ss()
X_train_scaled = scaler.fit_transform(X_train_enc)
X_val_scaled = scaler.transform(X_val_enc)
test_features_scaled = scaler.transform(X_test_features)
print(X_train_scaled.shape, X_val_scaled.shape, test_features_scaled.shape)



(47520, 20) (11880, 20) (47520,) (11880,) (14358, 20)
(47520, 100) (11880, 100) (14358, 42)
(47520, 100) (11880, 100) (14358, 100)


In [252]:
model = LR(solver='liblinear',
           multi_class='auto',
           max_iter=1000,
           penalty='l2',
#            l1_ratio=.5
          )
model.fit(X_train_scaled, y_train)

model.score(X_val_scaled, y_val)

0.7233164983164984

In [140]:
acc(train_labels['status_group'], train_labels['baseline'])

0.543080808080808

In [253]:
predicted = model.predict(test_features_scaled)
submission = sample_submission.copy()
submission['status_group'] = predicted
submission.to_csv('sub_1.csv', index=False)