In [1]:
## Run this notebook only after the EDA notebook: it reads the datasets that notebook produces.

In [2]:
!pip install Catboost 

Collecting Catboost
  Downloading catboost-1.0.5-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 52 kB/s 
Installing collected packages: Catboost
Successfully installed Catboost-1.0.5


In [6]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import requests
from io import StringIO 

In [7]:
pd.set_option('display.max_columns',63)

In [8]:
# Read the newly formed data produced by the EDA notebook.

train = 'https://drive.google.com/file/d/1ERjbGJ3JRLjJhnizgAWPu48uxmGyGZQP/view?usp=sharing'
test = 'https://drive.google.com/file/d/11gTjCB2mYaz2wyBK9vRIGU71gfjOR1H2/view?usp=sharing'

# function to read a csv file shared via google which returns a dataframe

def read_csv(url):
    """Download a CSV shared through a Google Drive link and parse it.

    Parameters
    ----------
    url : str
        Sharing link whose second-to-last ``/``-separated segment is the
        file id (the ``.../file/d/<id>/view?...`` links used in this
        notebook have that shape).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the download request comes back with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    file_id = url.split('/')[-2]
    download_url = 'https://drive.google.com/uc?export=download&id=' + file_id
    # Bound the wait and fail loudly on HTTP errors so that an error page is
    # never silently parsed as if it were the CSV.
    response = requests.get(download_url, timeout=60)
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))

# Creating training, testing dataframes.
# Note: `train` and `test` hold the sharing URLs up to this point and are
# rebound here to the downloaded DataFrames, so these two lines must run after
# the URL assignments earlier in this cell.
train = read_csv(train)
test = read_csv(test)

## Preprocessing

In [9]:
# Helper that strips surrounding whitespace from one piece of an address
def striper(sentence):
    """Return ``sentence`` with leading and trailing whitespace removed."""
    trimmed = sentence.strip()
    return trimmed

#All provinces in RSA
province = [
    'Eastern Cape',
    'Free State',
    'Gauteng',
    'KwaZulu-Natal',
    'Limpopo',
    'Mpumalanga',
    'Northern Cape',
    'North West',
    'Western Cape',
]

#A function to find which province a ward belongs to
def add_variables(sentence):
    """Return the first comma-separated part of ``sentence`` that is a province.

    Each part is stripped of surrounding whitespace and looked up in the
    module-level ``province`` list. ``None`` is returned when no part matches.
    """
    for part in sentence.split(','):
        candidate = part.strip()
        if candidate in province:
            return candidate
    return None


In [10]:
# Create the province column in the train data from each address.
# Series.map keeps an unmatched address as a missing value, whereas
# np.vectorize infers its output dtype from the first result and can turn a
# later None into the string 'None'.
train['province'] = train['address'].map(add_variables)
train.shape

(2822, 66)

In [11]:
# Create the province column in the test data from each address.
# Series.map keeps an unmatched address as a missing value, whereas
# np.vectorize infers its output dtype from the first result and can turn a
# later None into the string 'None'.
test['province'] = test['address'].map(add_variables)
test.shape

(1013, 65)

In [12]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the rows the same way and, like append, keeps each frame's
# own index labels.
data = pd.concat([train, test])
data['province'].value_counts()

KwaZulu-Natal    715
Eastern Cape     626
Gauteng          469
Limpopo          453
Western Cape     387
Mpumalanga       356
North West       345
Free State       308
Northern Cape    176
Name: province, dtype: int64

In [13]:
## features transform

def transform(data):
    """Add four derived feature columns to ``data`` in place.

    Columns written:
      * ``pw1``     = 1 - (pw_00 + pw_06); per the original author's note this
                      folds the other piped-water columns into a single one.
      * ``rich``    = mean of car_00, lgt_00 and pw_00; described by the
                      original author as having car, light and water.
      * ``psa_car`` = psa_00 / (car_00 + car_01).
      * ``latlon``  = |lat| + |lon|.

    Returns None; the frame passed in is modified.
    """
    piped_kept = data['pw_00'] + data['pw_06']
    data['pw1'] = 1 - piped_kept

    asset_total = data['car_00'] + data['lgt_00'] + data['pw_00']
    data['rich'] = asset_total / 3

    car_total = data['car_00'] + data['car_01']
    data['psa_car'] = data['psa_00'] / car_total

    data['latlon'] = data['lat'].abs() + data['lon'].abs()

In [14]:
## features removing
## remove columns that are not useful
def remove(data):
    """Drop a fixed set of columns from ``data`` in place.

    The groups are dropped one after another in the order listed; a missing
    column raises ``KeyError`` at its group. Returns None.
    """
    column_groups = (
        ['total_individuals'],
        ['total_households'],
        ['address'],
        ['lgt_00'],
        ['psa_02', 'psa_03'],
        ['lan_02', 'lan_03', 'lan_04', 'lan_05', 'lan_08', 'lan_07'],
        ['dw_00', 'dw_02', 'dw_06'],
        ['pw_01', 'pw_02', 'pw_03', 'pw_04', 'pw_05', 'pw_07', 'pw_08'],
    )
    for group in column_groups:
        data.drop(columns=group, inplace=True)

In [15]:
## remove integer-typed columns, except province

def remove_(data):
    """Drop every integer-typed column of ``data`` in place, except 'province'.

    Returns None.
    """
    integer_view = data.select_dtypes(int)
    doomed = [name for name in integer_view.columns if name != 'province']
    for name in doomed:
        data.drop(columns=[name], inplace=True)

In [16]:
## encode the province column 

def encoder(data):
    """Replace the province names in ``data['province']`` with integer codes.

    The frame is modified in place and nothing is returned. A value that is
    not a key of the mapping below becomes NaN, as ``Series.map`` does for
    dictionary lookups that miss.
    """
    code = {
        'KwaZulu-Natal':8,
        'Eastern Cape':7,
        'Gauteng':6,
        'Limpopo':5,
        'Western Cape':4,
        'Mpumalanga':3,
        'North West':2,
        'Free State':1,
        'Northern Cape':0
    }
    # Rebind the column instead of writing through ``.loc[:, 'province']``.
    # On recent pandas the ``.loc`` form writes into the existing string
    # column and keeps its dtype, so the codes would not be stored as numbers.
    data['province'] = data['province'].map(code)
    

In [17]:
## define a processing function 

def preprocessing(data):
    """Run the preprocessing steps on ``data`` in place, in a fixed order.

    Order: ``encoder`` -> ``transform`` -> ``remove`` -> ``remove_``.
    ``transform`` reads 'lgt_00', which ``remove`` then drops, so the order
    must be kept. Returns None.
    """
    for step in (encoder, transform, remove, remove_):
        step(data)

In [18]:
# Both frames are modified in place. The steps drop columns that they also
# read (e.g. 'lgt_00'), so running this cell a second time on the same frames
# raises KeyError; re-run the data-loading cells first.
preprocessing(train)
preprocessing(test)

In [19]:
### preparing data for modelling

In [20]:
# Columns kept out of the model inputs: 'target' is the label, and 'ward' and
# 'ADM4_PCODE' are presumably identifiers (verify against the dataset).
cols = ['ward','ADM4_PCODE','target']

In [21]:
# Feature names: every training column except the ones listed in `cols`.
in_cols = list(train.columns)
for col in cols:
    in_cols.remove(col)
# Name of the label column.
ycol = 'target'

In [22]:
## getting X and y

# Feature matrix: the training frame without the columns listed in `cols`.
X = train.drop(columns=cols)
# Label vector.
y = train['target']

## Modelling

In [23]:
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [24]:
## function to fit, predict and write the submission file with CatBoost

def submission(name,num,model):
    """Cross-validate ``model`` and write fold-averaged test predictions to CSV.

    Uses the module-level ``X``, ``y``, ``test`` and ``in_cols``. The model is
    refit on each of 5 shuffled folds (seed 1234), the RMSE on the held-out
    fold is printed, and the predictions for ``test[in_cols]`` are collected.
    The mean of those predictions is stored in ``test['target']`` and the
    'ward' and 'target' columns are written to ``"{name}_sub{num}.csv"``.

    Parameters
    ----------
    name : str
        Prefix of the output file name.
    num : int or str
        Submission number used in the output file name.
    model : estimator
        Object whose ``fit`` accepts ``eval_set`` and
        ``early_stopping_rounds`` and which provides ``predict``
        (a CatBoost regressor in this notebook).

    Returns
    -------
    None
    """
    fold_errors = []
    test_predictions = []
    fold = KFold(n_splits=5, shuffle=True, random_state=1234)
    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # KFold yields positions, so y is indexed positionally too. Plain
        # y[...] is label-based and is only right for a default RangeIndex.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # NOTE(review): the held-out fold is also passed as an eval set for
        # early stopping, so the printed score is probably optimistic.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  early_stopping_rounds=100)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print("err: ", rmse)
        fold_errors.append(rmse)
        test_predictions.append(model.predict(test[in_cols]))

    print("Average score in 5-fold CV:", np.mean(fold_errors))
    # Average the per-fold predictions element-wise (axis 0 = folds).
    test['target'] = np.mean(test_predictions, 0)
    test[['ward', 'target']].to_csv(f"{name}_sub{num}.csv", index=False)

In [25]:
## define the model to use
# CatBoost regressor optimising RMSE with depth-6 trees; metric_period=20
# makes the training log report metrics every 20 iterations.
model = CatBoostRegressor(
    loss_function='RMSE',
    depth=6,
    metric_period=20,
)

In [26]:
#submit using the submission function
# Writes the predictions to "Catboost2_sub19.csv".
submission(name='Catboost2', num=19, model=model)

Learning rate set to 0.057863
0:	learn: 9.8563277	test: 9.8563277	test1: 9.9254270	best: 9.9254270 (0)	total: 59.6ms	remaining: 59.6s




20:	learn: 5.1938293	test: 5.1938293	test1: 5.3419404	best: 5.3419404 (20)	total: 289ms	remaining: 13.5s
40:	learn: 3.9411130	test: 3.9411130	test1: 4.1512472	best: 4.1512472 (40)	total: 512ms	remaining: 12s
60:	learn: 3.5392569	test: 3.5392569	test1: 3.7750591	best: 3.7750591 (60)	total: 737ms	remaining: 11.3s
80:	learn: 3.3331778	test: 3.3331778	test1: 3.6077995	best: 3.6077995 (80)	total: 974ms	remaining: 11s
100:	learn: 3.1861317	test: 3.1861317	test1: 3.5288427	best: 3.5288427 (100)	total: 1.22s	remaining: 10.9s
120:	learn: 3.0492721	test: 3.0492721	test1: 3.4670167	best: 3.4670167 (120)	total: 1.46s	remaining: 10.6s
140:	learn: 2.9393384	test: 2.9393384	test1: 3.4149499	best: 3.4149499 (140)	total: 1.7s	remaining: 10.3s
160:	learn: 2.8222386	test: 2.8222386	test1: 3.3735105	best: 3.3735105 (160)	total: 1.95s	remaining: 10.1s
180:	learn: 2.7007990	test: 2.7007990	test1: 3.3329355	best: 3.3329355 (180)	total: 2.2s	remaining: 9.97s
200:	learn: 2.5883292	test: 2.5883292	test1: 3.2914



20:	learn: 5.2024846	test: 5.2024846	test1: 5.2105942	best: 5.2105942 (20)	total: 262ms	remaining: 12.2s
40:	learn: 3.9355900	test: 3.9355900	test1: 4.0444020	best: 4.0444020 (40)	total: 493ms	remaining: 11.5s
60:	learn: 3.5396663	test: 3.5396663	test1: 3.7605436	best: 3.7605436 (60)	total: 739ms	remaining: 11.4s
80:	learn: 3.3230408	test: 3.3230408	test1: 3.6334522	best: 3.6334522 (80)	total: 991ms	remaining: 11.2s
100:	learn: 3.1640965	test: 3.1640965	test1: 3.5507671	best: 3.5507671 (100)	total: 1.24s	remaining: 11s
120:	learn: 3.0397174	test: 3.0397174	test1: 3.4957256	best: 3.4957256 (120)	total: 1.48s	remaining: 10.8s
140:	learn: 2.9215170	test: 2.9215170	test1: 3.4593906	best: 3.4593906 (140)	total: 1.72s	remaining: 10.5s
160:	learn: 2.8020827	test: 2.8020827	test1: 3.4156653	best: 3.4156653 (160)	total: 1.97s	remaining: 10.3s
180:	learn: 2.6961308	test: 2.6961308	test1: 3.3787320	best: 3.3787320 (180)	total: 2.22s	remaining: 10s
200:	learn: 2.5957126	test: 2.5957126	test1: 3.33



20:	learn: 5.2050436	test: 5.2050436	test1: 4.9954218	best: 4.9954218 (20)	total: 287ms	remaining: 13.4s
40:	learn: 3.9267711	test: 3.9267711	test1: 4.0857327	best: 4.0857327 (40)	total: 555ms	remaining: 13s
60:	learn: 3.4960248	test: 3.4960248	test1: 3.8417237	best: 3.8417237 (60)	total: 798ms	remaining: 12.3s
80:	learn: 3.2845016	test: 3.2845016	test1: 3.7398957	best: 3.7398957 (80)	total: 1.04s	remaining: 11.9s
100:	learn: 3.1360639	test: 3.1360639	test1: 3.6812406	best: 3.6812406 (100)	total: 1.28s	remaining: 11.4s
120:	learn: 3.0106690	test: 3.0106690	test1: 3.6377172	best: 3.6377172 (120)	total: 1.54s	remaining: 11.2s
140:	learn: 2.9031519	test: 2.9031519	test1: 3.6063746	best: 3.6045526 (139)	total: 1.78s	remaining: 10.9s
160:	learn: 2.8056462	test: 2.8056462	test1: 3.5668074	best: 3.5668074 (160)	total: 2.02s	remaining: 10.5s
180:	learn: 2.6882847	test: 2.6882847	test1: 3.5303978	best: 3.5303978 (180)	total: 2.26s	remaining: 10.2s
200:	learn: 2.5696211	test: 2.5696211	test1: 3.



20:	learn: 5.1390422	test: 5.1390422	test1: 5.5834936	best: 5.5834936 (20)	total: 247ms	remaining: 11.5s
40:	learn: 3.9083558	test: 3.9083558	test1: 4.2791114	best: 4.2791114 (40)	total: 474ms	remaining: 11.1s
60:	learn: 3.5173778	test: 3.5173778	test1: 3.8977534	best: 3.8977534 (60)	total: 689ms	remaining: 10.6s
80:	learn: 3.3037042	test: 3.3037042	test1: 3.7230522	best: 3.7230522 (80)	total: 906ms	remaining: 10.3s
100:	learn: 3.1419284	test: 3.1419284	test1: 3.6226548	best: 3.6226548 (100)	total: 1.15s	remaining: 10.2s
120:	learn: 3.0201356	test: 3.0201356	test1: 3.5697019	best: 3.5697019 (120)	total: 1.4s	remaining: 10.2s
140:	learn: 2.8980258	test: 2.8980258	test1: 3.5119852	best: 3.5119852 (140)	total: 1.64s	remaining: 9.99s
160:	learn: 2.7796087	test: 2.7796087	test1: 3.4573928	best: 3.4573928 (160)	total: 1.87s	remaining: 9.73s
180:	learn: 2.6737835	test: 2.6737835	test1: 3.4250067	best: 3.4222769 (179)	total: 2.11s	remaining: 9.53s
200:	learn: 2.5687932	test: 2.5687932	test1: 3



20:	learn: 5.1587391	test: 5.1587391	test1: 5.2583327	best: 5.2583327 (20)	total: 244ms	remaining: 11.4s
40:	learn: 3.9100940	test: 3.9100940	test1: 4.0725805	best: 4.0725805 (40)	total: 456ms	remaining: 10.7s
60:	learn: 3.4924024	test: 3.4924024	test1: 3.7661564	best: 3.7661564 (60)	total: 671ms	remaining: 10.3s
80:	learn: 3.2716895	test: 3.2716895	test1: 3.6517013	best: 3.6517013 (80)	total: 888ms	remaining: 10.1s
100:	learn: 3.1301418	test: 3.1301418	test1: 3.5852267	best: 3.5852267 (100)	total: 1.13s	remaining: 10.1s
120:	learn: 3.0042387	test: 3.0042387	test1: 3.5336790	best: 3.5336790 (120)	total: 1.38s	remaining: 10s
140:	learn: 2.8954298	test: 2.8954298	test1: 3.4898725	best: 3.4898725 (140)	total: 1.63s	remaining: 9.91s
160:	learn: 2.7892354	test: 2.7892354	test1: 3.4544884	best: 3.4544884 (160)	total: 1.87s	remaining: 9.74s
180:	learn: 2.6847128	test: 2.6847128	test1: 3.4072370	best: 3.4072370 (180)	total: 2.1s	remaining: 9.53s
200:	learn: 2.5818088	test: 2.5818088	test1: 3.3