# Pump it UP!
## Data loading and Exploration 

In [2]:
import numpy as np
from numpy import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
train_l = pd.read_csv("./train_labels.csv")

In [4]:
test_p = pd.read_csv("water_prepared.csv")

In [5]:
train = test_p.iloc[:, :-3] #remove the last 3 varaiables

In [6]:
train_1 = pd.merge(train,train_l, on = 'id')

# Handle NA's Train

In [7]:
#This should strip the empty spaces but I'm not sure if it happened
categorical_features = pd.DataFrame(train_1.describe(include = ['O'])).columns

for c in categorical_features:
    train_1.replace(" ", "")

In [8]:
#drop some columns we don't need or are duplicated
cols = ('Index','id','gps_height','wpt_name','subvillage','num_private','region',
            'lga','ward','public_meeting','recorded_by','scheme_name','extraction_type', 
            'extraction_type_group','management_group','payment','quality_group','quantity_group',
            'source_class','waterpoint_type','basin')
#check gps hight 
for c in cols :
    train_1.drop(c,inplace = True, axis = 1)

In [9]:
#Change date_Recorded to datetime
from datetime import datetime, timedelta

datedf = train_1['date_recorded']
train_1['date_recorded'] = pd.to_datetime(train_1['date_recorded'], yearfirst = True, format = '%Y-%m-%d')

## Hidden NA's and false 0's

In [10]:
#Which columns are suspects to have false 0's that are actually na's?
print((train_1 == 0).sum())

amount_tsh               41639
date_recorded                0
funder                       0
installer                    0
longitude                 1812
latitude                     0
region_code                  0
district_code               23
population               21381
scheme_management            0
permit                   38852
construction_year        20709
extraction_type_class        0
management                   0
payment_type                 0
water_quality                0
quantity                     0
source                       0
source_type                  0
waterpoint_type_group        0
status_group                 0
dtype: int64


In [11]:
#convert 0's to Na's
nas = ('construction_year', 'amount_tsh', 'longitude', 'district_code', 'permit', 'construction_year') 

for i in train_1.columns:
    if i in nas:
        train_1.replace(0, np.NaN, inplace = True)
        
## LOOP IS NOT WORKING. IT REMOVES ALL 0'S INSTEAD OF THE ONES IN THE NA LIST ONLY

In [12]:
#Which columns are suspects to have false 0's that are actually na's?
print((train_1 == 0).sum())

amount_tsh               0
date_recorded            0
funder                   0
installer                0
longitude                0
latitude                 0
region_code              0
district_code            0
population               0
scheme_management        0
permit                   0
construction_year        0
extraction_type_class    0
management               0
payment_type             0
water_quality            0
quantity                 0
source                   0
source_type              0
waterpoint_type_group    0
status_group             0
dtype: int64


In [13]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
query = 'WORLDBANK'
choices = train_1['installer']
# Get a list of matches ordered by score, default limit to 5
process.extract(query, choices)

[('WORLDBANK', 100, 141),
 ('WORLDBANK', 100, 280),
 ('WORLDBANK', 100, 304),
 ('WORLDBANK', 100, 349),
 ('WORLDBANK', 100, 399)]

In [14]:
from difflib import SequenceMatcher as SM
SM(None, query, choices).ratio()

0.0001346597316904846

## Incomplete Cases and missing values

In [15]:
objects = []
for i in train_1.columns:
    if train_1[i].dtype == object:
        objects.append(i)

train_1.update(train_1[objects].fillna('None'))

nulls = np.sum(train_1.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = train_1.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)
print(info)
print("There are", len(nullcols), "columns with missing values")

                       0        1
permit             41908  float64
amount_tsh         41639  float64
population         21381  float64
construction_year  20709  float64
longitude           1812  float64
district_code         23  float64
There are 6 columns with missing values


In [16]:
#create object with datatype categorical
objects3 = []
for i in train_1.columns:
    if train_1[i].dtype == 'object':
        objects3.append(i)
        
print("Training Set incomplete cases")

sums_train_1 = train_1[objects3].apply(lambda x: len(np.unique(x)))
sums_train_1.sort_values(ascending=False)

Training Set incomplete cases


installer                1903
funder                   1889
scheme_management          13
management                 12
source                     10
source_type                 7
payment_type                7
extraction_type_class       7
waterpoint_type_group       6
quantity                    5
status_group                3
water_quality               2
dtype: int64

In [17]:
#create object with datatype integer and check incomplete cases
num1 = []
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
for i in train_1.columns:
    if train_1[i].dtype in numeric_dtypes:
        num1.append(i)
        
print("Training Set incomplete cases")

sums_train_1 = train_1[num1].apply(lambda x: len(np.unique(x)))
sums_train_1.sort_values(ascending=False)

Training Set incomplete cases


longitude            59327
latitude             57517
permit               41909
amount_tsh           41736
population           22429
construction_year    20763
district_code           42
region_code             27
dtype: int64

### Remove skewness and imputation

In [18]:
from scipy.stats import skew

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in train_1.columns:
    if train_1[i].dtype in numeric_dtypes:
        numerics2.append(i)
        
skew_train_1 = train_1[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)
skews = pd.DataFrame({'skew': skew_train_1})
skews

Unnamed: 0,skew
region_code,3.173738
latitude,-0.152033
amount_tsh,
longitude,
district_code,
population,
permit,
construction_year,


In [19]:
#Check percentages of NA's
total = train_1.isnull().sum().sort_values(ascending=False)
percent = (train_1.isnull().sum()/train_1.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

Unnamed: 0,Total,Percent
permit,41908,0.705522
amount_tsh,41639,0.700993
population,21381,0.359949
construction_year,20709,0.348636
longitude,1812,0.030505
district_code,23,0.000387
source_type,0,0.0
date_recorded,0,0.0
funder,0,0.0
installer,0,0.0


In [20]:
#Impute NA's with median
train_1['construction_year'].fillna(train_1['construction_year'].mean(), inplace=True)
#Total static head (amount water available to waterpoint)
train_1['amount_tsh'].fillna(train_1['amount_tsh'].mean(), inplace=True)


In [21]:
#from fancyimpute import KNN
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = train_1.select_dtypes(include=numerics)

        # I now run fancyimpute KNN - it returns a np.array which I store as a pandas dataframe
#df_complete = pd.DataFrame(KNN(3).complete(newdf))
#df_complete.to_csv('clean_numerics.csv')

df_complete = pd.read_csv('clean_numerics.csv')

In [22]:
#Rearrange dataset
df_complete.columns = newdf.columns
df_complete.index = newdf.index

#replace values in original df
train_1['amount_tsh'] = df_complete['amount_tsh'].values
train_1['longitude'] = df_complete['longitude'].values
train_1['latitude'] = df_complete['latitude'].values
train_1['region_code'] = df_complete['region_code'].values
train_1['district_code'] = df_complete['district_code'].values
train_1['population'] = df_complete['population'].values
train_1['permit'] = df_complete['permit'].values
train_1['construction_year'] = df_complete['construction_year'].values

#Sanity check
total = train_1.isnull().sum().sort_values(ascending=False)
percent = (df_complete.isnull().sum()/df_complete.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(15)

Unnamed: 0,Total,Percent
amount_tsh,0,0.0
construction_year,0,0.0
date_recorded,0,
district_code,0,0.0
extraction_type_class,0,
funder,0,
installer,0,
latitude,0,0.0
longitude,0,0.0
management,0,


## Feature Engineering

features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])

features['Total_Bathrooms'] = (features['FullBath'] + (0.5*features['HalfBath']) + 
                               features['BsmtFullBath'] + (0.5*features['BsmtHalfBath']))

features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                             features['WoodDeckSF'])


#simplified features
features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

## Test

In [23]:
test = pd.read_csv("test_prepared.csv")

In [24]:
ID = test['id']

In [25]:
ID.shape

(14850,)

In [26]:
#remove vars from test dataset
cols = ('id','gps_height','wpt_name','subvillage','num_private','region',
            'lga','ward','public_meeting','recorded_by','scheme_name','extraction_type', 
            'extraction_type_group','management_group','payment','quality_group','quantity_group',
            'source_class','waterpoint_type', 'basin')

for c in cols :
    test.drop(c,inplace = True, axis = 1)

In [27]:
#Change date_Recorded to datetime
from datetime import datetime, timedelta

datedf = test['date_recorded']
test['date_recorded'] = pd.to_datetime(test['date_recorded'], yearfirst = True, format = '%Y-%m-%d')

In [28]:
#summary of datasets
print('Train Data: \n')
print("Number of columns: "+ str(train_1.shape[1]))
print("Number of rows: "+ str(train_1.shape[0]))
print('\nTest Data: \n')
print("Number of columns: "+ str(test.shape[1]))
print("Number of rows: "+ str(test.shape[0]))

Train Data: 

Number of columns: 21
Number of rows: 59400

Test Data: 

Number of columns: 20
Number of rows: 14850


### Na Check

In [29]:
#Which columns are suspects to have false 0's that are actually na's?
print((test == 0).sum())

amount_tsh               10410
date_recorded                0
funder                       0
installer                    0
longitude                  457
latitude                     0
region_code                  0
district_code                4
population                5453
scheme_management            0
permit                    9754
construction_year         5260
extraction_type_class        0
management                   0
payment_type                 0
water_quality                0
quantity                     0
source                       0
source_type                  0
waterpoint_type_group        0
dtype: int64


In [30]:
#convert 0's to Na's
nas = ('construction_year', 'amount_tsh', 'longitude', 'district_code', 'permit', 'construction_year') 

for i in test.columns:
    if i in nas:
        test.replace(0, np.NaN, inplace = True)

In [31]:
objects = []
for i in test.columns:
    if test[i].dtype == object:
        objects.append(i)

test.update(test[objects].fillna('None'))
nulls = np.sum(test.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = test.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)
print(info)
print("There are", len(nullcols), "columns with missing values")

                       0        1
permit             10491  float64
amount_tsh         10410  float64
population          5453  float64
construction_year   5260  float64
longitude            457  float64
district_code          4  float64
There are 6 columns with missing values


In [32]:
#create object with datatype categorical
objects3 = []
for i in test.columns:
    if test[i].dtype == 'object':
        objects3.append(i)
        
print("Training Set incomplete cases")

sums_test = test[objects3].apply(lambda x: len(np.unique(x)))
sums_test.sort_values(ascending=False)

Training Set incomplete cases


funder                   976
installer                910
management                12
scheme_management         12
source                    10
source_type                7
payment_type               7
extraction_type_class      7
waterpoint_type_group      6
quantity                   5
water_quality              2
dtype: int64

In [33]:
#create object with datatype integer and check incomplete cases
num1 = []
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
for i in test.columns:
    if test[i].dtype in numeric_dtypes:
        num1.append(i)
print("Training distinct cases")
sums_test = test[num1].apply(lambda x: len(np.unique(x)))
sums_test.sort_values(ascending=False)

Training distinct cases


longitude            14846
latitude             14390
permit               10492
amount_tsh           10477
population            6089
construction_year     5314
region_code             26
district_code           23
dtype: int64

In [34]:
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in test.columns:
    if test[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_test = test[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)
skews = pd.DataFrame({'skew': skew_test})
skews

Unnamed: 0,skew
region_code,3.20085
latitude,-0.156223
amount_tsh,
longitude,
district_code,
population,
permit,
construction_year,


In [35]:
#Check percentages of NA's
total = test.isnull().sum().sort_values(ascending=False)
percent = (test.isnull().sum()/test.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

Unnamed: 0,Total,Percent
permit,10491,0.706465
amount_tsh,10410,0.70101
population,5453,0.367205
construction_year,5260,0.354209
longitude,457,0.030774
district_code,4,0.000269
date_recorded,0,0.0
funder,0,0.0
installer,0,0.0
latitude,0,0.0


In [36]:
#Impute NA's with median
test['construction_year'].fillna(test['construction_year'].mean(), inplace=True)
#Total static head (amount water available to waterpoint)
test['amount_tsh'].fillna(test['amount_tsh'].mean(), inplace=True)


In [37]:
#clean data for test
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = test.select_dtypes(include=numerics)
#df_complete = pd.DataFrame(KNN(3).complete(newdf))

In [38]:
#df_complete.to_csv('clean_numerics_test.csv')
df_complete = pd.read_csv('clean_numerics_test.csv')
#Rearrange dataset
df_complete.columns = newdf.columns
df_complete.index = newdf.index

#replace values in original df
test['amount_tsh'] = df_complete['amount_tsh'].values
test['longitude'] = df_complete['longitude'].values
test['latitude'] = df_complete['latitude'].values
test['region_code'] = df_complete['region_code'].values
test['district_code'] = df_complete['district_code'].values
test['population'] = df_complete['population'].values
test['permit'] = df_complete['permit'].values
test['construction_year'] = df_complete['construction_year'].values

#Sanity check
total = test.isnull().sum().sort_values(ascending=False)
percent = (df_complete.isnull().sum()/df_complete.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(15)

Unnamed: 0,Total,Percent
amount_tsh,0,0.0
construction_year,0,0.0
date_recorded,0,
district_code,0,0.0
extraction_type_class,0,
funder,0,
installer,0,
latitude,0,0.0
longitude,0,0.0
management,0,


## Label Encoding

In [39]:
#Check categoricals
categorical_features = pd.DataFrame(train_1.describe(include = ['O'])).columns
print('Categorical features: \n')
print(str(categorical_features))

categorical_features = pd.DataFrame(test.describe(include = ['O'])).columns
print('Categorical features: \n')
print(str(categorical_features))

Categorical features: 

Index(['funder', 'installer', 'scheme_management', 'extraction_type_class',
       'management', 'payment_type', 'water_quality', 'quantity', 'source',
       'source_type', 'waterpoint_type_group', 'status_group'],
      dtype='object')
Categorical features: 

Index(['funder', 'installer', 'scheme_management', 'extraction_type_class',
       'management', 'payment_type', 'water_quality', 'quantity', 'source',
       'source_type', 'waterpoint_type_group'],
      dtype='object')


In [40]:
#for column in ['payment_type', 'quantity'] :
    #dummies = pd.get_dummies(train_1[column])
    #train_1[dummies.columns] = dummies
    
#for column in ['payment_type', 'quantity'] :
    #dummies = pd.get_dummies(test[column])
    #test[dummies.columns] = dummies

In [41]:
from sklearn.preprocessing import LabelEncoder

#labelencoding train
from sklearn.preprocessing import LabelEncoder

for col in ('payment_type', 'quantity','scheme_management','extraction_type_class','water_quality', 'quantity', 'waterpoint_type_group', 'status_group', 'source', 'source_type', 
            'management', 'funder', 'installer'):
    lbl = LabelEncoder()
    lbl.fit(list(train_1[col].values)) 
    train_1[col] = lbl.transform(list(train_1[col].values))

#labelencoding test

for col in ('payment_type', 'quantity','scheme_management','extraction_type_class','water_quality', 'quantity', 'waterpoint_type_group', 'source', 'source_type', 'management', 
           'funder', 'installer'):
    lbl = LabelEncoder() 
    lbl.fit(list(test[col].values)) 
    test[col] = lbl.transform(list(test[col].values))

In [42]:
#summary of datasets
print('Train Data: \n')
print("Number of columns: "+ str(train_1.shape[1]))
print("Number of rows: "+ str(train_1.shape[0]))
print('\nTest Data: \n')
print("Number of columns: "+ str(test.shape[1]))
print("Number of rows: "+ str(test.shape[0]))

Train Data: 

Number of columns: 21
Number of rows: 59400

Test Data: 

Number of columns: 20
Number of rows: 14850


# Modeling

In [43]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [44]:
#Take out y
y = train_1['status_group']

In [45]:
train = train_1.drop(['status_group', 'date_recorded', 'latitude', 'longitude', 'permit'], axis = 1) 
#had to drop date_recorded because the tree does not allow for datetime values
train = pd.DataFrame(train)
train.dropna
train.shape


(59400, 16)

## Note from Natasha:

Not sure if we can actually drop values from the test dataset.

In [46]:
test = test.drop(['date_recorded', 'latitude', 'longitude', 'permit'], axis = 1) 
test = pd.DataFrame(test)
test.dropna
test.shape

(14850, 16)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.30, random_state = 1)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

X_train : (41580, 16)
X_test : (17820, 16)
y_train : (41580,)
y_test : (17820,)


## Model 2

In [48]:
from sklearn.metrics import roc_auc_score
def GS(a,b):
    """Function that received two parameters; first: a binary variable representing 0=good and 1=bad, and then a second variable with the prediction of the first variable, the second variable can be continuous, integer or binary - continuous is better. Finally, the function returns the GINI Coefficient of the two lists."""    
    from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
    false_positive_rate, recall, thresholds = roc_curve(a, b)
    roc_auc = auc(false_positive_rate, recall)
    gini = 2*roc_auc_score(a,b)-1
    return gini

def KS(b,a):  
    """Function that received two parameters; first: a binary variable representing 0=good and 1=bad, and then a second variable with the prediction of the first variable, the second variable can be continuous, integer or binary - continuous is better. Finally, the function returns the KS Statistics of the two lists."""
    try:
        tot_bads=1.0*sum(b)
        tot_goods=1.0*(len(b)-tot_bads)
        elements = zip(*[a,b])
        elements = sorted(elements,key= lambda x: x[0])
        elements_df = pd.DataFrame({'probability': b,'gbi': a})
        pivot_elements_df = pd.pivot_table(elements_df, values='probability', index=['gbi'], aggfunc=[sum,len]).fillna(0)
        max_ks = perc_goods = perc_bads = cum_perc_bads = cum_perc_goods = 0
        for i in range(len(pivot_elements_df)):
            perc_goods =  (pivot_elements_df.iloc[i]['len'] - pivot_elements_df.iloc[i]['sum']) / tot_goods
            perc_bads = pivot_elements_df.iloc[i]['sum']/ tot_bads
            cum_perc_goods += perc_goods
            cum_perc_bads += perc_bads
            A = cum_perc_bads-cum_perc_goods
            if abs(A['probability']) > max_ks:
                max_ks = abs(A['probability'])
    except:
        max_ks = 0
    return max_ks

In [49]:
def train_method(X_train, y_train, X_test, y_test, method):  
    """ Function to redirect call to the desired method """
    if method == 'DT': # Decision Tree Classifier
        return DT(X_train, y_train, X_test, y_test)    

    elif method == 'RFC': # Random Forest Classifier
        return RFC(X_train, y_train, X_test, y_test)    

In [50]:
from sklearn.tree import DecisionTreeClassifier
def DT(X_train, Y_train, X_test, Y_test):
    """
    Decision Tree Classifier
    """
    logr = DecisionTreeClassifier(min_samples_split=20, random_state=99).fit(X_train, Y_train)
    y_pred_train = logr.predict(X_train)
    y_pred_test = logr.predict(X_test)
    a = logr.score(X_test, Y_test)
    return {'model':logr ,'accuracy':a }

In [51]:
from sklearn.ensemble import RandomForestClassifier
def RFC(X_train, y_train, X_test, y_test):
    """
    Random Forest Classifier
    """
    rfr = RandomForestClassifier(n_estimators=1000, min_samples_split=2).fit(X_train, y_train)
    y_pred_train = rfr.predict(X_train)
    y_pred_test = rfr.predict(X_test)
    a = rfr.score(X_test, y_test)
    return {'model':rfr,'accuracy':a }

In [52]:
#Decision Random Forest
dict_trained_model = train_method(X_train, y_train, X_test, y_test,"RFC")
print(dict_trained_model)

{'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'accuracy': 0.7872053872053872}


In [60]:
from sklearn import tree
import graphviz 
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("iris")

'iris.pdf'

In [None]:
#Decision Tree
dict_trained_model = train_method(X_train, y_train, X_test, y_test,"DT")
print(dict_trained_model)

In [381]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
X = X_train
y = y_train
clf = QuadraticDiscriminantAnalysis()
clf.fit(X, y)
pred1 = clf.predict(test)

In [382]:
df = pd.read_csv('SubmissionFormat.csv')
predictions = pd.DataFrame(data=pred1, index=None, columns=['status_group'], dtype=None)
#ID = pd.DataFrame(data=ID, index=None, columns=['id'], dtype=None)
comb = pd.concat([df['id'], predictions], axis=1)

comb['status_group'] = comb['status_group'].replace([0], 'functional')
comb['status_group'] = comb['status_group'].replace([1], 'functional needs repair')
comb['status_group'] = comb['status_group'].replace([2], 'non functional')

comb.to_csv('output_1.csv')

In [384]:
comb.head(10)

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional
5,52449,functional
6,24806,non functional
7,28965,non functional
8,36301,non functional
9,54122,functional


In [389]:
from sklearn.svm import NuSVC
clf = NuSVC(0)
clf.fit(X_train, y_train) 
print(clf.predict(train))

ValueError: nu <= 0 or nu > 1

## On Test Dataset

In [None]:
rfr = RandomForestClassifier(n_estimators=1000, min_samples_split=2).fit(X_train, y_train)
y_pred_train = rfr.predict(X_train)
pred = y_pred_test = rfr.predict(test)
#print('accuracy:', a)

## Join the two into submission format

In [None]:
df = pd.read_csv('SubmissionFormat.csv')
predictions = pd.DataFrame(data=pred, index=None, columns=['status_group'], dtype=None)
#ID = pd.DataFrame(data=ID, index=None, columns=['id'], dtype=None)
comb = pd.concat([df['id'], predictions], axis=1)

comb['status_group'] = comb['status_group'].replace([0], 'functional')
comb['status_group'] = comb['status_group'].replace([1], 'functional needs repair')
comb['status_group'] = comb['status_group'].replace([2], 'non functional')

comb.to_csv('output.csv')

## Bagging meta-estimator

In ensemble algorithms, bagging methods form a class of algorithms which build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. These methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. In many cases, bagging methods constitute a very simple way to improve with respect to a single model, without making it necessary to adapt the underlying base algorithm. As they provide a way to reduce overfitting, bagging methods work best with strong and complex models (e.g., fully developed decision trees), in contrast with boosting methods which usually work best with weak models (e.g., shallow decision trees).


http://scikit-learn.org/stable/modules/ensemble.html

## Extremely randomised trees - did not perform well 

In extremely randomized trees (see ExtraTreesClassifier and ExtraTreesRegressor classes), randomness goes one step further in the way splits are computed. As in random forests, a random subset of candidate features is used, but instead of looking for the most discriminative thresholds, thresholds are drawn at random for each candidate feature and the best of these randomly-generated thresholds is picked as the splitting rule. This usually allows to reduce the variance of the model a bit more, at the expense of a slightly greater increase in bias:

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_blobs(n_samples=50000, n_features=16, centers=100, random_state=0)
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=6, random_state=0)
clf.fit(X_train, y_train, sample_weight=None, check_input=True, X_idx_sorted=None)
scores = cross_val_score(clf, X_train, y_train)
scores.mean()                                                         

In [None]:
y_pred_train = rfr.predict(X_train)
pred = y_pred_test = clf.predict(test)
predictions = pd.DataFrame(data=pred, index=None, columns=['status_group'], dtype=None)
#ID = pd.DataFrame(data=ID, index=None, columns=['id'], dtype=None)
comb = pd.concat([df['id'], predictions], axis=1)

comb['status_group'] = comb['status_group'].replace([0], 'functional')
comb['status_group'] = comb['status_group'].replace([1], 'functional needs repair')
comb['status_group'] = comb['status_group'].replace([2], 'non functional')

comb.to_csv('output_clf.csv')

In [None]:
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X_test, y_test)
scores.mean() 

In [None]:
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X_test, y_test)
scores.mean() > 0.78 #compare to random forest