In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [43]:
train_data=pd.read_csv("train.csv")


In [44]:
train_data.shape

(1143, 74)

In [46]:
train_data.head()

Unnamed: 0,surveyid,village,femaleres,age,married,children,hhsize,edu,hh_children,hh_totalmembers,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,day_of_week,depressed
0,926,91,1,28.0,1,4,6,10,0,,...,0,0.0,0,0.0,0.0,1,0.0,0,5,0
1,747,57,1,23.0,1,3,5,8,0,,...,0,0.0,1,4.804611,4.804611,0,0.0,0,3,1
2,1190,115,1,22.0,1,3,5,9,0,,...,0,0.0,0,8.007685,8.007685,1,0.0,0,5,0
3,1065,97,1,27.0,1,2,4,10,2,4.0,...,0,0.0,0,0.0,0.0,1,1.249199,0,0,0
4,806,42,0,59.0,0,4,6,10,4,6.0,...,0,0.0,0,0.0,0.0,0,0.0,0,3,0


In [4]:
train_data.columns.value_counts()

ent_wagelabor              1
med_portion_sickinjured    1
nondurable_investment      1
asset_land_owned_total     1
ent_ownfarm                1
ed_expenses                1
cons_tobacco               1
med_port_sick_child        1
hhsize                     1
med_expenses_child_ep      1
village                    1
hh_totalmembers            1
asset_phone                1
surveyid                   1
ent_farmrevenue            1
med_afford_port            1
ent_nonag_revenue          1
ent_employees              1
fs_chskipm_often           1
med_u5_deaths              1
cons_ed                    1
fs_chwholed_often          1
ed_work_act_pc             1
fs_enoughtom               1
labor_primary              1
ent_nonagbusiness          1
saved_mpesa                1
hh_children                1
cons_med_total             1
asset_livestock            1
                          ..
amount_received_mpesa      1
femaleres                  1
net_mpesa                  1
asset_niceroof

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 74 columns):
surveyid                   1143 non-null int64
village                    1143 non-null int64
femaleres                  1143 non-null int64
age                        1143 non-null float64
married                    1143 non-null int64
children                   1143 non-null int64
hhsize                     1143 non-null int64
edu                        1143 non-null int64
hh_children                1143 non-null int64
hh_totalmembers            809 non-null float64
cons_nondurable            1143 non-null float64
asset_livestock            1143 non-null float64
asset_durable              1143 non-null float64
asset_phone                1143 non-null float64
asset_savings              1143 non-null float64
asset_land_owned_total     1143 non-null float64
asset_niceroof             1143 non-null int64
cons_allfood               1143 non-null float64
cons_ownfood               114

In [6]:
train_data.shape

(1143, 74)

In [7]:
# here is a function that calculates the number of missing values and the percentage of the total values 
#that are missing for each column.
def missing(df):
    #total missing valies
    missing_vals=df.isnull().sum()
    #%of missing values
    per_missing_vals=100*df.isnull().sum()/len(df)
    
    #making a table with missing values
    missing_vals_table=pd.concat([missing_vals,per_missing_vals],axis=1)
    
    #renaming the columns
    missing_vals_table_rename=missing_vals_table.rename(columns={0:"Missing Values",1:"total % of missing values"})
    
    #sorting above table in descending
    missing_vals_table_rename= missing_vals_table_rename[
            missing_vals_table_rename.iloc[:,1] != 0].sort_values(
        'total % of missing values', ascending=False).round(1)
    
    #printing summary information
    print ("The dataset has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(missing_vals_table_rename.shape[0]) +
              " columns that have missing values.")
    
    return missing_vals_table_rename
        


In [8]:
 missing(train_data)

The dataset has 74 columns.
There are 23 columns that have missing values.


Unnamed: 0,Missing Values,total % of missing values
med_u5_deaths,1084,94.8
med_expenses_sp_ep,878,76.8
med_expenses_hh_ep,693,60.6
med_expenses_child_ep,600,52.5
ed_work_act_pc,571,50.0
ed_sch_missedpc,467,40.9
ed_schoolattend,463,40.5
ed_expenses_perkid,463,40.5
ed_expenses,463,40.5
med_afford_port,423,37.0


In [9]:
#getting columns with more than 50% missing values and discard it.
missing_df = missing(train_data);
missing_columns = list(missing_df[missing_df['total % of missing values'] > 50].index)
print('\n','We will remove %d columns.' % len(missing_columns))

The dataset has 74 columns.
There are 23 columns that have missing values.

 We will remove 4 columns.


In [10]:
# Drop the columns
train_data = train_data.drop(columns = list(missing_columns))

In [11]:
missing(train_data)

The dataset has 70 columns.
There are 19 columns that have missing values.


Unnamed: 0,Missing Values,total % of missing values
ed_work_act_pc,571,50.0
ed_sch_missedpc,467,40.9
ed_schoolattend,463,40.5
ed_expenses_perkid,463,40.5
ed_expenses,463,40.5
med_healthconsult,423,37.0
med_afford_port,423,37.0
cons_med_children,419,36.7
fs_chskipm_often,416,36.4
fs_chwholed_often,416,36.4


In [12]:
#Creating an imputer object with a median filling strategy
from sklearn.preprocessing import Imputer

In [13]:
#X=train_data.drop(["depressed"],axis=1)
#y=train_data["depressed"]
X = train_data.iloc[:, 1:69].values
y = train_data.iloc[:, -1].values


In [14]:
imputer = Imputer(missing_values='NaN', strategy="most_frequent", axis=0)
imputer = imputer.fit(X[:, 0:68])
X[:, 0:68] = imputer.transform(X[:, 0:68])


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
#splitting the data into training and testing data
X_train,X_test,y_train,y_test=train_test_split( X,y,test_size=0.33,random_state=42)

In [17]:

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


In [18]:
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

xgb_model = xgb.XGBClassifier()
optimization_dict = {'max_depth': [2,4,6],
                     'n_estimators': [50,100,200]}

model = GridSearchCV(xgb_model, optimization_dict, 
                     scoring='accuracy', verbose=1)

model.fit(X,y)
print(model.best_score_)
print(model.best_params_)



Fitting 3 folds for each of 9 candidates, totalling 27 fits


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   15.9s finished


0.8250218722659668
{'max_depth': 2, 'n_estimators': 50}


In [19]:
classifier = XGBClassifier(
    learning_rate=0.1, max_depth=2, min_samples_split=100,n_estimators=50)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, min_samples_split=100,
       missing=None, n_estimators=50, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


  if diff:


In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
cm = confusion_matrix(y_test, y_pred)

In [23]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(
    estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=1)
accuracies.mean()
accuracies.std()


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.012663788918950356

In [24]:
accuracy1 = accuracies.mean()
accuracy2 = accuracies.std()


In [25]:
# Fitting XGBoost to the Training set
classifier = XGBClassifier(
    learning_rate=0.1, max_depth=4, min_samples_split=100)
classifier.fit(X, y)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, min_samples_split=100,
       missing=None, n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [26]:
# Predicting the againist entire set
pred_train = classifier.predict(X)


  if diff:


In [27]:
# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=10, n_jobs=1)


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [28]:

accuracy3 = accuracies.mean()
accuracy4 = accuracies.std()


In [29]:
test_data=pd.read_csv("test.csv")

In [30]:
test_data.shape

(286, 74)

In [31]:
# Drop the columns
test_data=test_data.drop(["med_u5_deaths","med_expenses_sp_ep","med_expenses_hh_ep","med_expenses_child_ep"],axis=1)

In [32]:
test_data.shape

(286, 70)

In [33]:

test_X = test_data.iloc[:, 1:69].values


In [34]:
imputer = Imputer(missing_values='NaN', strategy="most_frequent", axis=0)
imputer = imputer.fit(test_X[:, 0:69])
test_X[:, 0:68] = imputer.transform(test_X[:, 0:68])


In [35]:
test_pred = classifier.predict(test_X)

  if diff:


In [36]:
test_data['depression_predict'] = test_pred

In [37]:
test_submit = test_data[['surveyid', 'depression_predict']]
test_submit1 = test_data[['surveyid', 'depression_predict']]
test_submit2 = test_data[['surveyid', 'depression_predict']]

In [38]:
test_submit1.head()

Unnamed: 0,surveyid,depression_predict
0,901,0
1,498,0
2,710,0
3,433,0
4,44,0


In [39]:
test_submit.to_csv('prediction1.csv', encoding='utf-8', index=False)


In [40]:
test_submit.to_csv('prediction2.csv', encoding='utf-8', index=False)
