## Import and Fetch

In [1]:
!pip install pandas_profiling
!pip install category_encoders



In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


In [3]:
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/test_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/sample_submission.csv

--2019-07-23 04:16:05--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20054664 (19M) [text/plain]
Saving to: ‘train_features.csv.6’


2019-07-23 04:16:06 (154 MB/s) - ‘train_features.csv.6’ saved [20054664/20054664]

--2019-07-23 04:16:06--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1148327 (1.1M) [text/plain]
Savin

In [0]:
import pandas_profiling as pp

In [5]:
!ls

sample_data		 test_features.csv.2   train_features.csv.5
sample_submission.csv	 test_features.csv.3   train_features.csv.6
sample_submission.csv.1  test_features.csv.4   train_labels.csv
sample_submission.csv.2  test_features.csv.5   train_labels.csv.1
sample_submission.csv.3  test_features.csv.6   train_labels.csv.2
sample_submission.csv.4  train_features.csv    train_labels.csv.3
sample_submission.csv.5  train_features.csv.1  train_labels.csv.4
sample_submission.csv.6  train_features.csv.2  train_labels.csv.5
test_features.csv	 train_features.csv.3  train_labels.csv.6
test_features.csv.1	 train_features.csv.4


In [0]:
# Read the raw feature table and pull the target column out of the label table
X = pd.read_csv('train_features.csv')
Y = pd.read_csv('train_labels.csv')['status_group']

# Hold out a validation set; stratifying on the target keeps the class mix the same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, stratify=Y, random_state=42
)
    

In [0]:
#pp.ProfileReport(X)

## Baseline

In [8]:
# Majority-class baseline: the share of each status label in the training split.
# The largest share is the accuracy of always predicting the most common class.
y_train.value_counts(normalize=True)

functional                 0.543075
non functional             0.384242
functional needs repair    0.072682
Name: status_group, dtype: float64

In [9]:
# Collect the names of every numeric column in the raw feature table
numericals = list(X.select_dtypes(include='number').columns)
print(numericals)

['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']


In [10]:
# Logistic-regression baseline trained on the numeric columns only
model = LogisticRegression(
    max_iter=350, multi_class='auto', solver='lbfgs'
).fit(X_train[numericals], y_train)

# Mean accuracy on the held-out validation rows
model.score(X_val[numericals], y_val)



0.5484848484848485

In [11]:
# Plot the fitted coefficients for each class label: one horizontal bar chart per label,
# side by side, with the bars sorted by coefficient value.
fig = plt.figure(figsize=(30, 5))

# Title colour per panel, in the order of model.classes_
title_colors = ['g', 'y', 'r']
n_panels = len(model.classes_)

for position, (label, class_coefs, color) in enumerate(
    zip(model.classes_, model.coef_, title_colors), start=1
):
  ax = fig.add_subplot(1, n_panels, position)
  coeffs = pd.Series(class_coefs, index=numericals).sort_values()
  # Draw on this panel explicitly instead of relying on the "current axes" state
  coeffs.plot.barh(ax=ax)
  ax.set_title(label, color=color)


Text(0.5, 1.0, 'non functional')

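Based on the coefficients, longitude and region code appear to have some of the largest effects on the model. Note that the features were not scaled before fitting, so coefficient magnitudes also reflect each feature's units and are not directly comparable across features.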

## Cleaning


In [12]:
# Count rows whose coordinates fall outside Tanzania's bounding box

# Longitude: 28 is the westernmost boundary, 42 the easternmost
outside_longitude = (X.longitude < 28) | (X.longitude > 42)
print(len(X[outside_longitude]))

# Latitude: 0 is the northernmost boundary, -12 the southernmost
outside_latitude = (X.latitude > 0) | (X.latitude < -12)
print(len(X[outside_latitude]))

1812
0


In [0]:
# Impute gps_height for the regions whose median recorded elevation is zero
def get_elevations(x, region_elevations=None):
  """Replace zero ``gps_height`` readings with a looked-up elevation for the row's region.

  Only rows whose ``gps_height`` is exactly 0 are changed; non-zero readings in the
  listed regions are kept as recorded.

  Parameters
  ----------
  x : pandas.DataFrame
    Must contain ``region`` and ``gps_height`` columns. Modified in place.
  region_elevations : dict, optional
    Mapping of region name -> elevation to impute. Defaults to the six regions
    whose recorded median ``gps_height`` is zero.

  Returns
  -------
  pandas.DataFrame
    The same object as ``x``, returned for convenience.
  """
  if region_elevations is None:
    # Elevations looked up by hand for the regions whose median gps_height should not be zero
    region_elevations = {
        'Dodoma': 1118,
        'Kagera': 1500,
        'Mbeya': 1700,
        'Mwanza': 1140,
        'Shinyanga': 1128,
        'Tabora': 1191,
    }

  # A zero reading is treated as "not recorded"
  missing_height = x['gps_height'] == 0

  for region, elevation in region_elevations.items():
    x.loc[missing_height & (x['region'] == region), 'gps_height'] = elevation

  return x

In [0]:
def clean(df):
  """Return a cleaned copy of the raw waterpoint features; ``df`` itself is not modified.

  Steps, in order:
    1. Zeros in construction_year / longitude / population become NaN.
    2. The -2e-08 "null island" marker in latitude / longitude becomes NaN.
    3. Latitude is set to NaN wherever longitude is NaN.
    4. date_recorded is parsed to datetime.
    5. gps_height is imputed via get_elevations().
    6. Duplicate / high-cardinality columns are dropped.
    7. permit and public_meeting become 0/1 integers (missing values count as 0).

  Parameters
  ----------
  df : pandas.DataFrame
    Raw feature table containing the columns named above.

  Returns
  -------
  pandas.DataFrame
    The cleaned copy.
  """
  x = df.copy()

  # Treat zeros as missing data that cannot be imputed.
  # Assign the result back instead of calling replace(inplace=True) on x[col]:
  # under pandas Copy-on-Write the in-place form only changes a temporary object.
  zeroes = ['construction_year', 'longitude', 'population']
  x[zeroes] = x[zeroes].replace(0, np.nan)

  # Eliminate null-island values
  for col in ['latitude', 'longitude']:
    x[col] = x[col].replace(-2e-08, np.nan)

  # For consistency, blank out latitude wherever longitude is missing
  x['latitude'] = np.where(x['longitude'].isna(), np.nan, x['latitude'])

  # Reassign as datetime (infer_datetime_format is deprecated and no longer needed)
  x['date_recorded'] = pd.to_datetime(x['date_recorded'])

  # Impute gps data
  x = get_elevations(x)

  # Drop columns that are high-cardinality or duplicates of other columns
  dropping = ['quantity_group', 'recorded_by', 'subvillage', 'wpt_name',
              'management_group']
  x = x.drop(columns=dropping)

  # Reassign bool values as 0/1; anything that is not True (including NaN) becomes 0
  for col in ['permit', 'public_meeting']:
    x[col] = (x[col] == True).astype(int)

  return x


In [0]:
# Clean the raw features. NOTE: X is overwritten, so this cell cannot be re-run as is:
# a second run feeds the already-cleaned frame to clean(), whose column drop raises KeyError.
# Reload X from the CSV before running it again.
X = clean(X)

## Preliminary Models

In [0]:
# Encode every string (object) column: one-hot for low cardinality, integer labels otherwise
object_columns = X.select_dtypes('object').columns.tolist()

for col in object_columns:
  # The date column is handled separately
  if col == 'date_recorded':
    continue

  as_text = X[col].astype('str')

  if len(X[col].unique()) > 10:
    # High cardinality: overwrite the column with integer label codes
    le = LabelEncoder()
    X[col] = le.fit_transform(as_text)
  else:
    # Low cardinality (10 distinct values or fewer): append the one-hot columns, then drop the source
    encode = ce.OneHotEncoder()
    X = pd.concat([X, encode.fit_transform(as_text)], axis=1).drop(col, axis=1)
    


In [0]:
# Set the recording dates aside so only model features remain in X
dates = X.pop('date_recorded')

# Attach the target, then drop rows with any missing value so features and labels stay aligned
X = pd.concat([X, Y], axis=1).dropna()
Y = X.pop('status_group')

# Standardise the features. fit_transform returns a bare ndarray, so wrap it back into a
# DataFrame with the original column names and index; later cells (pd.concat with Y,
# X.head()) need a DataFrame whose index still matches Y.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)

In [0]:
def select_model(x, y, cv=3, random_state=42):
  """Cross-validate a few candidate classifiers and print each one's mean score.

  Parameters
  ----------
  x : array-like or DataFrame
    Feature matrix.
  y : array-like or Series
    Target labels.
  cv : int, default 3
    Cross-validation setting passed to cross_val_score.
  random_state : int or None, default 42
    Seed for the decision tree and gradient boosting models, so repeated runs
    give the same scores. Pass None for unseeded models.

  Prints one line per model: its short name followed by the mean
  cross-validation score. Returns None.
  """
  # Candidate models; KNeighborsClassifier takes no seed
  models = [
      ('dt', DecisionTreeClassifier(random_state=random_state)),
      ('gbm', GradientBoostingClassifier(random_state=random_state)),
      ('knn', KNeighborsClassifier()),
  ]

  # Score every model first, then report, so the printed lines stay together
  mean_scores = [
      (name, cross_val_score(model, x, y, cv=cv).mean())
      for name, model in models
  ]

  for name, mean_score in mean_scores:
    print(name, mean_score)

In [0]:
# Split the prepared features and labels again, stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, stratify=Y, random_state=42
)

# Compare the candidate classifiers on the training portion only
select_model(X_train, y_train)

In [0]:
# Re-attach the target to the features. pd.concat only accepts pandas objects, so wrap X
# first: this also covers X being the plain ndarray that a scaler returns. Reusing
# Y's index keeps every feature row aligned with its own label.
X = pd.concat([pd.DataFrame(X, index=Y.index), Y], axis=1)

In [0]:
# Preview the first rows of X
X.head()