## Import and Fetch

In [1]:
!pip install pandas_profiling
!pip install category_encoders



In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



In [3]:
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/test_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/sample_submission.csv

--2019-07-21 19:49:58--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20054664 (19M) [text/plain]
Saving to: ‘train_features.csv.5’


2019-07-21 19:49:58 (151 MB/s) - ‘train_features.csv.5’ saved [20054664/20054664]

--2019-07-21 19:49:59--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1148327 (1.1M) [text/plain]
Savin

In [0]:
import pandas_profiling as pp

In [5]:
# List the working directory to see which files are present.
!ls

sample_data		 test_features.csv.2   train_features.csv.5
sample_submission.csv	 test_features.csv.3   train_labels.csv
sample_submission.csv.1  test_features.csv.4   train_labels.csv.1
sample_submission.csv.2  test_features.csv.5   train_labels.csv.2
sample_submission.csv.3  train_features.csv    train_labels.csv.3
sample_submission.csv.4  train_features.csv.1  train_labels.csv.4
sample_submission.csv.5  train_features.csv.2  train_labels.csv.5
test_features.csv	 train_features.csv.3
test_features.csv.1	 train_features.csv.4


In [0]:
# Read the feature table, and keep only the target column of the label table.
X = pd.read_csv('train_features.csv')
Y = pd.read_csv('train_labels.csv')['status_group']

# Train/validation split, stratified on the target and seeded for repeatability.
# NOTE(review): X and Y are paired by row position — presumably both files list
# the same rows in the same order; verify against the data source.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=42,
)
    

In [0]:
# Optional exploratory profile of the feature table (left disabled); uncomment to run.
#pp.ProfileReport(X)

## Baseline

In [8]:
# Baseline: share of each class in the training labels. The largest share is the
# accuracy obtained by always predicting the most common class on the training set.
y_train.value_counts(normalize = True)

functional                 0.543075
non functional             0.384242
functional needs repair    0.072682
Name: status_group, dtype: float64

In [9]:
# Names of every numeric-dtype column in the feature table.
numericals = list(X.select_dtypes(include='number').columns)
print(numericals)

['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']


In [10]:
# First model: logistic regression on the raw numeric columns only.
# NOTE(review): `numericals` includes the 'id' column and the features are not
# scaled before fitting — confirm that is intended for this baseline.
model = LogisticRegression(
    max_iter=350,
    multi_class='auto',
    solver='lbfgs',
).fit(X_train[numericals], y_train)

# Mean accuracy on the validation split.
model.score(X_val[numericals], y_val)



0.5484848484848485

In [11]:
# Plot the fitted coefficients for each class label, one panel per class.
# Every bar chart is drawn on an explicitly passed Axes rather than on pyplot's
# implicit "current axes", and the three near-identical stanzas are one loop.
fig = plt.figure(figsize=(30, 5))
title_colors = ['g', 'y', 'r']

for position, (label, class_coefs, color) in enumerate(
        zip(model.classes_, model.coef_, title_colors), start=1):
    ax = fig.add_subplot(1, 3, position)
    coeffs = pd.Series(class_coefs, index=numericals)
    coeffs.sort_values().plot.barh(ax=ax)
    ax.set_title(label, color=color)
    ax.set_xlabel('coefficient')


Text(0.5, 1.0, 'non functional')

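Based on the coefficient magnitudes, longitude and region code appear to have some of the largest effects on the model. Note that the features were not scaled before fitting, so coefficient sizes are not directly comparable between features measured on different scales.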

## Cleaning


In [12]:
# Count the rows whose coordinates fall outside a rough bounding box for Tanzania.

# Longitude: 28 is the western-most and 42 the eastern-most boundary.
lon_out_of_bounds = (X.longitude < 28) | (X.longitude > 42)
print(lon_out_of_bounds.sum())

# Latitude: 0 is the northern-most and -12 the southern-most boundary.
lat_out_of_bounds = (X.latitude > 0) | (X.latitude < -12)
print(lat_out_of_bounds.sum())

1812
0


In [0]:
# Treat zeros in these columns as missing data. They are marked as NaN rather
# than imputed, because there is no clean way to fill them in without adding bias.
zeroes = ['construction_year', 'longitude', 'population']

# Assign the result back to the frame: calling replace(..., inplace=True) on a
# column selection is a chained operation that does not update X under pandas
# Copy-on-Write. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X[zeroes] = X[zeroes].replace(0, np.nan)

In [15]:
# Missing-value count for each of the columns cleaned above.
X[zeroes].isna().sum()

construction_year    20709
longitude             1812
population           21381
dtype: int64

In [0]:
# Eliminate "null island" values: the placeholder coordinate -2e-08 is treated
# as missing. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
X['longitude'] = X['longitude'].replace(-2e-08, np.nan)

# For consistency, blank out latitude on every row where longitude is missing.
X.loc[X['longitude'].isna(), 'latitude'] = np.nan

In [17]:
# Both coordinate columns should now report the same number of missing values.
X[['latitude', 'longitude']].isna().sum()

latitude     1812
longitude    1812
dtype: int64

In [18]:
# Distinct region names in order of first appearance (kept for possible later use).
regions = list(X.region.unique())

# Median gps_height of the rows belonging to each region, keyed by region name.
elevations = {
    name: X.loc[X.region == name, 'gps_height'].median()
    for name in regions
}

elevations

{'Arusha': 1401.0,
 'Dar es Salaam': 29.0,
 'Dodoma': 0.0,
 'Iringa': 1719.0,
 'Kagera': 0.0,
 'Kigoma': 1274.0,
 'Kilimanjaro': 1241.0,
 'Lindi': 233.0,
 'Manyara': 1474.0,
 'Mara': 1295.0,
 'Mbeya': 0.0,
 'Morogoro': 325.0,
 'Mtwara': 285.5,
 'Mwanza': 0.0,
 'Pwani': 45.0,
 'Rukwa': 1484.5,
 'Ruvuma': 924.5,
 'Shinyanga': 0.0,
 'Singida': 1481.0,
 'Tabora': 0.0,
 'Tanga': 293.0}

In [0]:
# Problem regions: their median gps_height came out as 0, which should not be
# the case. The elevations below were looked up externally and are used to fill
# in those regions.
looked_up_elevations = {
    'Dodoma': 1118,
    'Kagera': 1500,
    'Mbeya': 1700,
    'Mwanza': 1140,
    'Shinyanga': 1128,
    'Tabora': 1191,
}
zero_regions = list(looked_up_elevations)

# Keep the per-region lookup in step with the values used for imputation.
elevations.update(looked_up_elevations)

# Fill in only the zero readings of each problem region. Overwriting the whole
# region would also discard any non-zero gps_height recorded there.
for region_name, elevation in looked_up_elevations.items():
    needs_fill = (X.region == region_name) & (X.gps_height == 0)
    X.loc[needs_fill, 'gps_height'] = elevation

In [20]:
# Summary statistics of the feature table after the cleaning steps above.
X.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,57588.0,57588.0,59400.0,59400.0,59400.0,38019.0,38691.0
mean,37115.131768,317.650385,1108.504865,35.149669,-5.885572,0.474141,15.297003,5.629747,281.087167,1996.814686
std,21453.128371,2997.574558,550.7165,2.607428,2.809876,12.23623,17.587406,9.633649,564.68766,12.472045
min,0.0,0.0,-90.0,29.607122,-11.64944,0.0,1.0,0.0,1.0,1960.0
25%,18519.75,0.0,836.0,33.2851,-8.643841,0.0,5.0,2.0,40.0,1987.0
50%,37061.5,0.0,1140.0,35.005943,-5.172704,0.0,12.0,3.0,150.0,2000.0
75%,55656.5,20.0,1500.0,37.233712,-3.372824,0.0,17.0,5.0,324.0,2008.0
max,74247.0,350000.0,2770.0,40.345193,-0.998464,1776.0,99.0,80.0,30500.0,2013.0


## Preliminary Models