<a href="https://colab.research.google.com/github/ewuerfel66/DS-Unit-2-Linear-Models/blob/master/LogisticRegression_EricWuerfel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tanzania Water Pumps

## Imports

In [72]:
# !pip install -U pandas-profiling
!pip install category-encoders



In [0]:
# libraries
import pandas as pd
import pandas_profiling
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import category_encoders as ce

In [0]:
# Data
LOCAL = '../data/tanzania/'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/'

train_features = pd.read_csv(WEB + 'train_features.csv')
train_labels = pd.read_csv(WEB + 'train_labels.csv')
test_features = pd.read_csv(WEB + 'test_features.csv')
sample_submission = pd.read_csv(WEB + 'sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

## Data Exploration

In [75]:
train_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


## Majority Class Baseline

In [76]:
y_train = train_labels['status_group']
y_train.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

`functional` is the majority class.

In [0]:
# Our baseline model
majority_class = y_train.mode()[0]
y_pred = [majority_class] * len(y_train)

In [78]:
# Check the accuracy score
accuracy_score(y_train, y_pred)

0.543080808080808

## Train/Validate/Test Split

### *RUN THIS TO RESET TRAINING AND VALIDATION SETS*

In [79]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [80]:
# Doublecheck Stratification
assert(y_train.value_counts(normalize=True)[0] - y_val.value_counts(normalize=True)[0] < 0.01)
print('Stratification is fine')

Stratification is fine


## Basic Logistic Regression

In [0]:
# Only take numeric data
X_train_numeric = X_train.select_dtypes('number')
X_val_numeric = X_val.select_dtypes('number')

In [82]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train_numeric, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
# Make predictions
y_pred = model.predict(X_val_numeric)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.5572390572390572

## Logistic Regression
* Numeric features except `id`
* OneHotEncode `quantity`

In [84]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [85]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')

# Add quantity
features = numeric_features
features.append('source')

features

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'source']

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [87]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [88]:
# Make predictions
y_pred = model.predict(X_val)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.5773569023569024

## Logistic Regression
* Numeric features except `id`
* OneHotEncode ``

In [89]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')
numeric_features.remove('num_private')

# Add features to be encoded
features = numeric_features
features.append('quantity')
features.append('installer')
features.append('extraction_type')
features.append('source')

# features

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [92]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [93]:
# Make predictions
y_pred = model.predict(X_val)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.7494949494949495

In [94]:
X_val.shape

(11880, 2187)

## Export a Submission

In [0]:
# Create & process y_test
X_test = test_features[features]

# Encode
X_test = encoder.transform(X_test)

# Scale to Unit Normal
X_test = scaler.transform(X_test)

In [0]:
# Make predictions
y_pred = model.predict(X_test)

In [108]:
# Send predictions to df
submission_df = pd.DataFrame()

# Add columns
submission_df['id'] = True
submission_df['id'] = test_features['id'].tolist()

submission_df['status_group'] = True
submission_df['status_group'] = y_pred

submission_df.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [0]:
submission_df.to_csv('submission', index=False)