In [713]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns

## Load Data

In [1040]:
path = '/Users/ridleyleisy/Documents/lambda/unit_two/DS-Unit-2-Classification-1/ds4-predictive-modeling-challenge/'

In [1230]:
train = pd.read_csv(path + 'train_features.csv')
test = pd.read_csv(path + 'test_features.csv')
labels = pd.read_csv(path + 'train_labels.csv')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

## clean data

### numeric

In [1231]:
def drop_cols_rows(df):
    df.drop('num_private',axis=1,inplace=True)
    df.drop('recorded_by',axis=1,inplace=True)
    df = df.loc[df['longitude'] != 0]
    return df

In [1232]:
train = drop_cols_rows(train)

In [1233]:
labels = labels.merge(train,on='id')[['id','status_group']]

## Train construction year data

In [1234]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [1235]:
def transform_construction(df):
    df['construction_year'] = df['construction_year'].replace(0,np.nan)
    df = df.select_dtypes(include=np.number)
    X = df.loc[~df['construction_year'].isna()]
    
    # can only use these featuers since they differ 
    features = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
       'region_code', 'district_code', 'population']
    target = 'construction_year'
    
    X_train = X[features]
    y_train = X[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train)
    m = DecisionTreeRegressor(max_depth=20)
    m.fit(X_train,y_train)
    
    vals = m.predict(df.loc[df['construction_year'].isna()][features])
    
    return vals

In [1236]:
train.loc[train['construction_year'].isna(),'construction_year'] = transform_construction(train)
test.loc[test['construction_year'].isna(),'construction_year'] = transform_construction(test)

In [1237]:
train['construction_year'] = round(train['construction_year'])
test['construction_year'] = round(test['construction_year'])

### feature engineering

In [1238]:
def add_construction_diff(df):
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['construction_year'] = pd.to_datetime(df['construction_year'],format="%Y")
    df['time_since_construction'] = (df['date_recorded'] - df['construction_year']).dt.days
    return df

In [1239]:
test = add_construction_diff(test)
train = add_construction_diff(train)

## Encoding Categorical Data

In [1240]:
train.describe(exclude=np.number).T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq,first,last
public_meeting,54612,2,True,49737,,
permit,54532,2,True,38100,,
source_class,57588,3,groundwater,44204,,
quantity,57588,5,enough,32260,,
quantity_group,57588,5,enough,32260,,
management_group,57588,5,user-group,50767,,
quality_group,57588,6,good,49431,,
waterpoint_type_group,57588,6,communal standpipe,34334,,
payment_type,57588,7,never pay,24380,,
source_type,57588,7,spring,17006,,


Let's encode
1. source_class
2. quantity
3. management_group
4. payment type
5. source type

In [1241]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [1242]:
train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,time_since_construction
count,57588.0,57588.0,57588.0,57588.0,57588.0,57588.0,57588.0,57588.0,57588.0
mean,37106.48807,327.645219,689.325137,35.149669,-5.885572,15.217615,5.728311,185.570831,5878.75198
std,21454.51421,3043.831403,693.564188,2.607428,2.809876,17.855254,9.760254,477.744239,4479.305715
min,0.0,0.0,-90.0,29.607122,-11.64944,1.0,0.0,0.0,-3044.0
25%,18522.75,0.0,0.0,33.2851,-8.643841,5.0,2.0,0.0,1887.0
50%,37054.5,0.0,426.0,35.005943,-5.172704,12.0,3.0,35.0,5143.0
75%,55667.25,30.0,1332.0,37.233712,-3.372824,17.0,5.0,230.0,8101.0
max,74247.0,350000.0,2770.0,40.345193,-0.998464,99.0,80.0,30500.0,19447.0


In [1314]:
numeric_features = ['amount_tsh', 'gps_height', 'longitude', 'latitude','time_since_construction',
       'region_code', 'district_code', 'population']
encode_features = ['source_class','permit','water_quality','quantity','installer','subvillage'
                   ,'extraction_type','payment','source_type','lga','quality_group','waterpoint_type','basin']
features = numeric_features + encode_features

In [1315]:
X_train = train[features]
y_train = labels['status_group']

In [1316]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=0.80, test_size=0.20, 
    stratify=y_train, random_state=42)

In [1317]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [1318]:
X_train_sub = X_train[features]
X_test_sub = X_test[features]

In [1319]:
X_train_sub_encoded = encoder.fit_transform(X_train_sub)
X_test_sub_encoded = encoder.transform(X_test_sub)

KeyboardInterrupt: 

In [None]:
scaler = StandardScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X_train_sub_encoded)
X_test_scaled = scaler.transform(X_test_sub_encoded)

In [None]:
model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
model.fit(X_train_scaled, y_train)

In [None]:
model.score(X_test_scaled,y_test)

## Predicting for Kaggle

In [1309]:
test_sub = test[features]
test_sub_encoded = encoder.transform(test_sub)
test_sub_scaled = scaler.transform(test_sub_encoded)

  This is separate from the ipykernel package so we can avoid doing imports until


In [1310]:
preds = model.predict(test_sub_scaled)

In [1311]:
submission = pd.DataFrame(test['id'])

In [1312]:
submission['status_group'] = preds

In [1313]:
submission.to_csv('test_submission.csv',index=False)