In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

# Notebook display settings: print at most 35 rows, never truncate columns.
pd.set_option('display.max_rows', 35)
pd.set_option('display.max_columns', None)

In [2]:
# Load the processed water-point table and drop the 'id' column.
df = pd.read_csv('../../data/processed/WaterUpdated.csv').drop('id', axis=1)

# Collapse the three status labels into a binary target:
# 0 = functional, 1 = non functional or functional needs repair.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) through df.target: in-place edits made through an
# accessor may act on a temporary object and never reach df when pandas
# Copy-on-Write is active.
df['target'] = df['target'].replace(
    {'functional': 0, 'non functional': 1, 'functional needs repair': 1}
)

print(df.shape)
df.head(2)

(57247, 21)


Unnamed: 0,amount_tsh,gps_height,installer,longitude,latitude,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,Ludewa,Mundindi,109.0,1,0,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,0,12
1,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Mara,Serengeti,Natta,280.0,0,1,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,0,3


In [4]:
# Scaler set-up is left disabled; no scaler is created in this cell.
# standard = StandardScaler() 
# Display the column labels of the loaded frame.
df.columns

Index(['amount_tsh', 'gps_height', 'installer', 'longitude', 'latitude',
       'basin', 'region', 'lga', 'ward', 'population', 'public_meeting',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type', 'target', 'time_passed'],
      dtype='object')

In [9]:
# One-hot encode the categorical feature columns, then keep only the numeric
# columns and the selected dummy columns in new_df.
#
# 'basin' is encoded here as well: the column selection below uses
# unique_basin, which was never defined in this cell, so a fresh kernel raised
# NameError. It is assumed to follow the same pattern as the other unique_*
# lists -- TODO confirm the 'basin' prefix matches downstream expectations.
dummy_prefixes = {
    'basin': 'basin',
    'extraction_type': 'extract',
    'quantity': 'quantity',
    'water_quality': 'quality',
    'source': 'source',
    'waterpoint_type': 'waterpoint',
}

new_df = df
for column, prefix in dummy_prefixes.items():
    # join returns a new frame each time, so df itself is not modified.
    new_df = new_df.join(pd.get_dummies(df[column], prefix=prefix))

# Names of the dummy columns to keep. For source, quality and quantity the
# 'unknown' level is left out.
unique_basin = [f'basin_{i}' for i in df.basin.unique()]
unique_extract = [f'extract_{i}' for i in df.extraction_type.unique()]
unique_waterpoint = [f'waterpoint_{i}' for i in df.waterpoint_type.unique()]
unique_source = [f'source_{i}' for i in df.source.unique() if i != 'unknown']
unique_quality = [f'quality_{i}' for i in df.water_quality.unique() if i != 'unknown']
unique_quantity = [f'quantity_{i}' for i in df.quantity.unique() if i != 'unknown']

col = ['amount_tsh', 'gps_height', 'population', 'permit', 'time_passed', 'target', 'longitude', 'latitude']
col = col + unique_basin + unique_extract + unique_waterpoint + unique_source + unique_quality + unique_quantity
new_df = new_df[col]





57247 57247
0    27114
1    21545
Name: target, dtype: int64 0    4785
1    3803
Name: target, dtype: int64


In [63]:
import itertools

def basin_frame(basin, df):
    """Select the rows of ``df`` whose ``basin`` column equals ``basin``."""
    in_basin = df['basin'] == basin
    return df[in_basin]

def get_frames(df):
    """Lazily produce one sub-frame per distinct value of ``df.basin``.

    Returns a ``map`` iterator; frames come out in the order of
    ``df.basin.unique()``.
    """
    basin_names = df['basin'].unique()
    return map(lambda name: basin_frame(name, df=df), basin_names)

def return_resamp(df):
    """Return a class-balanced ``(X, y)`` split of ``df``.

    Rows are split on the ``target`` column (0 vs 1). The smaller class is
    upsampled with replacement to the size of the larger class, and the two
    are concatenated with the larger class first.

    The previous version compared ``len(func_df)`` with itself, so its first
    branch could never run, and it resampled a class to its own size, which
    left the class counts unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column with values 0 and 1.

    Returns
    -------
    X : pandas.DataFrame
        The balanced rows without the ``target`` column.
    y : pandas.DataFrame
        Single-column frame holding ``target`` for the same rows.
    """
    func_df = df.loc[df.target == 0]
    nonfunc_df = df.loc[df.target == 1]

    # Decide which class needs upsampling.
    if len(func_df) >= len(nonfunc_df):
        majority, minority = func_df, nonfunc_df
    else:
        majority, minority = nonfunc_df, func_df

    # Only resample when there is something to draw from and the classes are
    # actually imbalanced; an empty minority cannot be upsampled.
    if 0 < len(minority) < len(majority):
        minority = resample(minority, n_samples=len(majority), random_state=10)

    resampled_concat = pd.concat([majority, minority])

    # return resampled in X, y feature target form
    X = resampled_concat.drop('target', axis=1)
    y = resampled_concat[['target']]
    return X, y

def return_resamp_feature_targets(dict_dfs):
    """Apply ``return_resamp`` to every frame in ``dict_dfs``.

    Returns a list with one result per dict value, in the dict's iteration
    order.
    """
    return [return_resamp(frame) for frame in dict_dfs.values()]

# Short key for each basin's frame, keyed by the basin name as it appears in
# df.basin. Labelling by name rather than by position in df.basin.unique()
# means a change in row order cannot silently attach a label to the wrong basin.
basin_keys = {
    'Lake Nyasa': 'nya_df',
    'Lake Victoria': 'vic_df',
    'Pangani': 'pang_df',
    'Ruvuma / Southern Coast': 'ruv_df',
    'Internal': 'int_df',
    'Lake Tanganyika': 'tang_df',
    'Wami / Ruvu': 'wami_df',
    'Rufiji': 'ruf_df',
    'Lake Rukwa': 'rukwa_df',
}

# get_frames yields frames in df.basin.unique() order, so zipping against the
# same call pairs every frame with its own basin name.
frames_by_name = dict(zip(df.basin.unique(), get_frames(df)))
if set(frames_by_name) != set(basin_keys):
    mismatched = sorted(map(str, set(frames_by_name) ^ set(basin_keys)))
    raise ValueError(f'basin names do not match the expected set: {mismatched}')

basin_frames = {key: frames_by_name[name] for name, key in basin_keys.items()}
nya_df, vic_df, pang_df, ruv_df, int_df, tang_df, wami_df, ruf_df, rukwa_df = basin_frames.values()

# Label strings only (e.g. 'nya_df.X', 'nya_df.X_train'); these lists hold
# names, not data.
X = [f'{key}.X' for key in basin_frames]
y = [f'{key}.y' for key in basin_frames]
X_train = [f'{label}_train' for label in X]
X_test = [f'{label}_test' for label in X]
y_train = [f'{label}_train' for label in y]
y_test = [f'{label}_test' for label in y]

# One resampled (X, y) pair per basin, in basin_frames order. The earlier
# placeholder list of label pairs was overwritten immediately and is dropped.
resamp_frames = return_resamp_feature_targets(basin_frames)

In [75]:
# Spot-check resampling on the 'nya_df' frame.
nya = basin_frames['nya_df']

# Keep the split in plain variables. Assigning to nya.X / nya.y would set
# ad-hoc attributes on the DataFrame stored in basin_frames: pandas warns about
# that pattern and such attributes do not survive copies or slicing.
nya_X, nya_y = return_resamp(nya)

func_df = nya.loc[nya.target == 0]
nonfunc_df = nya.loc[nya.target == 1]

# NOTE: this bootstraps the target == 1 rows at their original count, so the
# value counts below show the class sizes unchanged by same-size resampling.
resamp_nonfunc = resample(nonfunc_df, n_samples=len(nonfunc_df), random_state=10)
resampled_concat = pd.concat([func_df, resamp_nonfunc])
resampled_concat.target.value_counts()

0    3309
1    1659
Name: target, dtype: int64

In [20]:
# Distinct basin names, shown as a plain Python list.
basins = df['basin'].unique()
list(basins)

['Lake Nyasa',
 'Lake Victoria',
 'Pangani',
 'Ruvuma / Southern Coast',
 'Internal',
 'Lake Tanganyika',
 'Wami / Ruvu',
 'Rufiji',
 'Lake Rukwa']

In [None]:
#print(len(X), len(new_df))

# pickle.dump(X.drop('target', axis = 1), open('../../data/processed/pickles/X.p', 'wb'))
# pickle.dump(y, open('../../data/processed/pickles/y.p', 'wb'))

# x_train, x_test, y_train, y_test = train_test_split(X,y, stratify = X.target, random_state = 10, train_size = .85)
# print(y_train.target.value_counts(), y_test.target.value_counts())
# x_train = x_train.drop('target', axis =1)
# x_test = x_test.drop('target', axis =1)

# pickle.dump(y_test, open('../../data/processed/pickles/y_test.p', 'wb'))
# pickle.dump(y_train, open('../../data/processed/pickles/y_train.p', 'wb'))
# pickle.dump(x_train, open('../../data/processed/pickles/x_train.p', 'wb'))
# pickle.dump(x_test, open('../../data/processed/pickles/x_test.p', 'wb'))