In [10]:
import numpy as np
import pandas as pd

%matplotlib inline

# ETL

In [11]:
def transform(df):
    """Clean a raw water-point frame and add engineered feature columns.

    The frame is modified in place and the same object is returned, so
    calling this twice on the same frame fails (the dropped columns are
    already gone).

    Steps, in order:

    * parse ``date_recorded`` as datetimes;
    * add zero/one indicator columns for ``amount_tsh``, ``num_private``
      and ``population``, then replace each of those columns with
      ``log10(x + 1)``;
    * add ``is_govt_funded`` (``funder == 'Government Of Tanzania'``);
    * blank out sentinel values: coordinates whose longitude is below 5,
      ``wpt_name == 'none'``, ``scheme_name == 'None'`` and
      ``construction_year == 0`` all become missing;
    * add ``age`` = year of ``date_recorded`` minus ``construction_year``,
      floored at 0 (missing when the construction year is missing);
    * drop ``recorded_by``, ``payment`` and ``quantity_group``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, transformed.

    Raises
    ------
    KeyError
        If any required column is absent.
    """
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])

    # Indicator columns are computed before the log transform overwrites
    # the raw values they are based on.
    df['amount_tsh_is_zero'] = df['amount_tsh'] < 1e-6
    df['amount_tsh'] = np.log10(df['amount_tsh'] + 1)

    df['is_govt_funded'] = df['funder'] == 'Government Of Tanzania'

    # Rows with longitude < 5 lose both coordinates.
    # NOTE(review): presumably these are placeholder coordinates - confirm.
    bad_coords = df['longitude'] < 5
    df.loc[bad_coords, ['longitude', 'latitude']] = np.nan

    df.loc[df['wpt_name'] == 'none', 'wpt_name'] = None

    df['num_private_is_zero'] = df['num_private'] == 0
    df['num_private'] = np.log10(df['num_private'] + 1)

    df['population_is_zero'] = df['population'] == 0
    df['population_is_one'] = df['population'] == 1
    df['population'] = np.log10(df['population'] + 1)

    df.loc[df['scheme_name'] == 'None', 'scheme_name'] = None
    df.loc[df['construction_year'] == 0, 'construction_year'] = np.nan

    # `Series.clip_lower` no longer exists in pandas >= 1.0, and clipping a
    # column selection with inplace=True is not guaranteed to write back to
    # the frame; compute the clipped series and assign it explicitly.
    years_in_service = df['date_recorded'].dt.year - df['construction_year']
    df['age'] = years_in_service.clip(lower=0)

    df.drop(columns=['recorded_by', 'payment', 'quantity_group'], inplace=True)
    return df

## Training set

In [12]:
# Read the training features and labels from CSV files in the working
# directory, in that order.
X_train, y_train = (
    pd.read_csv(csv_name) for csv_name in ('X_train.csv', 'y_train.csv')
)

In [13]:
# Join features to labels on `id`, then use `id` as the row index.
train = X_train.merge(y_train, on='id').set_index('id')

In [14]:
# Run the shared cleaning / feature-engineering step on the training frame.
train = train.pipe(transform)

In [15]:
# Persist the transformed training frame as gzip-compressed Parquet.
train.to_parquet(
    'train.parquet',
    compression='gzip',
)

## Test set

In [16]:
# Index the test frame by `id` at load time so it has the same layout as the
# training frame (which is indexed by `id`); otherwise `id` would pass
# through transform() and be written to test.parquet as an ordinary column.
# Assumes X_test.csv has the same `id` column as X_train.csv - TODO confirm.
test = pd.read_csv('X_test.csv', index_col='id')

In [17]:
# Run the same cleaning / feature-engineering step used for the training frame.
test = test.pipe(transform)

In [18]:
# Persist the transformed test frame as gzip-compressed Parquet.
test.to_parquet(
    'test.parquet',
    compression='gzip',
)
