In [1]:
import pandas as pd
import numpy as np
import pickle
import yaml
from yaml.loader import SafeLoader
import os
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Leemos los parámetros del preprocess

In [2]:
# Cambiamos el directorio de trabajo a la carpeta models
os.chdir('../../')

In [3]:
print("Directorio: {0}".format(os.getcwd()))

Directorio: /Users/mohamed.rios/Projects/desafio_etermax/models


In [4]:
with open('resources.yaml') as f:
    resources = yaml.load(f, Loader=SafeLoader)
    params = resources.get('preprocess_params')
    pipeline_path = resources.get('artifact_path')

# Lectura del raw dataset

In [5]:
df = pd.read_csv(params['raw_data_path'])

In [6]:
df.head()

Unnamed: 0,user_id,country,source,platform,device_family,event_1,event_2,revenue
0,44023fc9,it,Organic,iOS,Apple iPhone,28,1,0.097079
1,990ce89e,de,Organic,Android,Samsung Galaxy Phone,51,5,0.233669
2,0c103f51,pt,Organic,iOS,Apple iPhone,647,157,1.630575
3,ab2de453,pt,Organic,iOS,Apple iPad,35,0,0.084876
4,45c0aca3,fr,Non-Organic,Android,Samsung Galaxy Phone,0,0,0.0


# Preprocess dataset

### Pipeline functions

In [7]:
def apply_pipeline(f_list: list, df: pd.DataFrame, params=params) -> pd.DataFrame:
    for f in f_list:
        df = f(df, params)
    return df

### Preprocess Functions

In [8]:
def delete_outliers(df: pd.DataFrame, params) -> pd.DataFrame:
    df = df[
                (df['revenue'] < params['max_revenue']) 
              & (df['event_1'] < params['max_event_1']) 
              & (df['event_2'] < params['max_event_2'])
    ]
    return df


def create_country_features(df: pd.DataFrame, params) -> pd.DataFrame:
    df['country_category'] = df.country.str.lower()
    df['country_category'] = np.where(df['country_category'].isin(params['valid_countries']),
                                        df['country_category'],
                                        'otros')
    return df


def create_source_features(df: pd.DataFrame, params) -> pd.DataFrame:
    df['source_category'] = df.source.str.lower()
    df['source_category'] = np.where(~df['source_category'].isin(params['valid_sources']),
                                         'desconocido', 
                                         df['source_category'])
    return df


def create_platform_features(df: pd.DataFrame, params) -> pd.DataFrame:
    df['platform_category'] = df.platform.str.lower()
    df['platform_category'] = np.where(~df['platform_category'].isin(params['valid_platforms']),
                                         'desconocido',
                                         df['platform_category'])
    return df


def create_device_features(df: pd.DataFrame, params) -> pd.DataFrame:
    df['device_category'] = df.device_family.str.lower()
    df['device_category'] = np.where(~df['device_category'].isin(params['valid_devices']),
                                         'otros', 
                                         df['device_category'])
    return df


def create_event_1_flag(df: pd.DataFrame, params) -> pd.DataFrame:
    df['has_event_1'] = np.where(df['event_1'] > 0, 1, 0)
    return df


def create_event_2_flag(df: pd.DataFrame, params) -> pd.DataFrame:
    df['has_event_2'] = np.where(df['event_2'] > 0, 1, 0)
    return df


def drop_metadata_columns(df: pd.DataFrame, params) -> pd.DataFrame:
    return df.drop(axis=1, columns=params['metadata_columns'])



In [9]:
df_processed = delete_outliers(df.copy(), params)
x = df_processed.drop(axis=1, columns=['revenue'])
y = df_processed.revenue
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [10]:
preprocess_f_list = [create_country_features,
                    create_source_features, create_platform_features, create_device_features,
                    create_event_1_flag, create_event_2_flag, drop_metadata_columns]
x_train = apply_pipeline(preprocess_f_list, x_train)
x_test = apply_pipeline(preprocess_f_list, x_test)

In [11]:
x_train.head(3)

Unnamed: 0,event_1,event_2,country_category,source_category,platform_category,device_category,has_event_1,has_event_2
24650,0,0,es,organic,android,otros,0,0
28516,3,0,fr,non-organic,android,otros,1,0
905,29,0,es,organic,android,otros,1,0


# Scikit pipeline

In [12]:
ohe = OneHotEncoder()
scaler = StandardScaler()
normalizer = Normalizer()
column_transformer = make_column_transformer(
                (ohe, x_train.select_dtypes(include='object').columns),
                (scaler, params['numerical_colums']),
                (normalizer, params['numerical_colums'])
)

In [13]:
pipeline = make_pipeline(column_transformer)

In [14]:
pipeline.fit(x_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  Index(['country_category', 'source_category', 'platform_category',
       'device_category'],
      dtype='object')),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['event_1', 'event_2']),
                                                 ('normalizer', Normalizer(),
                                                  ['event_1', 'event_2'])]))])

In [15]:
x_train_t = pipeline.transform(x_train)
x_test_t = pipeline.transform(x_test)

# Guardo preprocess pipeline

In [16]:
with open(pipeline_path + '/pipeline','wb') as outfile:
        pickle.dump(pipeline, outfile)

# Escribo datasets procesados

In [17]:

pd.DataFrame(x_train_t).to_csv(params['x_train_path'], index=False)
pd.DataFrame(x_test_t).to_csv(params['x_test_path'], index=False)
pd.DataFrame(y_train).to_csv(params['y_train_path'], index=False)
pd.DataFrame(y_test).to_csv(params['y_test_path'], index=False)