In [53]:
import pandas as pd
import numpy as np

class OurTransform:

    # Constants distributed by processing
    UNNECESSARY_LABELS = ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
                            'buy_d28', 'iap_revenue_d14', 'iap_revenue_d28',
                            'registration', 'retention_d1_to_d7', 'retention_d3_to_d7',
                            'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7',]
    UNNECESSARY_FEATURES = ['datetime', 'bundles_ins', 'city_hist', 'region_hist', 'dev_osv_hist', 'first_request_ts', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'iap_revenue_usd_bundle', 'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'hour_ratio', 'last_ins', 'last_install_ts_bundle', 'last_install_ts_category', 'advertiser_actions_action_last_timestamp', 'user_actions_bundles_action_last_timestamp', 'new_bundles', 'num_buys_bundle', 'user_bundles', 'user_bundles_l28d', 'advertiser_bundle', 'carrier', 'region', 'dev_model', 'dev_osv', 'hour', 'release_date', 'release_msrp', 'user_actions_bundles_action_count']
    
    FILLNA_BYAVG_NAMES = ['avg_act_days', 'weekend_ratio', 'wifi_ratio']
    FILLNA_BYMAX_NAMES = ['avg_days_ins', 'weeks_since_first_seen']

    TO_SEPARATE_NAMES = ['bcat', 'bcat_bottom_taxonomy']
    IGNORED_TO_SEPARATE = ['country_hist', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'dev_language_hist', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'num_buys_category', 'num_buys_category_bottom_taxonomy']
    
    ONE_HOT_ENCODER_NAMES = ['country', 'dev_os', 'advertiser_category']
    IGNORED_ONE_HOT_ENCODER_NAMES = ['dev_make', 'last_advertiser_action', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level']

    TO_REPLACE_BY_SUM = ['avg_daily_sessions', 'advertiser_actions_action_count', 'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue']
    TO_REPLACE_BY_MEAN = ['avg_duration', 'rev_by_adv', 'rwd_prank', 'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank']
    
    # Constants distributed by type
    LABEL_NAMES = ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
                    'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
                    'registration', 'retention_d1_to_d7', 'retention_d3_to_d7',
                    'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7']
    REQUEST_RELATED_FEATURES = ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory',
                                'advertiser_bottom_taxonomy_level', 'carrier', 'country', 'region',
                                'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date',
                                'release_msrp', 'weekday']
    USER_RELATED_FEATURES = ['avg_act_days', 'avg_daily_sessions',
                            'avg_days_ins', 'avg_duration', 'bcat', 'bcat_bottom_taxonomy',
                            'bundles_cat', 'bundles_cat_bottom_taxonomy', 'bundles_ins',
                            'city_hist', 'country_hist', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk',
                            'dev_language_hist', 'dev_osv_hist', 'first_request_ts',
                            'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy',
                            'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category',
                            'iap_revenue_usd_category_bottom_taxonomy', 'last_buy',
                            'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins',
                            'last_install_ts_bundle', 'last_install_ts_category',
                            'advertiser_actions_action_count',
                            'advertiser_actions_action_last_timestamp',
                            'user_actions_bundles_action_count',
                            'user_actions_bundles_action_last_timestamp', 'last_advertiser_action',
                            'new_bundles', 'num_buys_bundle', 'num_buys_category',
                            'num_buys_category_bottom_taxonomy', 'region_hist', 'rev_by_adv',
                            'rwd_prank', 'user_bundles', 'user_bundles_l28d', 'weekend_ratio',
                            'weeks_since_first_seen', 'wifi_ratio',
                            'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank',
                            'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue']
    AUX_NAMES = ['row_id', 'datetime']

    # Camps
    mean_dict: dict[str, float]
    max_dict: dict[str, float]
    categories_dict: dict[str, list[str]]
    total_n_categories: int
    total_n_numerics: int

    def __init__ (self) -> None:
        self.total_n_categories = 0
        self.total_n_numerics = 0
    
    def fit_transform (self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.__remove_unnecessary_features(df, fit = True, test = False)
        df = self.__remove_empty_rows(df, fit = True, test = False)
        self.max_dict = {}
        for col in self.FILLNA_BYMAX_NAMES:
            df = self.__fillna_by_max(df, col = col, fit = True)
        self.mean_dict = {}
        for col in self.FILLNA_BYAVG_NAMES:
            df = self.__fillna_by_mean(df, col = col, fit = True)
        self.categories_dict = {}
        for col in self.TO_SEPARATE_NAMES:
            df = self.__value_hot_encoding(df, col = col, fit = True)
        for col in self.ONE_HOT_ENCODER_NAMES:
            df = self.__one_hot_encoding(df, col = col, fit = True)
        for col in self.TO_REPLACE_BY_MEAN:
            df = self.__replacement_by_mean(df, col = col, fit = True)
        for col in self.TO_REPLACE_BY_SUM:
            df = self.__replacement_by_sum(df, col = col, fit = True)
        self.total_n_numerics = df.shape[1] - self.total_n_categories - len(self.UNNECESSARY_LABELS)
        return df
    
    def transform (self, df: pd.DataFrame, test: bool = False) -> pd.DataFrame:
        df = self.__remove_unnecessary_features(df, fit = False, test = test)
        df = self.__remove_empty_rows(df, fit = False, test = test)
        for col in self.FILLNA_BYMAX_NAMES:
            df = self.__fillna_by_max(df, col = col, fit = False)
        for col in self.FILLNA_BYAVG_NAMES:
            df = self.__fillna_by_mean(df, col = col, fit = False)
        for col in self.TO_SEPARATE_NAMES:
            df = self.__value_hot_encoding(df, col = col, fit = False)
        for col in self.ONE_HOT_ENCODER_NAMES:
            df = self.__one_hot_encoding(df, col = col, fit = False)
        for col in self.TO_REPLACE_BY_MEAN:
            df = self.__replacement_by_mean(df, col = col, fit = False)
        for col in self.TO_REPLACE_BY_SUM:
            df = self.__replacement_by_sum(df, col = col, fit = False)
        return df

    def __remove_unnecessary_features (self, df: pd.DataFrame, fit: bool = True, test: bool = False) -> pd.DataFrame:
        # if test:
        #     return df.drop(columns = self.UNNECESSARY_FEATURES
        #                    + self.IGNORED_TO_SEPARATE
        #                    + self.IGNORED_ONE_HOT_ENCODER_NAMES)
        return df.drop(columns = self.UNNECESSARY_FEATURES
                       + self.IGNORED_TO_SEPARATE
                       + self.IGNORED_ONE_HOT_ENCODER_NAMES)
    
    def __remove_empty_rows (self, df: pd.DataFrame, fit: bool = True, test: bool = False) -> pd.DataFrame:
        if test:
            return df
        # SPAGHETTTIIIIIIII
        # user_df = df.loc[:, self.USER_RELATED_FEATURES]\
        #             .drop(columns = list(set(self.FILLNA_BYAVG_NAMES
        #                                      + self.FILLNA_BYMAX_NAMES
        #                                      + self.TO_SEPARATE_NAMES
        #                                      + self.ONE_HOT_ENCODER_NAMES
        #                                      + self.TO_REPLACE_BY_MEAN
        #                                      + self.TO_REPLACE_BY_SUM)\
        #                                  .intersection(set(self.USER_RELATED_FEATURES))))
        user_df = df.drop(columns = list(set(df.columns) - set(self.USER_RELATED_FEATURES)))
        return df.loc[user_df.dropna(how = 'all').index, :]
    
    def __fillna_by_mean (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        if fit:
            self.mean_dict[col] = df[col].mean()
        df.loc[:, col] = df.loc[:, col].fillna(self.mean_dict[col])
        return df
    
    def __fillna_by_max (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        if fit:
            self.max_dict[col] = df[col].max()
        df.loc[:, col] = df.loc[:, col].fillna(self.max_dict[col])
        return df
    
    def __value_hot_encoding (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        OTHER = "other"
        if fit:
            attrs = set()
            for row in df[col]:
                if isinstance(row, list):
                    for cat, _ in row:
                        attrs.add(col + "_" + cat.split("/")[0])
            attrs.add(col + "_" + OTHER)
            self.categories_dict[col] = attrs
        # Afegim columnes
        for new_col in self.categories_dict[col]:
            df[new_col] = 0
        # Fem allÃ² d'afegir els valors
        for idx, row in df.iterrows():
            if isinstance(row[col], list):
                for cat, val in row[col]:
                    first_attr = col + "_" + cat.split("/")[0]
                    if first_attr in self.categories_dict[col]:
                        df.at[idx, first_attr] += val
                    else:
                        df.at[idx, col + "_" + OTHER] += val
        # Dropeja la original
        # print(df.loc[:, self.categories_dict[col]].describe())
        return df.drop(columns = [col])
    
    def __one_hot_encoding (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        OTHER = "other"
        df[col] = df[col].fillna(OTHER)
        if fit:
            # self.categories_dict[col] = list(col + "_" + df[col].unique())
            # self.categories_dict[col] = [c.lower() for c in self.categories_dict[col]]
            # print(self.categories_dict[col])
            original_len = df.shape[1]
            df = pd.get_dummies(df, columns = [col])
            new_columns = list(df.columns[original_len - 1:])
            if (col + "_" + OTHER) not in df.columns:
                df[col + "_" + OTHER] = 0
                new_columns.append(col + "_" + OTHER)
            self.categories_dict[col] = new_columns
            self.total_n_categories += len(self.categories_dict[col])
        else:
            for new_col in self.categories_dict[col]:
                df[new_col] = 0
            for idx, row in df.iterrows():
                first_attr = col + "_" + row[col].lower()
                if first_attr in self.categories_dict[col]:
                    df.at[idx, first_attr] = 1
                else:
                    df.at[idx, col + "_" + OTHER] = 1
            df = df.drop(columns = [col])
        # for new_col in self.categories_dict[col]:
        #     print(df[new_col].value_counts())
        return df
    
    def __replacement_by_sum (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        df[col] = df[col].apply(lambda lst : sum(v for _, v in lst) if isinstance(lst, list) else 0)
        return df
    
    def __replacement_by_mean (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        df[col] = df[col].apply(lambda lst : np.mean([v for _, v in lst]) if isinstance(lst, list) else 0)
        return df


In [54]:
import dask
import dask.dataframe as dd
import warnings
warnings.filterwarnings("ignore")

_ = dask.config.set({"dataframe.convert-string": False})

dataset_path = "smadex-challenge-predict-the-revenue/train/train"
testset_path = "smadex-challenge-predict-the-revenue/test/test"
end_path = "smadex-challenge-predict-the-revenue/train_preprocess/train_preprocess-v2"
endtest_path = "smadex-challenge-predict-the-revenue/test_preprocess/test_preprocess-v2"

ddf = dd.read_parquet(dataset_path)

N = 20
transformer = OurTransform()
for i, part in enumerate(ddf.to_delayed()):
    df = part.compute()
    if i == 0:
        df = transformer.fit_transform(df)
    else:
        df = transformer.transform(df, test = False)
    if i == 0:
        columns = df.columns
    else:
        print(columns)
        print(df.columns)
        print(pd.DataFrame(columns == df.columns).value_counts())
    df.to_parquet(end_path + f"/parquet-{i}")
    if i == N - 1:
        break

ddf_test = dd.read_parquet(testset_path)

for i, part in enumerate(ddf_test.to_delayed()):
    df = part.compute()
    df = transformer.transform(df, test = True)
    df.to_parquet(endtest_path + f"/parquet-{i}")

Index(['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
       'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
       ...
       'advertiser_category_other', 'advertiser_category_photo & video',
       'advertiser_category_productivity',
       'advertiser_category_real money casino', 'advertiser_category_shopping',
       'advertiser_category_social networking',
       'advertiser_category_sport betting', 'advertiser_category_sports',
       'advertiser_category_travel', 'advertiser_category_utilities'],
      dtype='object', length=528)
Index(['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
       'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
       ...
       'advertiser_category_other', 'advertiser_category_photo & video',
       'advertiser_category_productivity',
       'advertiser_category_real money casino', 'advertiser_category_shopping',
       'advertiser_category_social networking',
       

OSError: Cannot save file into a non-existent directory: 'smadex-challenge-predict-the-revenue/test_preprocess/test_preprocess-v2'

In [56]:
for i, part in enumerate(ddf_test.to_delayed()):
    df = part.compute()
    df = transformer.transform(df, test = True)
    df.to_parquet(endtest_path + f"/parquet-{i}")

KeyboardInterrupt: 

In [58]:
print(transformer.total_n_categories)
print(transformer.total_n_numerics)

223
289
