In [None]:
import pandas as pd
import numpy as np

class OurTransform:

    # Constants distributed by processing
    UNNECESSARY_LABELS = ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
                            'buy_d28', 'iap_revenue_d14', 'iap_revenue_d28',
                            'registration', 'retention_d1_to_d7', 'retention_d3_to_d7',
                            'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7',]
    UNNECESSARY_FEATURES = ['datetime', 'bundles_ins', 'city_hist', 'region_hist', 'dev_osv_hist', 'first_request_ts', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'iap_revenue_usd_bundle', 'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'hour_ratio', 'last_ins', 'last_install_ts_bundle', 'last_install_ts_category', 'advertiser_actions_action_last_timestamp', 'user_actions_bundles_action_last_timestamp', 'new_bundles', 'num_buys_bundle', 'user_bundles', 'user_bundles_l28d', 'advertiser_bundle', 'carrier', 'region', 'dev_model', 'dev_osv', 'hour', 'release_date', 'release_msrp', 'weekday', 'user_actions_bundles_action_count']
    
    FILLNA_BYAVG_NAMES = ['avg_act_days', 'weekend_ratio', 'wifi_ratio']
    FILLNA_BYMAX_NAMES = ['avg_days_ins', 'weeks_since_first_seen']

    TO_SEPARATE_NAMES = ['bcat']
    IGNORED_TO_SEPARATE = ['country_hist', 'bcat_bottom_taxonomy', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'dev_language_hist', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'num_buys_category', 'num_buys_category_bottom_taxonomy']
    
    ONE_HOT_ENCODER_NAMES = [ 'advertiser_category', 'country', 'dev_os']
    IGNORED_ONE_HOT_ENCODER_NAMES = ['last_advertiser_action', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'dev_make']

    TO_REPLACE_BY_SUM = ['avg_daily_sessions', 'advertiser_actions_action_count', 'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue']
    TO_REPLACE_BY_MEAN = ['avg_duration', 'rev_by_adv', 'rwd_prank', 'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank']
    
    # Constants distributed by type
    LABEL_NAMES = ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14',
                    'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
                    'registration', 'retention_d1_to_d7', 'retention_d3_to_d7',
                    'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7']
    REQUEST_RELATED_FEATURES = ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory',
                                'advertiser_bottom_taxonomy_level', 'carrier', 'country', 'region',
                                'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date',
                                'release_msrp', 'weekday']
    USER_RELATED_FEATURES = ['avg_act_days', 'avg_daily_sessions',
                            'avg_days_ins', 'avg_duration', 'bcat', 'bcat_bottom_taxonomy',
                            'bundles_cat', 'bundles_cat_bottom_taxonomy', 'bundles_ins',
                            'city_hist', 'country_hist', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk',
                            'dev_language_hist', 'dev_osv_hist', 'first_request_ts',
                            'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy',
                            'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category',
                            'iap_revenue_usd_category_bottom_taxonomy', 'last_buy',
                            'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins',
                            'last_install_ts_bundle', 'last_install_ts_category',
                            'advertiser_actions_action_count',
                            'advertiser_actions_action_last_timestamp',
                            'user_actions_bundles_action_count',
                            'user_actions_bundles_action_last_timestamp', 'last_advertiser_action',
                            'new_bundles', 'num_buys_bundle', 'num_buys_category',
                            'num_buys_category_bottom_taxonomy', 'region_hist', 'rev_by_adv',
                            'rwd_prank', 'user_bundles', 'user_bundles_l28d', 'weekend_ratio',
                            'weeks_since_first_seen', 'wifi_ratio',
                            'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank',
                            'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue']
    AUX_NAMES = ['row_id', 'datetime']

    # Camps
    mean_dict: dict[str, float]
    max_dict: dict[str, float]
    categories_dict: dict[str, list[str]]

    def __init__ (self) -> None:
        pass
    
    def fit_transform (self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.__remove_unnecessary_features(df, fit = True, test = False)
        df = self.__remove_empty_rows(df, fit = True, test = False)
        for col in self.FILLNA_BYMAX_NAMES:
            self.max_dict = {}
            df = self.__fillna_by_max(df, col = col, fit = True)
        for col in self.FILLNA_BYAVG_NAMES:
            self.mean_dict = {}
            df = self.__fillna_by_mean(df, col = col, fit = True)
        for col in self.TO_SEPARATE_NAMES:
            self.categories_dict = {}
            df = self.__value_hot_encoding(df, col = col, fit = True)
        for col in self.ONE_HOT_ENCODER_NAMES:
            df = self.__one_hot_encoding(df, col = col, fit = True)
        for col in self.TO_REPLACE_BY_MEAN:
            df = self.__replacement_by_mean(df, col = col, fit = True)
        for col in self.TO_REPLACE_BY_SUM:
            df = self.__replacement_by_sum(df, col = col, fit = True)
        return df
    
    def transform (self, df: pd.DataFrame, test: bool = False) -> pd.DataFrame:
        df = self.__remove_unnecessary_features(df, fit = False, test = test)
        df = self.__remove_empty_rows(df, fit = False, test = test)
        for col in self.FILLNA_BYMAX_NAMES:
            df = self.__fillna_by_max(df, col = col, fit = False)
        for col in self.FILLNA_BYAVG_NAMES:
            df = self.__fillna_by_mean(df, col = col, fit = False)
        for col in self.TO_SEPARATE_NAMES:
            df = self.__value_hot_encoding(df, col = col, fit = False)
        for col in self.ONE_HOT_ENCODER_NAMES:
            df = self.__one_hot_encoding(df, col = col, fit = False)
        for col in self.TO_REPLACE_BY_MEAN:
            df = self.__replacement_by_mean(df, col = col, fit = False)
        for col in self.TO_REPLACE_BY_SUM:
            df = self.__replacement_by_sum(df, col = col, fit = False)
        return df

    def __remove_unnecessary_features (self, df: pd.DataFrame, fit: bool = True, test: bool = False) -> pd.DataFrame:
        if test:
            return df.drop(columns = self.UNNECESSARY_FEATURES
                           + self.IGNORED_TO_SEPARATE
                           + self.IGNORED_ONE_HOT_ENCODER_NAMES)
        return df.drop(columns = self.UNNECESSARY_LABELS
                       + self.UNNECESSARY_FEATURES
                       + self.IGNORED_TO_SEPARATE
                       + self.IGNORED_ONE_HOT_ENCODER_NAMES)
    
    def __remove_empty_rows (self, df: pd.DataFrame, fit: bool = True, test: bool = False) -> pd.DataFrame:
        if test:
            return df
        # SPAGHETTTIIIIIIII
        # user_df = df.loc[:, self.USER_RELATED_FEATURES]\
        #             .drop(columns = list(set(self.FILLNA_BYAVG_NAMES
        #                                      + self.FILLNA_BYMAX_NAMES
        #                                      + self.TO_SEPARATE_NAMES
        #                                      + self.ONE_HOT_ENCODER_NAMES
        #                                      + self.TO_REPLACE_BY_MEAN
        #                                      + self.TO_REPLACE_BY_SUM)\
        #                                  .intersection(set(self.USER_RELATED_FEATURES))))
        user_df = df.drop(columns = list(set(df.columns) - set(self.USER_RELATED_FEATURES)))
        return df.loc[user_df.dropna(how = 'all').index, :]
    
    def __fillna_by_mean (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        if fit:
            self.mean_dict[col] = df[col].mean()
        df.loc[:, col] = df.loc[:, col].fillna(self.mean_dict[col])
        return df
    
    def __fillna_by_max (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        if fit:
            self.max_dict[col] = df[col].max()
        df.loc[:, col] = df.loc[:, col].fillna(self.max_dict[col])
        return df
    
    def __value_hot_encoding (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        OTHER = "Other"
        if fit:
            attrs = set()
            for row in df[col]:
                if isinstance(row, list):
                    for cat, _ in row:
                        attrs.add(cat.split("/")[0])
            attrs.add(OTHER)
            self.categories_dict[col] = attrs
        # Afegim columnes
        for new_col in self.categories_dict[col]:
            df[new_col] = 0
        # Fem allÃ² d'afegir els valors
        for idx, row in df.iterrows():
            if isinstance(row[col], list):
                for cat, val in row["bcat"]:
                    first_attr = cat.split("/")[0]
                    if first_attr in self.categories_dict[col]:
                        df.at[idx, first_attr] += val
                    else:
                        df.at[idx, OTHER] += val
        # Dropeja la original
        return df.drop(columns = [col])
    
    def __one_hot_encoding (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        return pd.get_dummies(df, columns = [col])
    
    def __replacement_by_sum (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        df[col] = df[col].apply(lambda lst : sum(v for _, v in lst) if isinstance(lst, list) else 0)
        return df
    
    def __replacement_by_mean (self, df: pd.DataFrame, col: str, fit: bool = True) -> pd.DataFrame:
        df[col] = df[col].apply(lambda lst : np.mean([v for _, v in lst]) if isinstance(lst, list) else 0)
        return df


In [18]:
import dask
import dask.dataframe as dd

_ = dask.config.set({"dataframe.convert-string": False})

dataset_path = "smadex-challenge-predict-the-revenue/train/train"
end_path = "smadex-challenge-predict-the-revenue/train_preprocess/train_preprocess"
ddf = dd.read_parquet(dataset_path)

transformer = OurTransform()
for i, part in enumerate(ddf.to_delayed()):
    df = part.compute()
    print(df["avg_days_ins"])
    if i == 0:
        df = transformer.fit_transform(df)
    else:
        df = transformer.transform(df, test = False)
    print(df)
    df.to_parquet(end_path + f"/parquet-{i}")
    if i == 4:
        break


0         17.0
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
121882     NaN
121883     NaN
121884     NaN
121885     NaN
121886     NaN
Name: avg_days_ins, Length: 121887, dtype: float64


  return _methods._mean(a, axis=axis, dtype=dtype,


        iap_revenue_d7  avg_act_days  avg_daily_sessions  avg_days_ins  \
0             2.147718           2.0                   0          17.0   
2             0.000000           4.0                   5          28.0   
3             0.000000           3.0                   0          28.0   
4             0.000000           7.0                   8          28.0   
6             0.000000           4.0                   1          28.0   
...                ...           ...                 ...           ...   
121879        0.000000           7.0                   2           4.0   
121880        0.000000           4.5                   0          28.0   
121883        0.000000           2.0                   0          28.0   
121885        0.000000           2.0                   1          28.0   
121886        0.000000           7.0                   0          28.0   

        avg_duration  advertiser_actions_action_count  rev_by_adv  rwd_prank  \
0                0.0           

KeyError: 'avg_days_ins'