In [None]:
import os
import gc
import tqdm
import pickle
import numpy as np
import pandas as pd

In [None]:
# Run a full garbage-collection pass up front. Because this call is the last
# expression of the cell, its return value (the number of unreachable objects
# found) is displayed as the cell output.
gc.collect()

In [None]:
# Pickled recommendation chunks to process; the files are numbered 1 through 4
# and only differ in that trailing chunk number.
chunk_path_list = [
    '../input/hm-trained-models/hybrid_generic/top_200_articles_chunk_{}.pkl'.format(chunk_no)
    for chunk_no in range(1, 5)
]

In [None]:
def _is_integer_valued(series):
    """
    Tell whether a float column can be stored as integers without losing information.
    :param series: float column to inspect (pd.Series)
    :return: True only if the column is non-empty and every value is a finite whole number (bool)
    """
    values = series.to_numpy(dtype=np.float64, na_value=np.nan)
    # NaN / inf cannot be represented by a NumPy integer dtype.
    if values.size == 0 or not np.isfinite(values).all():
        return False
    # Beyond 2**53 a float64 no longer identifies a single integer.
    if np.abs(values).max() > 2 ** 53:
        return False
    return bool((values == np.round(values)).all())


def reduce_mem_usage(df, int_cast=True, obj_to_category=False, subset=None):
    """
    Iterate through the columns of a dataframe and downcast their dtypes to reduce memory usage.
    The dataframe is modified in place and also returned.

    Integer columns (signed, unsigned or nullable without missing values) get the first dtype of
    int8, uint8, int16, uint16, int32, uint32, int64, uint64 whose inclusive range holds the column.
    Float columns get float16 if their range fits, else float32, else float64. Only the value
    range is checked, so float16/float32 downcasts can lose precision (float16 keeps roughly
    three significant decimal digits).
    Boolean, datetime, category and other non-numeric columns are left untouched, except that
    object columns are converted to category when obj_to_category is set.

    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if float columns holding only whole numbers should be cast to int (bool)
    :param obj_to_category: convert object columns to category dtype (bool)
    :param subset: subset of columns to analyse; all columns when None (list)
    :return: dataset with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    gc.collect()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()

    # Candidates are tried in order; the first one whose range holds the column wins.
    int_candidates = (np.int8, np.uint8, np.int16, np.uint16,
                      np.int32, np.uint32, np.int64, np.uint64)
    float_candidates = (np.float16, np.float32)

    for col in tqdm.tqdm(cols):
        series = df[col]
        col_type = series.dtype

        # bool already takes one byte per value; casting it would only change its meaning.
        if pd.api.types.is_bool_dtype(col_type):
            continue

        is_int = pd.api.types.is_integer_dtype(col_type)
        is_float = pd.api.types.is_float_dtype(col_type)

        if is_int or is_float:
            # Empty or all-missing columns have no min/max to base a decision on.
            if series.isna().all():
                continue

            c_min = series.min()
            c_max = series.max()

            # test if column can be converted to an integer
            treat_as_int = is_int or (int_cast and _is_integer_valued(series))

            if treat_as_int:
                # A nullable integer column with missing values cannot become a NumPy int dtype.
                if series.isna().any():
                    continue
                # Python ints compare exactly against every iinfo bound, including uint64's.
                lowest, highest = int(c_min), int(c_max)
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if lowest >= limits.min and highest <= limits.max:
                        df[col] = series.astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = series.astype(candidate)
                        break
                else:
                    # Range exceeds float32 (e.g. contains inf): keep full precision.
                    df[col] = series.astype(np.float64)
        elif obj_to_category and pd.api.types.is_object_dtype(col_type):
            df[col] = series.astype('category')

    gc.collect()
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    # A frame that reports zero bytes has no meaningful percentage to show.
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
class DataCompilation:
    """
    Turn one pickled chunk of per-customer recommendations into a long-format table.

    The unpickled chunk is used as a dict keyed by customer id whose values are dicts with the
    keys 'top_200_articles' and 'top_200_scores', each holding one space-separated string.
    """

    def __init__(self, chunk_path=None):
        """
        :param chunk_path: path of the pickle file to load; it is opened immediately (str)
        """
        self._chunk_path = chunk_path
        self._chunk_dict = self._read_data(chunk_path)

    def _read_data(self, path):
        """
        Load and return the object stored in the pickle file at `path`.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in the file.
        # Only point this at chunk files from a trusted source.
        with open(path, 'rb') as input_file:
            return pickle.load(input_file)

    def _write_data(self, df_data):
        """
        Write `df_data` as CSV (without the index) into the current working directory.
        The file name is the chunk's base name with '.csv' appended.
        """
        df_data.to_csv(os.path.basename(self._chunk_path) + '.csv', index=False)

    def perform_stack(self, df, stacked_column_name=None):
        """
        Explode a column of space-separated strings into one row per token.

        :param df: frame with a 'customer_id' column and the column to stack (pd.DataFrame)
        :param stacked_column_name: name of the space-separated column (str)
        :return: frame with columns customer_id, level_1 (0-based position of the token inside
                 its original string) and the stacked column (pd.DataFrame)
        """
        if stacked_column_name is None:
            raise ValueError('stacked_column_name must be provided')

        # Fresh 0..n-1 index so that each source row forms its own group below.
        stacked_df = pd.DataFrame({
            'customer_id': df.customer_id,
            stacked_column_name: df[stacked_column_name].str.split(' '),
        }).reset_index(drop=True)

        # explode() repeats the source row index for every token, so a per-index running
        # count is exactly the token's position in its original string.
        stacked_df = stacked_df.explode(stacked_column_name)
        stacked_df['level_1'] = stacked_df.groupby(level=0).cumcount()
        stacked_df = stacked_df.reset_index(drop=True)
        return stacked_df[['customer_id', 'level_1', stacked_column_name]]

    def transform_df(self):
        """
        Build the long-format table for the loaded chunk.

        :return: one row per (customer, position) with columns customer_id, top_200_articles,
                 top_200_scores and level_1, where article and score share the same position
                 (pd.DataFrame)
        """
        # Generate pandas dataframe: one row per customer, both lists still space-separated.
        customer_attrs = list(self._chunk_dict.values())
        df = pd.DataFrame({
            'customer_id': list(self._chunk_dict.keys()),
            'top_200_articles': [attrs['top_200_articles'] for attrs in customer_attrs],
            'top_200_scores': [attrs['top_200_scores'] for attrs in customer_attrs],
        }, dtype=object)

        # article stack
        df_article = self.perform_stack(df, 'top_200_articles')
        df_article = reduce_mem_usage(df_article)
        print(df_article.shape)

        # score stack
        df_score = self.perform_stack(df, 'top_200_scores')
        df_score = reduce_mem_usage(df_score)
        print(df_score.shape)

        # Merge both stacked dataframes. The position (level_1) must be part of the key:
        # joining on customer_id alone pairs every article with every score of that customer.
        df_final = pd.merge(
            df_article[['customer_id', 'level_1', 'top_200_articles']],
            df_score[['customer_id', 'level_1', 'top_200_scores']],
            how='left',
            on=['customer_id', 'level_1'],
        )
        df_final = df_final[['customer_id', 'top_200_articles', 'top_200_scores', 'level_1']]
        df_final = reduce_mem_usage(df_final)
        print(df_final.shape)

        del df_article
        del df_score
        gc.collect()

        return df_final

In [None]:
# Convert every pickled chunk into its own CSV file, one chunk at a time.
for chunk_path in tqdm.tqdm(chunk_path_list, desc='Iterating through chunks'):
    dc_instance = DataCompilation(chunk_path)
    chunk_df = dc_instance.transform_df()
    print(chunk_df.shape)
    dc_instance._write_data(chunk_df)
    # Drop the instance as well as the frame: it still holds the whole unpickled chunk,
    # which would otherwise stay alive while the next chunk is being loaded.
    del chunk_df, dc_instance
    gc.collect()