## Pre-Processing v3.0
### Nutzung: Import der Daten, Cleanup, Generierung der Rating Matrix, Testdatensatz, Popularity Data.

In [None]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [None]:
class Pre_Processer:
    """Pre-processing pipeline for user/content tracking data.

    The pipeline loads the raw CSV exports, cleans the tracking events,
    splits them chronologically into a training and a test set, builds the
    sparse user x content view-count matrices and derives a popularity
    ranking. Intermediate results are kept as attributes and written to
    ``data_dir``.

    Args:
        data_dir: Directory all input and output files live in. Defaults to
            the parent directory (``".."``), the location the pipeline has
            always used.
    """

    # Layout of the ``date`` strings written by ``clean_tracking``
    # (year-month-day; month and day are not zero padded).
    _DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, data_dir=".."):
        self.data_dir = data_dir
        # Prefix codes for the known content sub types; every other sub type
        # is mapped to "4" in ``generate_fullID_vectorized``.
        self.contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}

    def _path(self, name):
        """Return the location of file ``name`` inside the data directory."""
        return os.path.join(self.data_dir, name)

    @classmethod
    def _parse_dates(cls, dates):
        """Convert "Y-M-D" date strings into timestamps.

        Comparing the raw strings would be lexicographic ("2019-10-1" sorts
        before "2019-7-1"), so every date filter goes through this helper.
        """
        return pd.to_datetime(dates, format=cls._DATE_FORMAT)

    def read_data_full(self):
        """Load the raw user and tracking CSV files.

        Returns:
            Tuple ``(data_users, data_tracking)`` of DataFrames; both are
            also stored on the instance.
        """
        self.data_users = pd.read_csv(self._path("StatistaUsers10000.csv"))
        self.data_tracking = pd.read_csv(self._path("StatistaTracking10000.csv"))

        return self.data_users, self.data_tracking

    def read_pickles(self):
        """Load the training/test split saved by ``create_timesplit``.

        Returns:
            Tuple ``(training_set, test_set)``; both are also stored on the
            instance.
        """
        # NOTE(review): only load pickle files produced by this pipeline;
        # unpickling untrusted files can execute arbitrary code.
        self.training_set = pd.read_pickle(self._path("training_set.pkl"), compression="zip")
        self.test_set = pd.read_pickle(self._path("test_set.pkl"), compression="zip")

        return self.training_set, self.test_set

    def generate_fullID_vectorized(self, subType, contentId):
        """Build the combined content IDs ``"<type code>_<content id>"``.

        Args:
            subType: Sequence of content sub type names.
            contentId: Sequence of content IDs, aligned with ``subType``.

        Returns:
            List of strings, one per input pair. Sub types missing from
            ``self.contentType`` get the type code "4".
        """
        type_codes = self.contentType
        return [
            type_codes.get(sub_type, "4") + "_" + str(content_id)
            for sub_type, content_id in zip(subType, contentId)
        ]

    def clean_tracking(self, tracking_data, start_year=2017, end_date=None):
        """Filter the raw tracking data and add ``date`` and ``fullId``.

        Rows with ``idContent == 0`` are dropped, only the first six columns
        are kept (they must include ``idContent``, ``contentSubType``,
        ``year``, ``month`` and ``day``) and events before ``start_year``
        are removed. The result is stored on the instance and pickled to
        ``data_tracking.pkl``.

        Args:
            tracking_data: Raw tracking DataFrame.
            start_year: First year (inclusive) to keep.
            end_date: Optional last day (inclusive) to keep, e.g.
                ``"2019-7-31"``; ``None`` disables the upper bound.

        Returns:
            The cleaned DataFrame.
        """
        data = tracking_data[tracking_data.idContent != 0]
        data = data.iloc[:, 0:6]
        # Copy so the column assignments below never write into a view of
        # the caller's frame.
        data = data[data.year >= start_year].copy()
        data["date"] = data["year"].astype(str) + "-" + data["month"].astype(str) + "-" + data["day"].astype(str)
        if end_date is not None:
            # Compare real dates, not strings: as strings "2019-7-4" sorts
            # after "2019-7-31" and "2019-10-1" sorts before it.
            not_after_end = self._parse_dates(data["date"]) <= pd.Timestamp(end_date)
            data = data[not_after_end].copy()
        data["fullId"] = self.generate_fullID_vectorized(data["contentSubType"].values, data["idContent"].values)

        self.data_tracking = data
        data.to_pickle(self._path("data_tracking.pkl"), compression="zip")
        return data

    def create_timesplit(self, data, date):
        """Split ``data`` chronologically into a training and a test set.

        Args:
            data: DataFrame with a ``date`` column of "Y-M-D" strings.
            date: Split date; events before it go to the training set,
                events on or after it to the test set.

        Returns:
            Tuple ``(training_set, test_set)``; both are also stored on the
            instance and pickled to ``training_set.pkl`` / ``test_set.pkl``.
        """
        event_dates = self._parse_dates(data["date"])
        split_at = pd.Timestamp(date)

        self.training_set = data[event_dates < split_at]
        self.test_set = data[event_dates >= split_at]

        self.training_set.to_pickle(self._path("training_set.pkl"), compression="zip")
        self.test_set.to_pickle(self._path("test_set.pkl"), compression="zip")
        return self.training_set, self.test_set

    def generate_popularity_vector(self, data):
        """Count the views per content item.

        Args:
            data: DataFrame with a ``fullId`` column, one row per view.

        Returns:
            DataFrame with the columns ``fullId`` and ``views``, sorted by
            ``views`` in descending order.
        """
        counts = data[["fullId"]].copy()
        counts["views"] = 1.0  # every row is a single view
        counts = counts.groupby(by=["fullId"]).sum().reset_index()
        counts = counts.sort_values(by=["views"], ascending=False)

        return counts

    def create_matrices(self, training_set, test_set):
        """Build and save the sparse user x content view-count matrices.

        The training events are aggregated to one row per (user, content)
        pair. Rows of the matrices follow the sorted unique user IDs and
        columns the sorted unique content IDs; both ID lists are saved next
        to the matrices. All results are also stored on the instance.

        Args:
            training_set: DataFrame with the columns ``idUser`` and
                ``fullId``, one row per view.
            test_set: Not used; kept so existing callers keep working.
        """
        # Aggregate to one row per (user, content) pair with its view count.
        frame = training_set[["idUser", "fullId"]].copy()
        frame["views"] = 1.0
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()

        user = list(np.sort(frame.idUser.unique()))
        content = list(np.sort(frame.fullId.unique()))
        views = list(frame.views)
        # Category codes number the values in sorted order, so they line up
        # with the positions in ``user`` and ``content``.
        rows = frame.idUser.astype('category').cat.codes
        cols = frame.fullId.astype('category').cat.codes

        # Share of (user, content) cells without any recorded view.
        self.sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))

        matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        matrix_coo = sparse.coo_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        sparse.save_npz(self._path("ratings_matrix_csr"), matrix_csr, compressed=True)
        sparse.save_npz(self._path("ratings_matrix_coo"), matrix_coo, compressed=True)

        # Save the aggregated frame itself; ``self.training_set`` still holds
        # the raw, un-aggregated split (or does not exist at all).
        self.training_set_aggregated = frame
        frame.to_pickle(self._path("training_set_aggregated.pkl"), compression="zip")

        self.matrix_csr = matrix_csr
        self.matrix_coo = matrix_coo

        self.user = user
        self.content = content
        self.views = views
        self.rows = rows
        self.cols = cols

        np.save(self._path("user_ids"), user)
        np.save(self._path("content_ids"), content)

In [None]:
# Create the pre-processing object that all following cells operate on.
Data = Pre_Processer()

In [None]:
# Start the full pre-processing run: load the raw user and tracking CSV files.
user_data, data_tracking = Data.read_data_full()

In [None]:
# Clean the tracking data: keep events from 2017 up to the end date 2019-7-31.
data_tracking = Data.clean_tracking(data_tracking, start_year=2017, end_date = "2019-7-31")

In [None]:
# Split at 2019-7-1: events before that date form the training set, events on
# or after it the test set.
training_set, test_set = Data.create_timesplit(data_tracking, "2019-7-1")
# Build and save the rating matrices from the training set.
Data.create_matrices(training_set, test_set)

In [None]:
# Generate & save the popularity data vector.
# Only training events after 2019-04-30 count towards popularity. The dates are
# parsed before comparing: as plain strings "2019-4-5" would sort after
# "2019-4-30" and "2019-10-1" before it.
is_recent = pd.to_datetime(training_set["date"], format="%Y-%m-%d") > pd.Timestamp("2019-04-30")
popularity_data = Data.generate_popularity_vector(training_set[is_recent])
# Keep the IDs of at most the 5000 most viewed items, most popular first.
np.save("../popularity_data", popularity_data.fullId.values[:5000])