## Pre-Processing v3.0
### Vectorized.


In [1]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [18]:
class Pre_Processer:
    def __init__(self):
        self.contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}
        
    def read_data_full(self):
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_csv("../StatistaTracking10000.csv")

        return self.data_users, self.data_tracking
    
    def read_pickles(self):
        self.training_set = pd.read_pickle(f"../training_set.pkl", compression="zip")
        self.test_set = pd.read_pickle(f"../test_set.pkl", compression="zip")
        
        return self.training_set, self.test_set
    
    def generate_fullID_vectorized(self, subType, contentId):
        array = []
        for i in range(len(subType)):
            array.append((self.contentType[subType[i]] if subType[i] in self.contentType else "4") + "_" + str(contentId[i]))

        return array
    
    def clean_tracking(self, tracking_data, start_year = 2017, end_date = None):
        data = tracking_data[tracking_data.idContent != 0]
        data = data.iloc[:, 0:6]
        data = data[data.year >= start_year]
        data["date"] = data['year'].astype(str) + "-" + data['month'].astype(str) + "-" + data["day"].astype(str)
        if end_date != None:
            data = data[data.date <= end_date]
        data["fullId"] = self.generate_fullID_vectorized(data["contentSubType"].values, data["idContent"].values)

        self.data_tracking = data
        data.to_pickle(f"../data_tracking.pkl", compression="zip")
        return data
    
    def create_timesplit(self, data, date):
        self.training_set = data[data.date < date]
        self.test_set = data[data.date >= date]
        
        self.training_set.to_pickle(f"../training_set.pkl", compression="zip")
        self.test_set.to_pickle(f"../test_set.pkl", compression="zip")
        return self.training_set, self.test_set
    
    def create_matrices(self, training_set, test_set):
        # training_set
        frame = training_set[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()

        user = list(np.sort(frame.idUser.unique()))
        content = list(np.sort(frame.fullId.unique()))
        views = list(frame.views)
        rows = frame.idUser.astype('category').cat.codes 
        cols = frame.fullId.astype('category').cat.codes 
        
        self.sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))

        matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        matrix_coo = sparse.coo_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        sparse.save_npz(f"../ratings_matrix_csr", matrix_csr, compressed=True)
        sparse.save_npz(f"../ratings_matrix_coo", matrix_coo, compressed=True)

        training_set = frame
        self.training_set.to_pickle(f"../training_set_aggregated.pkl", compression="zip")
        
        self.matrix_csr = matrix_csr
        self.matrix_coo = matrix_coo
        
        self.user = user
        self.content = content
        self.views = views
        self.rows = rows
        self.cols = cols
        
        np.save("../user_ids", user)
        np.save("../content_ids", content)

        # test_set
        """frame = test_set.sort_values(by='date')
        frame = frame[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()


        test_set = frame
        #test_set.to_excel(f"../test_set.xlsx")
        test_set.to_pickle(f"../test_set")"""

### Pre Processing run:

In [19]:
Data = Pre_Processer()

In [20]:
user_data, data_tracking = Data.read_data_full()

In [21]:
data_tracking = Data.clean_tracking(data_tracking, start_year=2017, end_date = "2019-7-31")

In [None]:
training_set, test_set = Data.create_timesplit(data_tracking, "2019-7-1")
Data.create_matrices(training_set, test_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### Load Pickles:

In [10]:
training_set, test_set = Data.read_pickles()

### Analysis:

In [None]:
data_tracking.describe()

In [None]:
data_tracking.tail()

In [None]:
data_tracking[data_tracking.idUser == 5731].head()

In [None]:
int(len(data_tracking.idUser))

In [16]:
Data.sparsity_ofdata

0.9921601970895453

In [17]:
1 - Data.sparsity_ofdata

0.007839802910454718

In [None]:
"10 Mio. Zeilen"
voller_datensatz = 10019879
"1.54 Mio. Zeilen"
n_test_april = 1543999

In [None]:
n_test_april / voller_datensatz

In [13]:
len(test_set.day) / len(training_set.day) 

0.12358672501015389

In [19]:
Data.matrix_csr.toarray()[indexUser][indexContent]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
df = training_set[["idUser", "fullId"]]
df["views"] = np.ones([len(df["fullId"]),1])
df = df.groupby(by=["idUser", "fullId"]).sum().reset_index()

In [None]:
user = np.sort(df.idUser.unique())
content = np.sort(df.fullId.unique())
views = list(df.views)
rows = df.idUser.astype('category', categories = user).cat.codes 
cols = df.fullId.astype('category', categories = content).cat.codes 

In [161]:
indxUser = np.where(user==699743)[0][0]
indxContent = np.where(content=="4_9817")[0][0]

In [None]:
matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
matrix_coo = sparse.coo_matrix((views, (rows, cols)), shape=(len(user), len(content)))

In [162]:
matrix_csr.toarray()[indxUser][indxContent]

6.0

In [166]:
matrix_csr.toarray().shape

(851, 436710)