In [1]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [2]:
class Pre_Processer:
    """Prepare the Statista tracking export for recommender experiments.

    Typical flow: ``read_data_*`` -> ``clean_data`` -> ``create_timesplit`` ->
    ``create_matrix_timesplit`` (or ``create_matrix_sample`` for a row-wise
    chunk of the data). All input and output files live in the parent
    directory (``../``).
    """

    def __init__(self):
        # Prefix codes for the known content sub-types; every other sub-type
        # is encoded as "4" in clean_data().
        self.contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}

    def read_data_full(self):
        """Load the raw user and tracking CSV exports."""
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_csv("../StatistaTracking10000.csv")

    def read_data_tracking(self):
        """Load the user CSV and the tracking workbook written by clean_data()."""
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_excel("../data_tracking.xlsx")
        # to_excel() stores the index as an unnamed first column; drop it if
        # present instead of failing when the workbook was saved without one.
        self.data_tracking = self.data_tracking.drop(columns="Unnamed: 0", errors="ignore")

    def clean_data(self):
        """Drop rows with idContent == 0 and derive ``fullId`` and ``date``.

        ``fullId`` is ``"<type code>_<idContent>"`` where the type code comes
        from ``self.contentType`` ("4" for unknown sub-types). ``date`` is
        assembled from the ``year``/``month``/``day`` columns. The cleaned
        frame replaces ``self.data_tracking`` and is written to
        ``../data_tracking.xlsx``.
        """
        # Work on an explicit copy so the new columns are not written onto a
        # slice of the original frame (SettingWithCopyWarning).
        tracked = self.data_tracking[self.data_tracking.idContent != 0].copy()

        type_code = tracked["contentSubType"].map(self.contentType).fillna("4")
        tracked["fullId"] = type_code + "_" + tracked["idContent"].astype(str)

        # Build the timestamp from its components. Parsing a "day-month-year"
        # string is read month-first by pandas whenever day <= 12, which
        # silently swapped day and month for those rows.
        tracked["date"] = pd.to_datetime(tracked[["year", "month", "day"]])

        self.data_tracking = tracked
        self.data_tracking.to_excel("../data_tracking.xlsx")

    def get_sample(self, fractions, i_frame):
        """Return chunk ``i_frame`` of ``fractions`` consecutive row chunks.

        Chunk boundaries follow ``numpy.array_split``: the first
        ``len(data) % fractions`` chunks hold one extra row.

        Raises:
            ValueError: if ``fractions`` is not positive.
            IndexError: if ``i_frame`` is outside ``[-fractions, fractions)``.
        """
        if fractions <= 0:
            raise ValueError("number sections must be larger than 0.")
        if not -fractions <= i_frame < fractions:
            raise IndexError("list index out of range")
        if i_frame < 0:
            i_frame += fractions

        # Slice with iloc so the result is always a DataFrame, independent of
        # how numpy dispatches array_split on pandas objects.
        base, extra = divmod(len(self.data_tracking), fractions)
        start = i_frame * base + min(i_frame, extra)
        stop = start + base + (1 if i_frame < extra else 0)
        return self.data_tracking.iloc[start:stop]

    def get_size(self, fractions):
        """Return the sample label used in create_matrix_sample() file names.

        NOTE(review): the original code called ``self.get_size`` without
        defining it anywhere in this class; this naming scheme is a
        best-effort reconstruction - confirm against the intended file names.
        """
        return "full" if fractions == 1 else f"1of{fractions}"

    def create_timesplit(self, date):
        """Split ``data_tracking`` into rows before ``date`` and rows from ``date`` on."""
        cutoff = pd.to_datetime(date)
        self.training_set = self.data_tracking[self.data_tracking.date < cutoff]
        self.test_set = self.data_tracking[self.data_tracking.date >= cutoff]

    @staticmethod
    def _count_views(tracking):
        """Aggregate events into one row per (idUser, fullId) with a float ``views`` count."""
        keys = ["idUser", "fullId"]
        if "views" in tracking.columns:
            # Already aggregated (e.g. the method is run a second time):
            # sum the stored counts instead of counting every pair as 1.
            counts = tracking.groupby(keys)["views"].sum()
        else:
            counts = tracking.groupby(keys).size()
        return counts.astype("float64").reset_index(name="views")

    @staticmethod
    def _build_matrix(view_counts):
        """Return (sorted user ids, sorted content ids, users x content CSR matrix)."""
        user = list(np.sort(view_counts.idUser.unique()))
        content = list(np.sort(view_counts.fullId.unique()))
        # Series.astype('category', categories=...) no longer exists in
        # pandas; pd.Categorical yields the same position codes.
        rows = pd.Categorical(view_counts.idUser, categories=user).codes
        cols = pd.Categorical(view_counts.fullId, categories=content).codes
        views = view_counts.views.to_numpy()
        matrix = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        return user, content, matrix

    @staticmethod
    def _sparsity(n_filled, n_users, n_content):
        """Return the share of empty cells, or NaN for a matrix without cells."""
        n_cells = n_users * n_content
        if n_cells == 0:
            return float("nan")
        return 1 - (n_filled / n_cells)

    def create_matrix_sample(self, fractions = 1, i_frame = 0):
        """Build and save the user x content view matrix, optionally for one chunk.

        When ``fractions != 1`` ``self.data_tracking`` is REPLACED by chunk
        ``i_frame`` (clamped to the last chunk) before the matrix is built.
        Sets ``matrix``, ``matrix_asFrame``, ``matrix_size``, ``content_list``,
        ``user_list`` and ``sparsity_ofdata`` and writes an ``.npz`` and an
        ``.xlsx`` file to ``../``.
        """
        if fractions != 1:
            # Valid chunk indices are 0 .. fractions - 1; the previous clamp
            # (i_frame = fractions) was itself out of range.
            i_frame = min(i_frame, fractions - 1)
            self.data_tracking = self.get_sample(fractions, i_frame)

        frame = self._count_views(self.data_tracking)
        user, content, matrix = self._build_matrix(frame)

        self.sparsity_ofdata = self._sparsity(len(frame), len(user), len(content))

        self.matrix = matrix
        # NOTE(review): matrix_size was read but never assigned in the original
        # code; "<users>x<content>" is an assumption - confirm the wanted label.
        self.matrix_size = f"{len(user)}x{len(content)}"
        size_name = self.get_size(fractions)
        sparse.save_npz(f"../ratings_matrix_{self.matrix_size}_{size_name}", self.matrix, compressed=True)

        self.matrix_asFrame = frame
        self.matrix_asFrame.to_excel(f"../data_{self.matrix_size}_{size_name}.xlsx")

        self.content_list = content
        self.user_list = user

    def create_matrix_timesplit(self):
        """Build and save the training matrix and aggregate both split sets.

        Requires create_timesplit() to have been called. ``training_set`` and
        ``test_set`` are replaced by their aggregated (idUser, fullId, views)
        frames and written to ``../training_set.xlsx`` / ``../test_set.xlsx``.
        The training matrix is stored as ``matrix_csr`` / ``matrix_coo`` and
        saved to ``../ratings_matrix_csr.npz`` / ``../ratings_matrix_coo.npz``.
        """
        # training_set
        frame = self._count_views(self.training_set)
        user, content, matrix_csr = self._build_matrix(frame)

        self.sparsity_ofdata = self._sparsity(len(frame), len(user), len(content))

        self.matrix_csr = matrix_csr
        self.matrix_coo = matrix_csr.tocoo()
        sparse.save_npz("../ratings_matrix_csr", self.matrix_csr, compressed=True)
        sparse.save_npz("../ratings_matrix_coo", self.matrix_coo, compressed=True)

        self.training_set = frame
        self.training_set.to_excel("../training_set.xlsx")

        self.content_list = content
        self.user_list = user

        # test_set
        self.test_set = self._count_views(self.test_set)
        self.test_set.to_excel("../test_set.xlsx")

In [3]:
# Create the preprocessing helper; nothing is read from disk until one of the
# read_data_* methods is called.
Data = Pre_Processer()

In [None]:
# Load the users CSV and the tracking workbook (../data_tracking.xlsx).
Data.read_data_tracking()
# Drop rows with idContent == 0, derive fullId/date and rewrite the workbook.
Data.clean_data()
# Rows dated before 2019-01-01 become the training set, the rest the test set.
Data.create_timesplit("2019-01-01")
# Aggregate both sets to view counts and save the training matrix (csr + coo).
Data.create_matrix_timesplit()
# Alternative (disabled): build the matrix from a row-wise chunk instead. When
# fractions != 1 the frame is divided into `fractions` pieces and only piece
# i_frame is used.
# NOTE(review): the class defines create_matrix_sample(), not create_matrix().
#Data.create_matrix(fractions=1, i_frame=47)

In [None]:
# Summary statistics of the aggregated (idUser, fullId, views) frame.
# NOTE(review): matrix_asFrame is only assigned by create_matrix_sample(); the
# pipeline cell above runs create_matrix_timesplit(), so on a fresh kernel this
# attribute does not exist - confirm which frame is meant.
Data.matrix_asFrame.describe()

In [None]:
# Last rows of the aggregated view-count frame.
# NOTE(review): relies on matrix_asFrame, which only create_matrix_sample() sets.
Data.matrix_asFrame.tail()

In [None]:
# Spot check: aggregated view counts of the single user with idUser 5731.
# NOTE(review): relies on matrix_asFrame, which only create_matrix_sample() sets.
Data.matrix_asFrame[Data.matrix_asFrame.idUser == 5731].head()

In [None]:
# Dense view of the user x content matrix. toarray() materialises every cell,
# so memory grows with users * content items.
# NOTE(review): Data.matrix is only assigned by create_matrix_sample(); the
# time-split path stores matrix_csr / matrix_coo instead.
Data.matrix.toarray()

In [None]:
# Number of rows currently held in the tracking frame (len() already returns an int).
len(Data.data_tracking.idUser)

In [None]:
# Share of empty cells in the user x content matrix:
# 1 - (number of stored user/content pairs) / (users * content items).
Data.sparsity_ofdata

In [None]:
# Complement of the sparsity: share of user/content cells that hold a view count.
1 - Data.sparsity_ofdata

In [10]:
# Row counts noted by hand (bare string literals replaced by real comments).
# Full data set: 18.5 million rows.
voller_datensatz = 18498789
# 2.69 million rows (presumably the 2019 portion, judging by the name).
n_2019 = 2688034

In [11]:
# Ratio of the two hand-noted row counts (n_2019 relative to the full data set).
n_2019 / voller_datensatz

0.14530864696061996

In [4]:
# Load the raw CSV exports; this replaces Data.data_users and Data.data_tracking
# with the unprocessed frames.
Data.read_data_full()

In [82]:
# Display the working copy of the tracking frame.
# NOTE(review): data_tracking is first assigned in a cell further down (the
# execution counts 80/81 precede this cell's 82), so Restart & Run All raises
# NameError here - the cells need reordering.
data_tracking

Unnamed: 0,idUser,day,month,year,idContent,contentSubType,contentBranch,contentName,accessType
1204260,200256,31,12,2012,266616,Statistic,"459, 990","Leading financial adivsors to global M&A 2014,...",download
1204261,200256,31,12,2012,266616,Statistic,"459, 990","Leading financial adivsors to global M&A 2014,...",view
1204262,200256,31,12,2012,269845,Statistic,"459, 990","Largest banks worldwide 2014, by assets",view
1204263,200256,31,12,2012,269845,Statistic,"459, 990","Largest banks worldwide 2014, by assets",view
1204264,200256,31,12,2012,9635,external Report,941,U.S. Markets Construction Overview 2012,view
1204265,228643,31,12,2012,230154,Statistic,477,Sunday newspapers: Average number of readers o...,view
1204266,227684,31,12,2012,200248,Statistic,"485, 698",Forecast: tablet PC sales in the United States...,view
1204267,186287,31,12,2012,186590,Statistic,697,Telecommunication and interconnected VoIP reve...,view
1204268,186287,31,12,2012,218990,Statistic,1007,"Global M2M market size from 2009 to 2013, by t...",view
1204269,186287,31,12,2012,5552,external Report,481,Second Annual Report and Analysis of Competiti...,view


In [81]:
# Keep only the events whose year is before 2013.
# NOTE(review): this rebinds data_tracking from itself, so the cell is not
# idempotent and depends on the cell that first defines data_tracking.
data_tracking = data_tracking.loc[data_tracking["year"] < 2013]

In [80]:
# Drop rows whose idContent is 0. The explicit copy makes data_tracking an
# independent frame, so the columns added in later cells are not written onto
# a slice of Data.data_tracking (SettingWithCopyWarning).
data_tracking = Data.data_tracking.loc[Data.data_tracking["idContent"] != 0].copy()

In [83]:
# Prefix codes for the known content sub-types; anything else is encoded as "4".
contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}

# Vectorised replacement for the former row-wise apply() calls.
# - fullId is "<type code>_<idContent>".
# - date is assembled from the year/month/day columns. Parsing a
#   "day-month-year" string is read month-first by pandas whenever day <= 12,
#   which silently swapped day and month for those rows.
# assign() returns a new frame, so nothing is written onto a filtered slice.
data_tracking = data_tracking.assign(
    fullId=data_tracking["contentSubType"].map(contentType).fillna("4")
    + "_"
    + data_tracking["idContent"].astype(str),
    date=pd.to_datetime(data_tracking[["year", "month", "day"]]),
)

In [46]:
# Persist the cleaned tracking frame next to the raw exports.
# NOTE(review): an .xlsx sheet holds at most 1,048,576 rows - confirm the frame
# fits, or switch to a different file format.
data_tracking.to_excel("../data_tracking.xlsx")

In [84]:
# Chronological split: events before 2019-01-01 train the model, events from
# that day onwards are held out for testing.
training_set = data_tracking.loc[data_tracking["date"] < "2019-01-01"]
test_set = data_tracking.loc[data_tracking["date"] >= "2019-01-01"]

In [86]:
# NOTE: this cell rebinds training_set / test_set to their aggregated frames,
# so it must be preceded by the split cell each time it is run.

# training_set: one row per (user, content) pair with the number of events.
# groupby().size() counts the rows directly, so no helper column has to be
# written onto a slice; the count is kept as float64 like before.
frame = (
    training_set[["idUser", "fullId"]]
    .groupby(["idUser", "fullId"])
    .size()
    .astype("float64")
    .reset_index(name="views")
)

user = list(np.sort(frame.idUser.unique()))
content = list(np.sort(frame.fullId.unique()))
views = list(frame.views)
# Series.astype('category', categories=...) no longer exists in pandas;
# pd.Categorical yields the same position codes for the sorted id lists.
rows = pd.Categorical(frame.idUser, categories=user).codes
cols = pd.Categorical(frame.fullId, categories=content).codes

# Share of empty cells in the user x content matrix.
sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))

matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
matrix_coo = matrix_csr.tocoo()
sparse.save_npz("../ratings_matrix_csr", matrix_csr, compressed=True)
sparse.save_npz("../ratings_matrix_coo", matrix_coo, compressed=True)

training_set = frame
training_set.to_excel("../training_set.xlsx")

# test_set: same aggregation. The former sort by date was dropped because
# groupby() orders the result by its keys anyway.
frame = (
    test_set[["idUser", "fullId"]]
    .groupby(["idUser", "fullId"])
    .size()
    .astype("float64")
    .reset_index(name="views")
)

test_set = frame
test_set.to_excel("../test_set.xlsx")

In [87]:
# Store the sorted id lists; position i corresponds to row i (users) and
# column i (content) of the saved rating matrices.
np.save("../user_ids", np.asarray(user))
np.save("../content_ids", np.asarray(content))