In [1]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [2]:
class Pre_Processer:
    def __init__(self):
        self.contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}
        
    def read_data_full(self):
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_csv("../StatistaTracking10000.csv")
        
    def read_data_tracking(self):
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_excel("../data_tracking.xlsx")
        self.data_tracking = self.data_tracking.drop("Unnamed: 0", axis=1)
        
    def clean_data(self):        
        data_tracking = self.data_tracking[self.data_tracking.idContent != 0]
        data_tracking["fullId"] = data_tracking.apply(lambda x: (self.contentType[x["contentSubType"]] if x["contentSubType"] in self.contentType else "4") + "_" + str(x["idContent"] ), axis=1)
        data_tracking["date"] = data_tracking.apply(lambda x: pd.to_datetime(f"{x.day}-{x.month}-{x.year}"), axis=1)
        
        self.data_tracking = data_tracking
        self.data_tracking.to_excel(f"../data_tracking.xlsx")
        
    def get_sample(self, fractions, i_frame):        
        return np.array_split(self.data_tracking, fractions)[i_frame]
    
    def create_timesplit(self, date):
        self.training_set = self.data_tracking[self.data_tracking.date < date]
        self.test_set = self.data_tracking[self.data_tracking.date >= date]
        
    def create_matrix_sample(self, fractions = 1, i_frame = 0):
        if fractions != 1:
            if fractions < i_frame:
                i_frame = fractions

            self.data_tracking = self.get_sample(fractions, i_frame)
        
        frame = self.data_tracking[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()
        
        user = list(np.sort(frame.idUser.unique()))
        content = list(np.sort(frame.fullId.unique()))
        views = list(frame.views)
        rows = frame.idUser.astype('category', categories = user).cat.codes 
        cols = frame.fullId.astype('category', categories = content).cat.codes 
        
        self.sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))
                
        self.matrix = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        size_name = self.get_size(fractions)
        sparse.save_npz(f"../ratings_matrix_{self.matrix_size}_{size_name}", self.matrix, compressed=True)
        
        self.matrix_asFrame = frame
        self.matrix_asFrame.to_excel(f"../data_{self.matrix_size}_{size_name}.xlsx")
        
        self.content_list = content
        
    def create_matrix_timesplit(self):
        training_set = self.training_set
        test_set = self.test_set
        
        # training_set
        frame = self.training_set[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()
        
        user = list(np.sort(frame.idUser.unique()))
        content = list(np.sort(frame.fullId.unique()))
        views = list(frame.views)
        rows = frame.idUser.astype('category', categories = user).cat.codes 
        cols = frame.fullId.astype('category', categories = content).cat.codes 
        
        self.sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))
                
        self.matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        self.matrix_coo = sparse.coo_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        sparse.save_npz(f"../ratings_matrix_csr", self.matrix_csr, compressed=True)
        sparse.save_npz(f"../ratings_matrix_coo", self.matrix_coo, compressed=True)
        
        self.training_set = frame
        self.training_set.to_excel(f"../training_set.xlsx")
        
        self.content_list = content
        
        # test_set
        frame = self.test_set[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()
        
        self.test_set = frame
        self.test_set.to_excel(f"../test_set.xlsx")

In [3]:
Data = Pre_Processer()

In [None]:
Data.read_data_tracking()
# If fractions is less than 1, the data frame is going to get divided into pieces. Only the i_frame is going to get returned.
Data.clean_data()
Data.create_timesplit("2019-01-01")
Data.create_matrix_timesplit()
#Data.create_matrix(fractions=1, i_frame=47)

In [None]:
Data.matrix_asFrame.describe()

In [None]:
Data.matrix_asFrame.tail()

In [None]:
Data.matrix_asFrame[Data.matrix_asFrame.idUser == 5731].head()

In [None]:
Data.matrix.toarray()

In [None]:
int(len(Data.data_tracking.idUser))

In [None]:
Data.sparsity_ofdata

In [None]:
1 - Data.sparsity_ofdata

In [10]:
"18.5 Mio. Zeilen"
voller_datensatz = 18498789
"2.69 Mio. Zeilen"
n_2019 = 2688034

In [11]:
n_2019 / voller_datensatz

0.14530864696061996

In [4]:
Data.read_data_full()

In [45]:
data_tracking

Unnamed: 0,idUser,day,month,year,idContent,contentSubType,contentBranch,contentName,accessType,fullId,date
1919454,8489,31,12,2010,811,Statistic,104,In welchen Situationen empfinden Sie Stress?,download,0_811,2010-12-31
1919455,8489,31,12,2010,811,Statistic,104,In welchen Situationen empfinden Sie Stress?,view,0_811,2010-12-31
1919456,8489,31,12,2010,816,Statistic,76,Wie versuchen Sie Stress abzubauen?,view,0_816,2010-12-31
1919457,8489,31,12,2010,6803,Statistic,798,Was löst bei Ihnen am häufigsten Stress aus?,download,0_6803,2010-12-31
1919458,8489,31,12,2010,6803,Statistic,798,Was löst bei Ihnen am häufigsten Stress aus?,download,0_6803,2010-12-31
1919459,8489,31,12,2010,6803,Statistic,798,Was löst bei Ihnen am häufigsten Stress aus?,download,0_6803,2010-12-31
1919460,8489,31,12,2010,6803,Statistic,798,Was löst bei Ihnen am häufigsten Stress aus?,view,0_6803,2010-12-31
1919461,8489,31,12,2010,662,Statistic,798,Was löst bei Ihnen Stress aus?,download,0_662,2010-12-31
1919462,8489,31,12,2010,662,Statistic,798,Was löst bei Ihnen Stress aus?,view,0_662,2010-12-31
1919463,8489,31,12,2010,6806,Statistic,798,Was sind Ihre bevorzugten Mittel gegen Stress?,download,0_6806,2010-12-31


In [40]:
data_tracking = data_tracking[data_tracking.year < 2011]

In [39]:
data_tracking = Data.data_tracking[Data.data_tracking.idContent != 0]

In [44]:
contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}
data_tracking["fullId"] = data_tracking.apply(lambda x: (contentType[x["contentSubType"]] if x["contentSubType"] in contentType else "4") + "_" + str(x["idContent"] ), axis=1)
data_tracking["date"] = data_tracking.apply(lambda x: pd.to_datetime(f"{x.day}-{x.month}-{x.year}"), axis=1)

In [46]:
data_tracking.to_excel(f"../data_tracking.xlsx")

In [47]:
training_set = data_tracking[data_tracking.date < "2010-06-01"]
test_set = data_tracking[data_tracking.date >= "2010-06-01"]

In [49]:
# training_set
frame = training_set[["idUser", "fullId"]]
frame["views"] = np.ones([len(frame["fullId"]),1])
frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()

user = list(np.sort(frame.idUser.unique()))
content = list(np.sort(frame.fullId.unique()))
views = list(frame.views)
rows = frame.idUser.astype('category', categories = user).cat.codes 
cols = frame.fullId.astype('category', categories = content).cat.codes 

sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))

matrix_csr = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
matrix_coo = sparse.coo_matrix((views, (rows, cols)), shape=(len(user), len(content)))
sparse.save_npz(f"../ratings_matrix_csr", matrix_csr, compressed=True)
sparse.save_npz(f"../ratings_matrix_coo", matrix_coo, compressed=True)

training_set = frame
training_set.to_excel(f"../training_set.xlsx")

# test_set
frame = test_set.sort_values(by='date')
frame = frame[["idUser", "fullId"]]
frame["views"] = np.ones([len(frame["fullId"]),1])
frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()


test_set = frame
test_set.to_excel(f"../test_set.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
content[10]

'4_1026'

In [58]:
np.save("../content_ids", content)

In [60]:
content[0]

'0_1004'

In [61]:
test_set

Unnamed: 0,idUser,fullId,views
0,130,0_1621,1.0
1,130,0_3209,1.0
2,130,0_5069,1.0
3,164,0_1183,1.0
4,164,0_163502,2.0
5,164,0_2631,1.0
6,164,0_73191,1.0
7,164,0_73201,1.0
8,185,0_4972,1.0
9,185,0_71907,1.0
