In [1]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [19]:
class Pre_Processer:
    def __init__(self):
        self.contentType = {"Statistic": "0", "Statista-Dossier": "1", "external Report": "2", "Industry Report": "3"}
        
    def read_data_small(self):
        self.matrix_size = "500"
        self.data_users = pd.read_csv("../StatistaUsers500.csv")
        self.data_tracking = pd.read_csv("../StatistaTracking500.csv")
        
    def read_data_full(self):
        self.matrix_size = "10000"
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_csv("../StatistaTracking10000.csv")
        
    def read_data_tracking(self):
        self.matrix_size = "10000"
        self.data_users = pd.read_csv("../StatistaUsers10000.csv")
        self.data_tracking = pd.read_excel("../data_tracking.xlsx")
        
    def get_size(self, fractions):
        if fractions == 1:
            return "full"
        elif fractions == 100:
            return "large"
        elif fractions == 500:
            return "medium"
        elif fractions == 1000:
            return "small"
        elif fractions == 10000:
            return "extra_small"
        elif fractions == 25000:
            return "really_small"
        else:
            return "different_size"
        
    def clean_data(self):        
        data_tracking = self.data_tracking[self.data_tracking.idContent != 0]
        data_tracking["fullId"] = data_tracking.apply(lambda x: (self.contentType[x["contentSubType"]] if x["contentSubType"] in self.contentType else "4") + "_" + str(x["idContent"] ), axis=1)
        
        self.data_tracking = data_tracking
        self.data_tracking.to_excel(f"../data_tracking.xlsx")
        
    def get_sample(self, fractions, i_frame):        
        return np.array_split(self.data_tracking, fractions)[i_frame]
        
    def create_matrix(self, fractions = 1, i_frame = 0):
        if fractions != 1:
            if fractions < i_frame:
                i_frame = fractions
                
            self.data_tracking = self.get_sample(fractions, i_frame)
        
        frame = self.data_tracking[["idUser", "fullId"]]
        frame["views"] = np.ones([len(frame["fullId"]),1])
        frame = frame.groupby(by=["idUser", "fullId"]).sum().reset_index()
        
        user = list(np.sort(frame.idUser.unique()))
        content = list(np.sort(frame.fullId.unique()))
        views = list(frame.views)
        rows = frame.idUser.astype('category', categories = user).cat.codes 
        cols = frame.fullId.astype('category', categories = content).cat.codes 
        
        self.sparsity_ofdata = 1 - (len(views) / (len(content) * len(user)))
                
        self.matrix = sparse.csr_matrix((views, (rows, cols)), shape=(len(user), len(content)))
        size_name = self.get_size(fractions)
        sparse.save_npz(f"../ratings_matrix_{self.matrix_size}_{size_name}", self.matrix, compressed=True)
        
        self.matrix_asFrame = frame
        self.matrix_asFrame.to_excel(f"../data_{self.matrix_size}_{size_name}.xlsx")

In [20]:
Data = Pre_Processer()

In [None]:
Data.read_data_full()
# If fractions is less than 1, the data frame is going to get divided into pieces. Only the i_frame is going to get returned.
Data.clean_data()
Data.create_matrix(fractions=1, i_frame=47)

In [24]:
Data.matrix_asFrame.describe()

Unnamed: 0,idUser,views
count,502306.0,502306.0
mean,197294.130231,2.087522
std,92685.473566,3.63281
min,168.0,1.0
25%,145082.0,1.0
50%,225428.0,1.0
75%,257262.0,2.0
max,394179.0,736.0


In [25]:
Data.matrix_asFrame.tail()

Unnamed: 0,idUser,fullId,views
502301,394179,2_14881,1.0
502302,394179,2_3394,1.0
502303,394179,2_6519,1.0
502304,394179,2_9163,1.0
502305,394179,4_13110,2.0


In [26]:
Data.matrix_asFrame[Data.matrix_asFrame.idUser == 5731].head()

Unnamed: 0,idUser,fullId,views
13140,5731,0_1196,1.0
13141,5731,0_1236,1.0
13142,5731,0_12528,3.0
13143,5731,0_12555,2.0
13144,5731,0_12862,5.0


In [27]:
Data.matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
int(len(Data.data_tracking.idUser))

1048575

In [21]:
Data.read_data_tracking()
Data.create_matrix(fractions=1, i_frame=47)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
Data.sparsity_ofdata

0.9969177741551302

In [30]:
1 - Data.sparsity_ofdata

0.0030822258448698348