In [2]:
import pandas as pd
import numpy as np
import os
import heapq
import scipy.sparse as sparse

In [8]:
class Pre_Processer:
    """Turn the raw tracking export into user x content view-count data.

    The steps are meant to be run in order. Each ``create_*`` step writes its
    result to a file whose name contains ``size_name``, so a later session can
    resume from the matching ``read_*`` method instead of recomputing:

    1. ``read_data`` + ``pre_processing``: load ``Data.xlsx`` and tidy the
       tracking sheet.
    2. ``create_ratings_data``: count views per user/content pair and write
       ``df_user_data_<size_name>.csv``.
    3. ``create_ratings_dataframe``: arrange those counts as a user x content
       frame and write ``df_ratings_matrix_<size_name>.csv``.
    4. ``create_ratings_matrix`` / ``create_sparse_ratings_matrix``: export the
       frame's values as a dense ``.npy`` or a sparse ``.npz`` file.
    """

    # Labels used in the cache file names, keyed by the exponent that is
    # passed to set_ratings_size().
    _SIZE_NAMES = {
        1: "extra_small",
        2: "small",
        3: "medium",
        4: "large",
        5: "extra_large",
    }

    # Columns of the raw tracking sheet that pre_processing() removes.
    _DROPPED_TRACKING_COLUMNS = [" accessType;;;;;"] + [
        f"Unnamed: {number}" for number in range(9, 34)
    ]

    # Column names assigned to the tracking sheet once the columns above are gone.
    _TRACKING_COLUMNS = [
        "idUser", "day", "month", "year",
        "idContent", "contentSubType", "contentBranch", "contentName",
    ]

    def __init__(self):
        """Start with the "medium" size: 150 * 10**3 tracking rows."""
        self.size = 150*10**3
        self.size_name = "medium"

    def read_data(self):
        """Load the "Users" and "Tracking" sheets of ``Data.xlsx``."""
        self.data_users = pd.read_excel("Data.xlsx", sheet_name="Users")
        self.data_tracking = pd.read_excel("Data.xlsx", sheet_name="Tracking")

    def read_processed_data(self):
        """Load the per-user view counts written by ``create_ratings_data``."""
        self.user_data = pd.read_csv(f"df_user_data_{self.size_name}.csv")
        # The CSV was written with its index, which comes back as an unnamed column.
        self.user_data = self.user_data.drop("Unnamed: 0", axis=1)

    def read_df_ratings_matrix(self):
        """Load the ratings frame written by ``create_ratings_dataframe``.

        The user ids were stored as the (unnamed) CSV index, so that column is
        restored as the index and named ``UserID``.
        """
        self.matrix_asFrame = pd.read_csv(f"df_ratings_matrix_{self.size_name}.csv")
        self.matrix_asFrame = self.matrix_asFrame.set_index("Unnamed: 0")
        self.matrix_asFrame.index.name = "UserID"

    def pre_processing(self):
        """Drop the unused tracking columns and give the rest clean names.

        Requires ``read_data`` to have been called. Raises ``KeyError`` if one
        of the columns to drop is missing, e.g. when called twice on the same
        data.
        """
        print("You are starting with Pre-Processing step 1 here. Otherwise run read_processed_data()")

        self.data_tracking = self.data_tracking.drop(
            labels=self._DROPPED_TRACKING_COLUMNS, axis=1
        )
        self.data_tracking.columns = list(self._TRACKING_COLUMNS)

    def set_ratings_size(self, exponent):
        """Select how many tracking rows are processed: ``150 * 10**exponent``.

        Exponents 1 to 5 map to the names "extra_small" ... "extra_large";
        any other value is labelled "different_size". The name becomes part of
        every cache file name.
        """
        self.size = 150*10**exponent
        self.size_name = self._SIZE_NAMES.get(exponent, "different_size")

        print(f"Data Frame size set to: {self.size_name}.")

    def create_ratings_data(self):
        """Count how often each user viewed each content item.

        Reads at most ``self.size`` rows of ``self.data_tracking``. Rows are
        grouped by *consecutive* runs of the same ``idUser``; within a run each
        distinct ``idContent`` yields one output row (in order of first
        appearance) whose ``timesViewed`` is the number of occurrences.

        The result is written to ``df_user_data_<size_name>.csv`` with columns
        ``idUser``, ``idContent``, ``timesViewed`` (plus the index column).
        """
        row_limit = int(self.size)
        user_ids = self.data_tracking["idUser"].iloc[:row_limit].tolist()
        content_ids = self.data_tracking["idContent"].iloc[:row_limit].tolist()

        records = []       # (idUser, idContent, timesViewed) rows, in output order
        run_counts = {}    # content id -> views within the current user's run
        run_user = None

        for position, (user_id, content_id) in enumerate(zip(user_ids, content_ids)):
            # A change of user id closes the current run.
            if position != 0 and user_id != run_user:
                records.extend(
                    (run_user, seen_content, views)
                    for seen_content, views in run_counts.items()
                )
                run_counts = {}

            run_counts[content_id] = run_counts.get(content_id, 0) + 1
            run_user = user_id

        # The final run has no following user to trigger the flush above, so it
        # is written here; otherwise the last user's views would be lost.
        if run_counts:
            records.extend(
                (run_user, seen_content, views)
                for seen_content, views in run_counts.items()
            )

        # Build the frame once instead of concatenating inside the loop.
        user_data = pd.DataFrame(records, columns=["idUser", "idContent", "timesViewed"])

        # save the user data frame to save time
        user_data.to_csv(f"df_user_data_{self.size_name}.csv")

    def create_ratings_dataframe(self):
        """Arrange ``self.user_data`` as a user x content frame of view counts.

        Rows are the distinct ``idUser`` values and columns the distinct
        ``idContent`` values, both in order of first appearance. Pairs that
        never occur are 0. If a user/content pair occurs more than once, the
        last occurrence wins. Rows with a missing id are ignored.

        The frame is stored in ``self.matrix_asFrame`` and written to
        ``df_ratings_matrix_<size_name>.csv``.
        """
        print("You are starting with Pre-Processing step 2 here. Otherwise run pre_processing()")

        # Integer position of every row's user / content plus the distinct ids,
        # taken from this instance's own data.
        user_codes, user_ids = pd.factorize(self.user_data["idUser"].to_numpy())
        content_codes, content_ids = pd.factorize(self.user_data["idContent"].to_numpy())

        cells = pd.DataFrame({
            "row": user_codes,
            "col": content_codes,
            "views": self.user_data["timesViewed"].fillna(0).to_numpy(),
        })
        # factorize marks missing ids with -1, which would otherwise index the
        # last row/column of the matrix.
        cells = cells[(cells["row"] >= 0) & (cells["col"] >= 0)]
        # Make "last occurrence wins" explicit; NumPy does not define the
        # result of assigning twice to the same cell in one fancy assignment.
        cells = cells.drop_duplicates(subset=["row", "col"], keep="last")

        # Cells hold view counts (an earlier unary 0/1 variant was disabled).
        view_counts = np.zeros((len(user_ids), len(content_ids)), dtype=np.int64)
        view_counts[cells["row"].to_numpy(), cells["col"].to_numpy()] = cells["views"].to_numpy()

        self.matrix_asFrame = pd.DataFrame(view_counts, index=user_ids, columns=content_ids)

        # save the user data frame to save time
        self.matrix_asFrame.to_csv(f"df_ratings_matrix_{self.size_name}.csv")

    def create_ratings_matrix(self):
        """Store the ratings frame's values as a dense array.

        Sets ``self.ratings_matrix`` and saves it with ``np.save`` under
        ``ratings_matrix_<size_name>``.
        """
        print("You are starting with Pre-Processing step 3 here. Otherwise run pre_processing()")

        self.ratings_matrix = self.matrix_asFrame.values
        np.save(f"ratings_matrix_{self.size_name}", self.ratings_matrix)

    def create_sparse_ratings_matrix(self):
        """Store the ratings frame's values as a compressed CSR matrix.

        Sets ``self.sparse_matrix`` and saves it with ``sparse.save_npz`` under
        ``ratings_matrix_<size_name>``.
        """
        print("You are starting with Pre-Processing step 3 here. Otherwise run pre_processing()")

        self.sparse_matrix = sparse.csr_matrix(self.matrix_asFrame.values)
        sparse.save_npz(f"ratings_matrix_{self.size_name}", self.sparse_matrix, compressed=True)

In [9]:
# Shared pre-processor instance used by every cell below; it starts with the
# default "medium" size (150*10**3 tracking rows).
Data = Pre_Processer()

In [None]:
# Full pre-processing step 1, starting from the raw Excel file.
# set_ratings_size() is optional: if it is not called, the default of
# 150*10**3 = 150,000 rows of tracking data is processed. Exponent 2 selects
# the "small" size (150*10**2 = 15,000 rows), so the result is written to
# df_user_data_small.csv.
Data.read_data()
Data.pre_processing()
Data.set_ratings_size(2)
Data.create_ratings_data()

In [38]:
# If Pre-Processing Step 1 is already done: load the cached view counts for
# the "medium" size (df_user_data_medium.csv) and build the user x content
# frame, which is written to df_ratings_matrix_medium.csv.
Data.set_ratings_size(3)
Data.read_processed_data()
Data.create_ratings_dataframe()

Data Frame size set to: medium.
You are starting with Pre-Processing step 2 here. Otherwise run pre_processing()


In [11]:
# If Pre-Processing Step 1 & 2 are already done: load the cached ratings frame
# for the "small" size (df_ratings_matrix_small.csv) and export its values as a
# compressed sparse matrix. The dense export is left disabled.
Data.set_ratings_size(2)
Data.read_df_ratings_matrix()
#Data.create_ratings_matrix()
Data.create_sparse_ratings_matrix()

Data Frame size set to: small.
You are starting with Pre-Processing step 3 here. Otherwise run pre_processing()


In [43]:
# Preview the first 10 user rows of the currently loaded ratings frame.
Data.matrix_asFrame.head(10)

Unnamed: 0_level_0,175415,170458,182197,163274,169625,168536,163193,163191,175430,175419,...,17839,6348,16098,17877,7610,624170,295519,375209,860706,871026
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17208,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
20490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31451,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# The 10 content columns with the highest mean value across all user rows.
Data.matrix_asFrame.mean().nlargest(10)

0           0.262045
264810      0.024677
272014      0.021152
243         0.019976
200         0.017626
3979        0.011751
70000000    0.011751
270291      0.011751
271496      0.011751
276623      0.011751
dtype: float64

In [65]:
# Sanity check: the mean of column 264810 (0.024677 in the output above) times
# 851 gives that column's total (about 21).
# NOTE(review): 851 is presumably the number of user rows in the frame - confirm.
0.024677 * 851

21.000127

In [67]:
# Users with a positive entry for content 264810. The column is addressed by
# the string label "264810".
Data.matrix_asFrame["264810"][Data.matrix_asFrame["264810"] > 0]

UserID
323811     1
510587     1
580991     1
779764     1
993682     1
1037458    1
1062591    1
1066778    1
1244472    1
1307724    1
1328238    1
1490970    1
1571724    1
1614781    1
1668070    1
1771680    1
1855214    1
1890625    1
1943733    1
1980007    1
1992377    1
Name: 264810, dtype: int64