In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from typing import List
from datetime import datetime

In [2]:
def load_data(filepaths: List[str], import_date=False):
    '''Generate a list of all data entries from the given list of file paths
    '''
    data = dict(
        item_id=[],
        user_id=[],
        item_index=[],
        user_index=[],
        rating=[]
    )
    if import_date:
        data['date'] = []
    user_id_map = {}
    item_id_map = {}
    item_id = 0
    # max_user_id = 0
    # max_item_id = 0
    for fname in filepaths:
        with open(fname, 'r') as input_file:
            for line in input_file:
                if line[-2] == ':':
                    # Set item_id when we encouter a new section of the data
                    item_id = int(line[:-2])
                else:
                    # Line has a user rating in it, capture the values and store in array
                    [user_id, rating, date] = line.split(',')
                    user_id = int(user_id)
                    if import_date:
                        date = datetime.strptime(date.rstrip(), '%Y-%m-%d')

                    if (not item_id in item_id_map):
                        item_id_map[item_id] = len(item_id_map)
                    if (not user_id in user_id_map):
                        user_id_map[user_id] = len(user_id_map)

                    rating = float(rating)
                    
                    # if user_id > max_user_id:
                    #     max_user_id = user_id
                    # if item_id > max_item_id:
                    #     max_item_id = item_id
                    # data.append(DataPoint(item_id, user_id, rating))

                    data['item_id'].append(item_id)
                    data['user_id'].append(user_id)
                    data['item_index'].append(item_id_map[item_id])
                    data['user_index'].append(user_id_map[user_id])
                    data['rating'].append(rating)
                    if import_date:
                        data['date'].append(date)
    data = pd.DataFrame(data)
    return (dict(
        n_items=len(item_id_map),
        n_users=len(user_id_map),
        data=data
    ),
    user_id_map,
    item_id_map)

In [12]:
datafiles = ['./data/combined_data_1.txt', './data/combined_data_2.txt', './data/combined_data_3.txt', './data/combined_data_4.txt']
(dataset, user_id_map, item_id_map) = load_data(datafiles, import_date=True)
# (dataset, user_id_map, item_id_map) = load_data(['./data/small_combined_data_1.txt'], import_date=True)
print(dataset)

{'n_items': 17770, 'n_users': 480189, 'data':            item_id  user_id  item_index  user_index  rating       date
0                1  1488844           0           0     3.0 2005-09-06
1                1   822109           0           1     5.0 2005-05-13
2                1   885013           0           2     4.0 2005-10-19
3                1    30878           0           3     4.0 2005-12-26
4                1   823519           0           4     3.0 2004-05-03
...            ...      ...         ...         ...     ...        ...
100480502    17770  1790158       17769         542     4.0 2005-11-01
100480503    17770  1608708       17769       29273     3.0 2005-07-19
100480504    17770   234275       17769       29251     1.0 2004-08-07
100480505    17770   255278       17769       22793     4.0 2004-05-28
100480506    17770   453585       17769       76269     2.0 2005-03-10

[100480507 rows x 6 columns]}


In [None]:
px.bar((dataset['data']))

In [7]:
dataset['data']['rating'].size

100480507

In [9]:
np.max(dataset['data']['user_id'])

2649429

In [13]:
px.histogram(dataset['data'], x='date')

KeyboardInterrupt: 