## Criando módulos em Python para load dos dados

O objetivo aqui é testarmos as principais diferenças entre matrizes esparsas e matrizes densas.

In [1]:
%%writefile data.py
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def get_data(path):
  """Load a CSV file into a DataFrame.

  Args:
    path: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

  Returns:
    The table parsed with ``pd.read_csv`` defaults.
  """
  frame = pd.read_csv(path)
  return frame


def start_pipeline(dataf):
  """Return a copy of ``dataf`` to use as the first step of a ``.pipe`` chain.

  Working on the copy means later steps cannot modify the caller's object.
  """
  snapshot = dataf.copy()
  return snapshot


def get_book_counts(dataf):
  """Count the distinct users per book, most-rated books first.

  Args:
    dataf: DataFrame with at least the columns ``ISBN`` and ``User-ID``.

  Returns:
    DataFrame indexed by ``ISBN`` with a single ``User-ID_count`` column
    holding the number of unique ``User-ID`` values for that ISBN, sorted
    by that count in descending order.
  """
  # One row per ISBN; as_index=False keeps ISBN as a regular column for now.
  users_per_book = dataf.groupby(by='ISBN', as_index=False).agg(
    {'User-ID': pd.Series.nunique}
  )
  users_per_book = users_per_book.rename(columns={'User-ID': 'User-ID_count'})
  by_isbn = users_per_book.set_index('ISBN')
  return by_isbn.sort_values('User-ID_count', ascending=False)


def get_n_top_values(dataf, n):
  """Return the index labels of the first ``n`` rows of ``dataf`` as a list.

  The rows are taken in their current order, so the caller is expected to
  have sorted ``dataf`` beforehand.
  """
  leading_labels = dataf.index[:n]
  return leading_labels.tolist()


def format_sample(dataf):
  """Project the ratings columns onto generic user/book/action names.

  Args:
    dataf: DataFrame containing ``User-ID``, ``ISBN`` and ``Book-Rating``.

  Returns:
    A new DataFrame with exactly the columns ``user``, ``book`` and
    ``action`` (in that order), taken from the three source columns.
  """
  # target column -> source column, in output order
  renamed_from = {
      "user": "User-ID",
      "book": "ISBN",
      "action": "Book-Rating",
  }
  return pd.DataFrame({target: dataf[source] for target, source in renamed_from.items()})


def get_top_books_ids(dataf, sample_size=1000):
  """Return the ISBNs of the ``sample_size`` books rated by the most users.

  Args:
    dataf: Ratings DataFrame with ``ISBN`` and ``User-ID`` columns.
    sample_size: How many ISBNs to keep from the top of the ranking.

  Returns:
    List of ISBN labels, ordered from most to fewest distinct users.
  """
  working_copy = start_pipeline(dataf)
  ranking = get_book_counts(working_copy)
  return get_n_top_values(ranking, sample_size)


def get_sampled_pivot_table(data, sample_size=1000):
  """Build a dense user x book rating matrix restricted to the top books.

  Args:
    data: Ratings DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``.
    sample_size: Number of top books (by distinct users) to keep as columns.
      Defaults to 1000, the cut-off that was previously fixed.

  Returns:
    DataFrame indexed by ``user`` with one column per kept ``book``; each
    cell holds the ``action`` value for that pair, combined with
    ``pivot_table``'s default aggregation (mean) when a pair occurs more
    than once, and 0 where the pair does not occur.
  """
  top_book_ids = get_top_books_ids(data, sample_size)
  top_book_rows = data[data["ISBN"].isin(top_book_ids)]
  # NOTE(review): missing pairs are filled with 0; if the rating column can
  # itself be 0, "unrated" and "rated 0" are indistinguishable afterwards.
  book_sampled_relationship_data = (
    top_book_rows
    .pipe(format_sample)
    .pivot_table(index="user", columns="book", values="action")
    .fillna(0)
  )
  return book_sampled_relationship_data


Writing data.py


## Download dos dados

In [2]:
%env KAGGLE_USERNAME=ricoms
%env KAGGLE_KEY=8bd3a4b719368399d9965b8cdf83f8d9

!kaggle datasets download -d arashnic/book-recommendation-dataset --unzip -p /content/book-recommendation-dataset
!ls /content/book-recommendation-dataset

env: KAGGLE_USERNAME=ricoms
env: KAGGLE_KEY=8bd3a4b719368399d9965b8cdf83f8d9
401 - Unauthorized


'ls' não é reconhecido como um comando interno
ou externo, um programa operável ou um arquivo em lotes.


## Testando matrizes esparsas

In [3]:
from pathlib import Path

from data import get_data

# Directory that holds the unpacked dataset files.
# NOTE(review): "/content" looks like a Colab-style absolute path - confirm
# it matches the environment this notebook is run in.
DATA_PATH = Path("/content/book-recommendation-dataset")
ratings_df_path = DATA_PATH.joinpath('Ratings.csv')

# Raw ratings table; the cells below reuse this frame.
data = get_data(ratings_df_path)


def print_memory_usage_of_data_frame(df, bytes_to_mb_div=0.000001):
    """Print the total memory footprint of ``df``, rounded to 3 decimals.

    Args:
        df: Object exposing ``memory_usage()`` (e.g. a pandas DataFrame); the
            per-column figures it returns with default arguments are summed.
        bytes_to_mb_div: Factor the byte total is *multiplied* by (despite
            the name) to convert it to megabytes.

    Returns:
        None. The result is written to stdout as ``Memory usage is <n> MB``.
    """
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * bytes_to_mb_div, 3)
    print("Memory usage is", megabytes, "MB")

# Print the memory footprint of the raw ratings frame loaded above.
print_memory_usage_of_data_frame(data)


FileNotFoundError: [Errno 2] No such file or directory: '\\content\\book-recommendation-dataset\\Ratings.csv'

In [None]:
%%time

from data import get_sampled_pivot_table

# Dense user x book matrix restricted to the books rated by the most users.
data_one_ho_non_sparse = get_sampled_pivot_table(data)

# Preview, shape and memory footprint of the dense representation.
display(data_one_ho_non_sparse.head(), data_one_ho_non_sparse.shape)
print_memory_usage_of_data_frame(data_one_ho_non_sparse)

book,000649840X,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,0060188731,006019491X,0060199652,0060391626,0060392452,0060502258,0060512822,0060740450,0060915544,0060916508,0060921145,0060922532,0060926317,0060928336,0060929871,006092988X,0060930535,0060932759,0060934417,0060938455,0060958022,0060959037,0060964049,0060976845,0060977493,0060987103,0060987529,0060987561,006099486X,0061000043,0061000175,0061009059,006101351X,...,081297106X,0842329129,0842329218,0842329242,0842329250,0842329269,0842342702,0871136791,0887307876,0894805770,089480829X,0971880107,140003065X,1400031346,1400031354,1400031362,1400032717,1400034779,155166674X,1551668998,1558531025,155874262X,1558743669,1558744150,1558744630,1558745157,1558745718,1559029838,1565122968,1573221937,1573225517,1573225789,1573227331,1573229326,1573229571,1573229725,1576737330,1592400876,1878424319,8873122933
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(35619, 1000)

Memory usage is 285.237 MB
CPU times: user 34.1 s, sys: 3.75 s, total: 37.8 s
Wall time: 31.5 s


In [None]:
%%time

import pandas as pd

# One-hot encode every user and every ISBN, stored as sparse columns.
data_one_hot = pd.get_dummies(
    data,
    columns=['User-ID', 'ISBN'],
    sparse=True,
)

# Preview, shape and memory footprint of the sparse representation.
display(data_one_hot.head(), data_one_hot.shape)
print_memory_usage_of_data_frame(data_one_hot)

Unnamed: 0,Book-Rating,User-ID_2,User-ID_7,User-ID_8,User-ID_9,User-ID_10,User-ID_12,User-ID_14,User-ID_16,User-ID_17,User-ID_19,User-ID_20,User-ID_22,User-ID_23,User-ID_26,User-ID_32,User-ID_36,User-ID_38,User-ID_39,User-ID_42,User-ID_44,User-ID_51,User-ID_53,User-ID_56,User-ID_64,User-ID_67,User-ID_68,User-ID_69,User-ID_70,User-ID_73,User-ID_75,User-ID_77,User-ID_78,User-ID_79,User-ID_81,User-ID_82,User-ID_83,User-ID_85,User-ID_86,User-ID_87,...,ISBN_THECATASTROPH,ISBN_THEFLYINGACE,ISBN_TINACRUZ1001,ISBN_TM674623,ISBN_ULLSTEINBUCHN,ISBN_UNGRANDHOMMED,ISBN_V16382000,ISBN_VENAFRO001,ISBN_VG3862004,ISBN_WEAREWITNESSE,ISBN_X000000000,ISBN_X113780760,ISBN_X439361760,ISBN_X903145730,ISBN_XXXXXXXXXX,ISBN_XXXXXXXXXXXXX,ISBN_Y99697115,"ISBN_YOUTELLEM,AND",ISBN_Z380703475,ISBN_ZR902CX0093,ISBN_ZR903CX0003,"ISBN_\0094749809""","ISBN_\0210000010""","ISBN_\0432534220""","ISBN_\0432534220\""""","ISBN_\0679751521""","ISBN_\2842053052\""""","ISBN_\8804501367\""""","ISBN_\8804520159\""""","ISBN_\8887517452\""""","ISBN_\8888809228\""""","ISBN_\9170010242\""""",ISBN_`3502103682,ISBN_b00005wz75,ISBN_cn108465,ISBN_cn113107,ISBN_ooo7156103,ISBN_§423350229,ISBN_´3499128624,ISBN_Ô½crosoft
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


(1149780, 445840)

Memory usage is 20.696 MB
CPU times: user 5min 54s, sys: 11.3 s, total: 6min 6s
Wall time: 6min 2s


In [None]:
import pandas as pd 