In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import duckdb
import polars as pl

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Input layout: the parsed parquet files live under `parsed_files`, with
# `reviews` and `meta` sub-folders.
source_fol = Path('parsed_files')
reviews_fol = source_fol / 'reviews'
meta_fol = source_fol / 'meta'

In [10]:
# Collect every parquet file below the source folder, then separate the
# files whose name starts with "meta" from the remaining (review) files.
all_files = set(source_fol.rglob('*.parquet'))
meta_files = set(source_fol.rglob('meta*.parquet'))
review_files = all_files - meta_files

In [5]:
# Open the DuckDB database file that every table below is written to.
db_file = 'e_com_samples.duckdb'
duck = duckdb.connect(database=db_file)

## Создаём таблицу продуктов

In [11]:
# Rotate the metadata file list by one position: the first file (in the
# set's iteration order) is moved to the end of the list.
# NOTE(review): set iteration order is arbitrary, so which file gets moved
# is not fixed by this code - confirm whether a specific file is intended.
meta_file_list = list(meta_files)
mfiles = meta_file_list[1:] + [meta_file_list[0]]
mfiles

[WindowsPath('parsed_files/meta/meta_Grocery_and_Gourmet_Food.parquet'),
 WindowsPath('parsed_files/meta/meta_Magazine_Subscriptions.parquet'),
 WindowsPath('parsed_files/meta/meta_Industrial_and_Scientific.parquet'),
 WindowsPath('parsed_files/meta/meta_Musical_Instruments.parquet'),
 WindowsPath('parsed_files/meta/meta_Handmade_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_art.parquet'),
 WindowsPath('parsed_files/meta/meta_Amazon_Fashion.parquet'),
 WindowsPath('parsed_files/meta/meta_Software.parquet'),
 WindowsPath('parsed_files/meta/meta_Baby_Products.parquet')]

## Создаём выборку продуктов

In [19]:
# Draw a 20% sample of products from every main_category.
# `products_sample` is referenced by name inside later SQL statements.
# The seed is fixed so that a kernel restart selects the same products;
# without it every run produced a different sample and different tables.
# NOTE(review): repeatability also requires the scan to return rows in the
# same order; `mfiles` is derived from a set, so confirm its order is stable.
SAMPLE_SEED = 42
products_sample = (
    duck.query(f"""
        select
            main_category
            , title
            , parent_asin
        from parquet_scan({[str(x) for x in mfiles]}, hive_partitioning=True)
    """).to_df()
    .groupby(['main_category'], as_index=False)
    .sample(frac=.2, random_state=SAMPLE_SEED)
)

In [21]:
# Show every sampled row whose parent_asin occurs more than once
# (an empty frame means the sample contains no duplicated products).
products_sample[products_sample['parent_asin'].duplicated(keep=False)]

Unnamed: 0,main_category,title,parent_asin


In [20]:
# Display the sampled products (main_category, title, parent_asin).
products_sample

Unnamed: 0,main_category,title,parent_asin
2568457,AMAZON FASHION,HENGAO Men's Looks Dirty Straight Slim Fit Mot...,B07DMJ6G9K
2402423,AMAZON FASHION,6 Pairs Leather Teardrop Earrings Lightweight ...,B07RSY81H1
2922494,AMAZON FASHION,ZEFFFKA 6-Pack Women’s Comfort Casual Low-Cut ...,B0886H2RK8
2304458,AMAZON FASHION,Statement Drop Earrings Raffia Handmade Earrin...,B07R2BHRJX
2599374,AMAZON FASHION,DRESSTELLS Women's Vintage Rockabilly Net Pett...,B015MZVAMY
...,...,...,...
1352492,Video Games,THRONMAX Fireball Cardioid USB Microphone|Podc...,B08J8MFZK1
1344185,Video Games,LIPOVOLT® SET OF Gold Sealed Humbucker Pickup ...,B01NBROYZM
489857,Video Games,Eyewitness DVD: Human Machine,0756645042
2190443,Video Games,Mobile Suit Gundam accessories set for PlaySta...,B018HORBLG


### Создаём таблицу items

In [22]:
# Create the `items` table: surrogate item_id (primary key) plus the
# product's parent_asin, title and image.
# NOTE(review): there is no `if not exists` here (unlike the items_info,
# users and reviews DDL), so re-running against an existing database file
# is expected to fail - confirm this is intended.
duck.query("""
    create table items(
        item_id bigint
        , parent_asin varchar
        , title varchar
        , image varchar
        , primary key (item_id)
    )
""")

In [23]:
# Fill `items` from the metadata files: every row whose parent_asin is in
# products_sample receives the next value of the sequence seq_1 as item_id.
# The `images` column of the source is stored as `image`.
duck.query(f"""
    create sequence if not exists seq_1;
    insert into items
    with
    meta as (
        select
           parent_asin
           , title
           , images
        from parquet_scan({list(map(str, mfiles))}, hive_partitioning=True)
        where
            parent_asin in (select parent_asin from products_sample)
    )
    select
        nextval('seq_1') as item_id
        , parent_asin
        , m.title
        , m.images as image
    from meta m
""")

In [24]:
# Preview the contents of the `items` table.
duck.query("""
    select * from items
""")

┌─────────┬─────────────┬────────────────────────────────────────┬─────────────────────────────────────────────────────┐
│ item_id │ parent_asin │                 title                  │                        image                        │
│  int64  │   varchar   │                varchar                 │                       varchar                       │
├─────────┼─────────────┼────────────────────────────────────────┼─────────────────────────────────────────────────────┤
│     679 │ B081734SNF  │ Epstein Didn't Kill Himself Holiday …  │ https://m.media-amazon.com/images/I/61EexBYKY2L.jpg │
│     680 │ B0B1ZJTL4B  │ Meissa Sterling Silver Ring for Wome…  │ https://m.media-amazon.com/images/I/41RLU-8-pFL.jpg │
│     681 │ B09BLKBW26  │ Zoe Pendant Necklace Mom Gift Person…  │ https://m.media-amazon.com/images/I/31wqWE-Gv1L.jpg │
│     682 │ B07DFR8LBJ  │ Football Wreath - Sports Wreaths for…  │ https://m.media-amazon.com/images/I/51VlezKxNfL.jpg │
│     683 │ B07MFW4517  │ Patcho

In [25]:
# Sanity check: total vs. distinct parent_asin counts in `items`; equal
# numbers mean no product was inserted twice.
duck.query("select count(parent_asin), count(distinct parent_asin) from items")

┌────────────────────┬─────────────────────────────┐
│ count(parent_asin) │ count(DISTINCT parent_asin) │
│       int64        │            int64            │
├────────────────────┼─────────────────────────────┤
│             643123 │                      643123 │
└────────────────────┴─────────────────────────────┘

### Создаём таблицу stores

In [26]:
# Create the `stores` lookup table: surrogate store_id (primary key) and
# the store name.
# NOTE(review): no `if not exists`, so re-running against an existing
# database file is expected to fail - confirm this is intended.
duck.query("""
    create table stores(
        store_id bigint
        , store varchar
        , primary key (store_id)
    )
""")

In [27]:
# Fill `stores` with the distinct store names of the sampled products,
# numbering them from the sequence seq_1.
# NULL store names are excluded: `select distinct` would otherwise emit one
# NULL row, and that row could never be matched by the equality join
# (`s.store = m.store`) used when items_info is built, so it would only be
# an unreferenced entry in the lookup table.
duck.query(f"""
    create sequence if not exists seq_1;
    insert into stores
    with
    store_list as (
        select distinct store
        from parquet_scan({[str(x) for x in mfiles]}, hive_partitioning=True) 
        where
            parent_asin in (select parent_asin from products_sample)
            and store is not null
    )
    select
        nextval('seq_1') as store_id
        , store
    from store_list
""")

In [28]:
# Sanity check: total vs. distinct store names in `stores`.
duck.query("select count(store), count(distinct store) from stores")

┌──────────────┬───────────────────────┐
│ count(store) │ count(DISTINCT store) │
│    int64     │         int64         │
├──────────────┼───────────────────────┤
│       161477 │                161477 │
└──────────────┴───────────────────────┘

### Создаём таблицу categories

In [29]:
# Create the `categories` lookup table: surrogate category_id (primary key)
# and the category name.
# NOTE(review): no `if not exists`, so re-running against an existing
# database file is expected to fail - confirm this is intended.
duck.query("""
    create table categories(
        category_id bigint
        , category varchar
        , primary key (category_id)
    )
""")

In [30]:
# Fill `categories` with the distinct main_category values of the sampled
# products, numbering them from the sequence seq_1.
duck.query(f"""
    create sequence if not exists seq_1;
    insert into categories
    with
    category_list as (
        select distinct main_category as category
        from parquet_scan({list(map(str, mfiles))}, hive_partitioning=True) 
        where
            parent_asin in (select parent_asin from products_sample)
    )
    select
        nextval('seq_1') as category_id
        , category
    from category_list
""")

In [31]:
# Sanity check: total vs. distinct category names in `categories`.
duck.query("select count(category), count(distinct category) from categories")

┌─────────────────┬──────────────────────────┐
│ count(category) │ count(DISTINCT category) │
│      int64      │          int64           │
├─────────────────┼──────────────────────────┤
│              42 │                       42 │
└─────────────────┴──────────────────────────┘

### Создаём таблицу items_info

In [32]:
# Create the `items_info` table (if it does not exist yet): per-item
# category, store, price and image/video columns. Only item_id carries an
# active foreign key (to items); the category and store foreign keys are
# commented out inside the SQL.
duck.query("""
    create table if not exists items_info(
        item_id bigint
        , category_id bigint
        , store_id bigint
        , price double
        , has_image bigint
        , has_video bigint
        , foreign key (item_id) references items(item_id)
--        , foreign key (category_id) references categories(category_id)
--        , foreign key (store_id) references stores(store_id)
    )
""")

In [33]:
# Build the items_info rows as a polars frame: metadata of the sampled
# products with parent_asin, category and store replaced by their ids via
# left joins on items, categories and stores. `has_video` is read from the
# source column `videos`.
info = duck.query(f"""
    with
    meta as (
        select
           parent_asin
           , main_category as category
           , store
           , price
           , has_image
           , videos as has_video
        from parquet_scan({list(map(str, mfiles))}, hive_partitioning=True)
        where
            parent_asin in (select parent_asin from products_sample)
    )
    select
        i.item_id
        , c.category_id
        , s.store_id
        , m.price
        , m.has_image
        , m.has_video
    from meta m
        left join items i on i.parent_asin = m.parent_asin
        left join categories c on c.category = m.category
        left join stores s on s.store = m.store 
""").pl()

In [34]:
# Estimated in-memory size of `info`, in megabytes.
info.estimated_size("mb")

29.593168258666992

In [35]:
# Load the in-memory `info` frame into items_info.
# Target and source columns are listed explicitly (the reviews insert also
# names its columns) so the load no longer depends on the column order of
# `info` happening to match the table definition, as `select *` did.
duck.query("""
    insert into items_info (
        item_id
        , category_id
        , store_id
        , price
        , has_image
        , has_video
    )
    select
        item_id
        , category_id
        , store_id
        , price
        , has_image
        , has_video
    from info
""")

### Создаём матрицу наиболее часто встречающихся слов в текстовых метаданных

In [12]:
# Materialise the text metadata columns (categories, features, description,
# details) of all products present in `items` as a table named `meta`.
# NOTE(review): this cell's execution count (12) is lower than that of the
# cells creating `items`, and `meta` is not listed in the `show tables`
# output near the end of the notebook - confirm whether this table is still
# needed or whether the cell is a leftover.
duck.query(f"""
    create table meta as
    select 
        i.item_id
        , t.categories
        , t.features
        , t.description
        , t.details 
    from parquet_scan({list(map(str, mfiles))}, hive_partitioning=True) t
        inner join items i on t.parent_asin=i.parent_asin
""")

In [36]:
# TF-IDF vectoriser restricted by document frequency: min_df=0.1 and
# max_df=0.8 are float thresholds, i.e. proportions of documents.
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.8)

In [37]:
# One text document per item: categories, features, description and details
# joined with ';' into the column `txt`, fetched as a polars frame.
meta = duck.query(f"""
    select 
        i.item_id
        , concat_ws(';', t.categories, t.features, t.description, t.details) as txt 
    from parquet_scan({list(map(str, mfiles))}, hive_partitioning=True) t
        inner join items i on t.parent_asin=i.parent_asin
""").pl()

In [38]:
# Estimated in-memory size of `meta`, in megabytes.
meta.estimated_size('mb')

521.010910987854

In [39]:
# Fit the vectoriser on the `txt` column (missing texts become empty
# strings) and keep the resulting document-term matrix.
vectorized_data = vectorizer.fit_transform(
    raw_documents=meta.to_pandas()['txt'].fillna(value='')
)

In [40]:
# Turn the document-term matrix into a dense pandas frame: one column per
# vocabulary term, plus a leading `item_id` column taken from `meta`.
tfidf_df = pd.DataFrame(
    data=vectorized_data.toarray(),
    index=meta.get_column('item_id').to_numpy(),
    columns=vectorizer.get_feature_names_out(),
).reset_index(names='item_id')

In [41]:
# Drop the references to the matrix and the text frame; only tfidf_df is
# used from here on.
del vectorized_data, meta

In [42]:
# Persist the in-memory `tfidf_df` frame as the table `tfidf`.
# NOTE(review): no `if not exists`, so re-running against an existing
# database file is expected to fail - confirm this is intended.
duck.query("""
    create table tfidf as
    select *
    from tfidf_df
""")

## Создаём таблицу users

In [43]:
# Create the `users` lookup table (if it does not exist yet): surrogate
# user_id (primary key) and the original user identifier in `user`.
duck.query("""
    create table if not exists users(
        user_id bigint
        , user varchar
        , primary key (user_id)
    )
""")

In [44]:
# Fill `users` with every distinct reviewer of a sampled product. The
# source column `user_id` is stored as `user`; the new numeric user_id
# comes from the sequence seq_1.
duck.query(f"""
    create sequence if not exists seq_1;
    insert into users
    with
    u as (
        select distinct
           user_id as user
        from parquet_scan({list(map(str, review_files))}, hive_partitioning=True)
        where
            parent_asin in (select parent_asin from products_sample)
    )
    select
        nextval('seq_1') as user_id
        , user
    from u
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [45]:
# Sanity check: total vs. distinct user identifiers in `users`.
duck.query("select count(user), count(distinct user) from users")

┌───────────────┬────────────────────────┐
│ count("user") │ count(DISTINCT "user") │
│     int64     │         int64          │
├───────────────┼────────────────────────┤
│       6123376 │                6123376 │
└───────────────┴────────────────────────┘

### Создание таблицы reviews

In [46]:
# Create the `reviews` table (if it does not exist yet): one row per review
# with user, item, timestamp, rating, helpful votes and the verified flag.
# Only user_id carries an active foreign key (to users); the item foreign
# key is commented out inside the SQL.
duck.query("""
    create table if not exists reviews (
        user_id bigint
        , item_id bigint
        , review_timestamp timestamp
        , rating double
        , helpful_vote bigint
        , verified_purchase boolean
        , foreign key (user_id) references users(user_id)
        --, foreign key (item_id) references items(item_id)
    )
""")

In [47]:
# Build the review rows as a polars frame: reviews of the sampled products
# with the reviewer and parent_asin replaced by user_id / item_id via left
# joins on users and items. `timestamp` is converted with epoch_ms() into
# review_timestamp.
reviews_df = duck.query(f"""
    with
    r as (
        select
            user_id as user
            , parent_asin
            , epoch_ms(timestamp) as review_timestamp
            , rating
            , helpful_vote
            , verified_purchase
        from parquet_scan({list(map(str, review_files))}, hive_partitioning=True)
        where
            parent_asin in (select parent_asin from products_sample)
    )
    select
        u.user_id
        , i.item_id
        , review_timestamp
        , rating
        , helpful_vote
        , verified_purchase
    from r
        left join users u on r.user=u.user
        left join items i on r.parent_asin=i.parent_asin
""").pl()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [48]:
# Estimated in-memory size of `reviews_df`, in megabytes.
reviews_df.estimated_size("mb")

348.7381992340088

In [49]:
# Load the in-memory `reviews_df` frame into the `reviews` table, naming
# each selected column explicitly. (A plain string is enough: the statement
# has no placeholders.)
duck.query("""
    insert into reviews
    select
        user_id
        , item_id
        , review_timestamp
        , rating
        , helpful_vote
        , verified_purchase
    from reviews_df
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [50]:
# Preview the contents of the `reviews` table.
duck.query("select * from reviews")

┌─────────┬─────────┬─────────────────────────┬────────┬──────────────┬───────────────────┐
│ user_id │ item_id │    review_timestamp     │ rating │ helpful_vote │ verified_purchase │
│  int64  │  int64  │        timestamp        │ double │    int64     │      boolean      │
├─────────┼─────────┼─────────────────────────┼────────┼──────────────┼───────────────────┤
│ 5902887 │   33286 │ 2022-11-20 11:40:49.07  │    5.0 │            3 │ true              │
│ 5902887 │   94515 │ 2022-11-20 11:36:13.763 │    5.0 │            0 │ true              │
│ 3125335 │   49955 │ 2020-07-20 05:36:05.496 │    5.0 │            0 │ true              │
│ 2589724 │  103627 │ 2018-10-08 02:19:23.123 │    5.0 │           56 │ true              │
│ 3222923 │  111660 │ 2022-06-05 23:16:30.286 │    5.0 │            1 │ true              │
│ 3222923 │   59394 │ 2017-02-06 04:06:04     │    3.0 │            0 │ true              │
│ 3222923 │  117090 │ 2014-03-24 12:57:14     │    3.0 │            0 │ false   

In [51]:
# List the tables that now exist in the database.
duck.query("show tables")

┌────────────┐
│    name    │
│  varchar   │
├────────────┤
│ categories │
│ items      │
│ items_info │
│ reviews    │
│ stores     │
│ tfidf      │
│ users      │
└────────────┘

In [52]:
# Preview the contents of the `users` table.
duck.query("select * from users")

┌─────────┬──────────────────────────────┐
│ user_id │             user             │
│  int64  │           varchar            │
├─────────┼──────────────────────────────┤
│  939796 │ AG56OBEZPGB6YIRT62XAWELAVGEQ │
│  939797 │ AFED3JM4OIBL5VNAJNRYCEUTSNOA │
│  939798 │ AECQTTS3VETRQHG4JBEX6ZSWU4KQ │
│  939799 │ AEHO6QXUGNJ2UE4WD7DV7UP4HKCQ │
│  939800 │ AHEOXRDBZB6PZZRCXXMBXVRR7ZTQ │
│  939801 │ AH7DFKIRWIBGLJ2TXK4VBSFQPLFQ │
│  939802 │ AHI4GJG4EPRJFHLEWL2DBVVAKOTQ │
│  939803 │ AFMGEZWJU2NJEPTNCTHG3732S62A │
│  939804 │ AG7DVK5PSLSKAG6ODWX432NJ2KGA │
│  939805 │ AF2LOWPJKXPBY5FJG2SGIBLPF3DQ │
│     ·   │              ·               │
│     ·   │              ·               │
│     ·   │              ·               │
│ 1063971 │ AH5GML4UXVCDLHFM4K4UWOFUJJVA │
│ 1063972 │ AF7SBDISIYAFIWU7FLRILQTN5WKA │
│ 1063973 │ AFSL7SB4BO33CQO6LHQUPVDNGO4A │
│ 1063974 │ AHJU5TFSSRQHV6PUK6BM6P3JPTAQ │
│ 1063975 │ AE2EK4POH4NJC6GYSXMNWWJU7NYQ │
│ 1063976 │ AGXWDO5ZY3JEAYTI4NGY6BLDV6MA │
│ 1063977 │

In [13]:
# Close the DuckDB connection opened at the top of the notebook.
duck.close()