In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import duckdb
import polars as pl

In [5]:
pd.to_datetime('2024-09-23') + pd.Timedelta(days = 1)

Timestamp('2024-09-24 00:00:00')

In [2]:
source_fol = Path('parsed_files')
reviews_fol = Path(source_fol, 'reviews')
meta_fol = Path(source_fol, 'meta')

In [3]:
all_files = set(Path.rglob(source_fol, '*.parquet'))
meta_files = set(Path.rglob(source_fol, 'meta*.parquet'))
review_files = all_files.difference(meta_files)

In [4]:
review_files

{WindowsPath('parsed_files/reviews/Amazon_Fashion.parquet'),
 WindowsPath('parsed_files/reviews/Arts_Crafts_and_Sewing.parquet'),
 WindowsPath('parsed_files/reviews/Baby_Products.parquet'),
 WindowsPath('parsed_files/reviews/Grocery_and_Gourmet_Food.parquet'),
 WindowsPath('parsed_files/reviews/Handmade_Products.parquet'),
 WindowsPath('parsed_files/reviews/Industrial_and_Scientific.parquet'),
 WindowsPath('parsed_files/reviews/Magazine_Subscriptions.parquet'),
 WindowsPath('parsed_files/reviews/Musical_Instruments.parquet'),
 WindowsPath('parsed_files/reviews/Software.parquet')}

In [5]:
meta_files

{WindowsPath('parsed_files/meta/meta_Amazon_Fashion.parquet'),
 WindowsPath('parsed_files/meta/meta_art.parquet'),
 WindowsPath('parsed_files/meta/meta_Baby_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_Grocery_and_Gourmet_Food.parquet'),
 WindowsPath('parsed_files/meta/meta_Handmade_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_Industrial_and_Scientific.parquet'),
 WindowsPath('parsed_files/meta/meta_Magazine_Subscriptions.parquet'),
 WindowsPath('parsed_files/meta/meta_Musical_Instruments.parquet'),
 WindowsPath('parsed_files/meta/meta_Software.parquet')}

In [6]:
db_file = 'ecom_db.duckdb'
duck = duckdb.connect(db_file)

In [9]:
duck.query(f"""
    create table reviews as
    select
        rating
        , parent_asin
        , user_id
        , epoch_ms(timestamp) as review_timestamp
        , helpful_vote
        , verified_purchase
    from parquet_scan({[str(x) for x in list(review_files)]}, hive_partitioning=True)
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [34]:
duck.query("select count(*) from reviews")# order by helpful_vote desc")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     45631385 │
└──────────────┘

In [42]:
duck.query("alter table reveiws_net rename to reviews_net")

In [38]:
duck.query("""
    create table reveiws_net as
    with
    base as (
    select 
        r.*
        , strftime(review_timestamp, '%Y%m%d') as review_date
        , row_number() over(
            partition by user_id, parent_asin, review_date
           order by review_timestamp
        ) as row_n
    from reviews r
    qualify row_n=1
    order by
        user_id
        , parent_asin
        , review_timestamp
    )
    select * exclude(row_n, review_date)
    from base
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [53]:
users_nunique=duck.query("select count(distinct user_id) from reviews_net").fetchall()[0][0]
users_nunique

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

17776687

In [64]:
duck.query(f"""
    create sequence seq_id start 1;
    with
    base as (
        select
           distinct user_id
        from reviews_net
    )
    select
        nextval('seq_id') as id
        , user_id
    from base
    limit 5
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌───────┬──────────────────────────────┐
│  id   │           user_id            │
│ int64 │           varchar            │
├───────┼──────────────────────────────┤
│     1 │ AE5JCP242E67D6OOUIPQPYW7QH3A │
│     2 │ AE5JD3IOUTOPD4UX4PAVNRCPVEMQ │
│     3 │ AE5JDAQ4WBSAKPOFJNHSASZV4VVQ │
│     4 │ AE5JDBITA7BHP2IGKS5PJSTIPUPA │
│     5 │ AE5JDFPLO7EJT6OGVT5VRXE4G4KA │
└───────┴──────────────────────────────┘

In [68]:
duck.query("drop sequence seq_id")

In [69]:
duck.query(f"""
    create sequence seq_id start 1;
    create table users as
    with
    base as (
        select
           distinct user_id
        from reviews_net
    )
    select
        nextval('seq_id') as id
        , user_id
    from base
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [70]:
duck.query("select * from users").pl().estimated_size('mb')

1085.0031814575195

In [74]:
duck.query("drop table reviews")

In [80]:
duck.query("""
    create table reviews as
    select
        u.id as user_id
        , rn.parent_asin
        , rn.review_timestamp
        , rn.rating
        , rn.helpful_vote
        , rn.verified_purchase
    from reviews_net rn
        left join users u on rn.user_id=u.user_id
    order by review_timestamp
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [46]:
duck.query("""
    select *
    from reviews_net
""").pl().estimated_size('mb')

3876.1126098632812

In [83]:
duck.query("""
    select *
    from reviews
""").pl().estimated_size('mb')

1811.6885948181152

## Создаём таблицу продуктов

In [12]:
meta_files

{WindowsPath('parsed_files/meta/meta_Amazon_Fashion.parquet'),
 WindowsPath('parsed_files/meta/meta_art.parquet'),
 WindowsPath('parsed_files/meta/meta_Baby_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_Grocery_and_Gourmet_Food.parquet'),
 WindowsPath('parsed_files/meta/meta_Handmade_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_Industrial_and_Scientific.parquet'),
 WindowsPath('parsed_files/meta/meta_Magazine_Subscriptions.parquet'),
 WindowsPath('parsed_files/meta/meta_Musical_Instruments.parquet'),
 WindowsPath('parsed_files/meta/meta_Software.parquet')}

In [36]:
mfiles = list(meta_files)[1:]+[list(meta_files)[0]]
mfiles

[WindowsPath('parsed_files/meta/meta_Baby_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_Software.parquet'),
 WindowsPath('parsed_files/meta/meta_Industrial_and_Scientific.parquet'),
 WindowsPath('parsed_files/meta/meta_Grocery_and_Gourmet_Food.parquet'),
 WindowsPath('parsed_files/meta/meta_Handmade_Products.parquet'),
 WindowsPath('parsed_files/meta/meta_art.parquet'),
 WindowsPath('parsed_files/meta/meta_Musical_Instruments.parquet'),
 WindowsPath('parsed_files/meta/meta_Magazine_Subscriptions.parquet'),
 WindowsPath('parsed_files/meta/meta_Amazon_Fashion.parquet')]

In [37]:
duck.query(f"""
    create table raw_items as
    select *
    from parquet_scan({[str(x) for x in mfiles]}, hive_partitioning=True)
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [39]:
duck.query("""
    select *
    from raw_items
    limit 100
""").show(max_width=500)

┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬────────┬──────────────────────┬────────┬──────────────────────┬──────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────┬───────────┐
│    main_category     │        title         │       features       │     description      │ price  │        images        │ videos │        store         │      categories      │                                                                                                                                               details                                                                                                                                               │ parent_asin │ has_image 

In [41]:
duck.query("""
    select
        count(*)
        , count(distinct parent_asin)
    from raw_items
""")

┌──────────────┬─────────────────────────────┐
│ count_star() │ count(DISTINCT parent_asin) │
│    int64     │            int64            │
├──────────────┼─────────────────────────────┤
│      3347168 │                     3347168 │
└──────────────┴─────────────────────────────┘

In [58]:
duck.query(f"""
    drop sequence seq_id;
    create sequence seq_id start 1;
    create table products as
    with
    base as (
        select distinct 
           parent_asin
           , title
           , images
        from raw_items
    )
    select
        nextval('seq_id') as product_id
        , parent_asin
        , title
        , images
    from base
""")

In [50]:
duck.query(f"""
    drop sequence seq_id;
    create sequence seq_id start 1;
    create table stores as
    with
    base as (
        select distinct store
        from raw_items
    )
    select
        nextval('seq_id') as store_id
        , store
    from base
""")

In [55]:
duck.query(f"""
    drop sequence seq_id;
    create sequence seq_id start 1;
    create table categories as
    with
    base as (
        select distinct main_category
        from raw_items
    )
    select
        nextval('seq_id') as category_id
        , main_category
    from base
""")

In [44]:
duck.query("select * from raw_items limit 1").to_df().columns

Index(['main_category', 'title', 'features', 'description', 'price', 'images',
       'videos', 'store', 'categories', 'details', 'parent_asin', 'has_image'],
      dtype='object')

In [None]:
['main_category', 'title', 'features', 'description', 'price', 'images',
       'videos', 'store', 'categories', 'details', 'parent_asin', 'has_image']

In [None]:
['videos', 'details', 'parent_asin', 'has_image']

In [9]:
duck.query("""
    --create table descriptions as
    select
        p.product_id
        , c.category_id
        , ri.description
    from raw_items ri
        left join products p on ri.parent_asin=ri.parent_asin
        --left join stores s on ri.store=s.store
        left join categories c on ri.main_category=c.main_category
""")

┌────────────┬─────────────┬───────────────────────────────────────────────────────────────────────────────────────────┐
│ product_id │ category_id │                                        description                                        │
│   int64    │    int64    │                                          varchar                                          │
├────────────┼─────────────┼───────────────────────────────────────────────────────────────────────────────────────────┤
│       4097 │          35 │ NULL                                                                                      │
│       4097 │           1 │ Great tasting CarbSport is a complete sports drink that helps prevent muscle cramps whi…  │
│       4097 │          35 │ Add delicious flavor to everything you bake this holiday.                                 │
│       4097 │          35 │ As we have all seen and heard, Sriracha is the "it" hot sauce! And, it is great! It is …  │
│       4097 │           1 │ NUL

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
vectorizer = TfidfVectorizer(min_df=.1, max_df=.8)

In [9]:
vectorized_data = vectorizer.fit_transform(
    duck.query("""
        select concat_ws(';', categories, features, description, details) as txt 
        from raw_items
    """).to_df()['txt'].fillna('')
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [10]:
vectorizer.get_feature_names_out()

array(['10', '100', '11', '12', '13', '14', '15', 'all', 'also', 'an',
       'and', 'any', 'are', 'arts', 'as', 'at', 'available', 'be', 'best',
       'brand', 'by', 'can', 'color', 'crafts', 'date', 'design',
       'dimensions', 'discontinued', 'each', 'easy', 'first', 'for',
       'free', 'from', 'great', 'hand', 'has', 'have', 'high', 'in',
       'inches', 'included', 'is', 'it', 'item', 'items', 'made', 'make',
       'manufacturer', 'material', 'may', 'model', 'more', 'no', 'not',
       'number', 'of', 'on', 'one', 'only', 'or', 'other', 'ounces',
       'our', 'package', 'perfect', 'pieces', 'pounds', 'product',
       'products', 'quality', 'range', 'rank', 'required', 'sellers',
       'sewing', 'size', 'style', 'supplies', 'that', 'the', 'these',
       'this', 'time', 'to', 'type', 'up', 'upc', 'use', 'used', 'we',
       'weight', 'will', 'with', 'you', 'your'], dtype=object)

In [11]:
tfidf_df=pd.DataFrame(
    vectorized_data.toarray(),
    columns = vectorizer.get_feature_names_out()
)
tfidf_df

Unnamed: 0,10,100,11,12,13,14,15,all,also,an,...,up,upc,use,used,we,weight,will,with,you,your
0,0.0,0.000000,0.000000,0.000000,0.063669,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.063094,0.0,0.000000,0.000000,0.000000,0.187570,0.000000,0.257043,0.000000,0.000000
1,0.0,0.000000,0.000000,0.266361,0.078415,0.000000,0.000000,0.128667,0.000000,0.000000,...,0.466244,0.0,0.064387,0.000000,0.000000,0.184810,0.000000,0.000000,0.111681,0.109124
2,0.0,0.072317,0.000000,0.000000,0.000000,0.000000,0.000000,0.062288,0.303192,0.140759,...,0.000000,0.0,0.062340,0.000000,0.146851,0.044734,0.000000,0.087575,0.162197,0.211311
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.130995,0.000000,...,0.000000,0.0,0.215473,0.000000,0.000000,0.077309,0.000000,0.075674,0.000000,0.182595
4,0.0,0.000000,0.000000,0.000000,0.000000,0.150672,0.000000,0.079568,0.000000,0.000000,...,0.000000,0.0,0.079634,0.048616,0.046898,0.057144,0.000000,0.055935,0.172661,0.101225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3347163,0.0,0.000000,0.670250,0.000000,0.000000,0.000000,0.367252,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3347164,0.0,0.086275,0.083437,0.076919,0.000000,0.000000,0.000000,0.074311,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.154733,0.261199,0.129003,0.126049
3347165,0.0,0.000000,0.000000,0.393842,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3347166,0.0,0.467300,0.000000,0.000000,0.490602,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
tfidf_df.index = duck.query("select p.product_id from raw_items ri left join products p on ri.parent_asin=p.parent_asin").to_df()['product_id']

In [13]:
tfidf_df.head(20)

Unnamed: 0_level_0,10,100,11,12,13,14,15,all,also,an,...,up,upc,use,used,we,weight,will,with,you,your
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120522,0.0,0.0,0.0,0.0,0.063669,0.0,0.0,0.0,0.0,0.0,...,0.063094,0.0,0.0,0.0,0.0,0.18757,0.0,0.257043,0.0,0.0
14337,0.0,0.0,0.0,0.266361,0.078415,0.0,0.0,0.128667,0.0,0.0,...,0.466244,0.0,0.064387,0.0,0.0,0.18481,0.0,0.0,0.111681,0.109124
106739,0.0,0.072317,0.0,0.0,0.0,0.0,0.0,0.062288,0.303192,0.140759,...,0.0,0.0,0.06234,0.0,0.146851,0.044734,0.0,0.087575,0.162197,0.211311
54062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130995,0.0,...,0.0,0.0,0.215473,0.0,0.0,0.077309,0.0,0.075674,0.0,0.182595
71133,0.0,0.0,0.0,0.0,0.0,0.150672,0.0,0.079568,0.0,0.0,...,0.0,0.0,0.079634,0.048616,0.046898,0.057144,0.0,0.055935,0.172661,0.101225
14338,0.0,0.0,0.0,0.0,0.0,0.0,0.383627,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135311,0.0
71135,0.0,0.443795,0.0,0.098916,0.116481,0.120641,0.0,0.0,0.0,0.0,...,0.0,0.0,0.095643,0.116779,0.11265,0.068631,0.099492,0.067179,0.082948,0.162098
54063,0.0,0.0,0.58017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120523,0.0,0.0,0.0,0.097089,0.0,0.0,0.0,0.093798,0.114142,0.0,...,0.0,0.0,0.0,0.0,0.110569,0.067363,0.097654,0.065939,0.162832,0.0
2355662,0.0,0.325268,0.0,0.0,0.170744,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.070099,0.0,0.247692,0.050301,0.07292,0.049237,0.121589,0.059403


In [14]:
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3347168 entries, 120522 to 1987965
Data columns (total 96 columns):
 #   Column        Dtype  
---  ------        -----  
 0   10            float64
 1   100           float64
 2   11            float64
 3   12            float64
 4   13            float64
 5   14            float64
 6   15            float64
 7   all           float64
 8   also          float64
 9   an            float64
 10  and           float64
 11  any           float64
 12  are           float64
 13  arts          float64
 14  as            float64
 15  at            float64
 16  available     float64
 17  be            float64
 18  best          float64
 19  brand         float64
 20  by            float64
 21  can           float64
 22  color         float64
 23  crafts        float64
 24  date          float64
 25  design        float64
 26  dimensions    float64
 27  discontinued  float64
 28  each          float64
 29  easy          float64
 30  first         floa

In [7]:
duck.query("""
    select description
    from raw_items
""")

┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                                     description                                                      │
│                                                       varchar                                                        │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Product Description;For ultimate convenience, the Chicco Viaro Quick-Fold Stroller has a sleek three-wheel design,…  │
│ You can choose bigger size If you confuse about size.;Medium: Age (6-12 months), Height (up to 32inches), Weight (…  │
│ NULL                                                                                                                 │
│ When babies begin to show interest in feeding themselves, having the right equipment is a must! Nuby feeding acces…  │
│ The mDesign clear storage bins

In [12]:
duck.query("""
    --create table meta as
    select
        p.product_id
        , ri.store
        , ri.main_category
        , ri.categories as subcategories
        , ri.features
        , ri.description
        , ri.details
        , ri.price
        , ri.has_image
        , ri.videos as has_video
    from raw_items ri
        left join products p on ri.parent_asin=ri.parent_asin
        left join stores s on ri.store=s.store
        left join categories c on ri.main_category=c.main_category
""").show(max_width=200)#.pl().estimated_size()

┌────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬─────────────────────────────────────┬────────┬───────────┬───────────┐
│ product_id │        store         │    main_category     │    subcategories     │       features       │     description      │               details               │ price  │ has_image │ has_video │
│   int64    │       varchar        │       varchar        │       varchar        │       varchar        │       varchar        │               varchar               │ double │   int64   │   int64   │
├────────────┼──────────────────────┼──────────────────────┼──────────────────────┼──────────────────────┼──────────────────────┼─────────────────────────────────────┼────────┼───────────┼───────────┤
│       4097 │ mAppsguru            │ Appstore for Android │ NULL                 │ All the pressing p…  │ Acupressure techni…  │ Release Date:2015;Date first list…  │    0.0 │         1 │        

In [8]:
duck.query("select * from products")

┌────────────┬─────────────┬────────────────────────────────┬──────────────────────────────────────────────────────────┐
│ product_id │ parent_asin │             title              │                          images                          │
│   int64    │   varchar   │            varchar             │                         varchar                          │
├────────────┼─────────────┼────────────────────────────────┼──────────────────────────────────────────────────────────┤
│       8193 │ B07X3Q2M38  │ Korean Dongwon Yangban Seaso…  │ https://m.media-amazon.com/images/I/61VU+bCdokL.jpg      │
│       8194 │ B07FYFBFDF  │ PRODUCE Organic Strawberries…  │ https://m.media-amazon.com/images/I/41Ks9wyeEyL.jpg      │
│       8195 │ B09P9NDRSY  │ US-FARMERS Natural Premium Q…  │ https://m.media-amazon.com/images/I/51TL-VBTrNL.jpg      │
│       8196 │ B07L1XNR2Y  │ Poulain Cacao Grand Arome 800g │ https://m.media-amazon.com/images/I/21WhGtZomXL.jpg      │
│       8197 │ B08XRJSFH5  │ Wil

In [None]:
duck.query("""
    select
        parent_asin
        , count(*) as cnt
    from items
    group by parent_asin
    order by cnt desc
""")

In [None]:
reviews = pl.scan_parquet(review_files)#.collect()

In [None]:
users_cnt = reviews.select('user_id').collect().n_unique()
users_cnt
#pl.Series(name='id', values=np.arange(1, 10))

In [None]:
reviews.with_columns(
    pl.from_numpy(np.arange(1, pl.col('user_id').n_unique()))
)

In [None]:
users = reviews.select(pl.col('user_id')).unique().collect().to_pandas().reset_index()

In [None]:
reviews

In [None]:
reviews['user_id'].n_unique()

In [None]:
reviews['user_id'].to_pandas().reset_index()

In [None]:
del reviews

In [None]:
items = pd.concat(list(map(
    lambda x: pd.read_parquet(x),
    meta_files
)), ignore_index=True)

In [None]:
reviews.estimated_size('gb')

In [None]:
reviews