In [1]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split

# Establish connection to the SQLite databases
# HARD CODED change location
meta_conn = sqlite3.connect('../Home_and_Kitchen/meta_Home_and_Kitchen.db')
reviews_conn = sqlite3.connect('../Home_and_Kitchen/Home_and_Kitchen_reviews.db')


In [18]:
def calculate_distribution(df):
    # Calculate distribution by 'categories', 'rating', and 'month'
    return df.groupby(['categories', 'rating', 'month']).size().reset_index(name='review_count')



def sample_by_year(year):
    # Step 1: Load filtered data for reviews for a specific year from the reviews database
    reviews_query = f"""
    SELECT * 
    FROM reviews 
    WHERE timestamp BETWEEN strftime('%s', '{year}-01-01') * 1000 AND strftime('%s', '{year}-12-31') * 1000
    """
    reviews_df = pd.read_sql_query(reviews_query, reviews_conn)

    # Step 2: Load metadata from the meta database
    meta_query = "SELECT * FROM products"
    meta_df = pd.read_sql_query(meta_query, meta_conn)

    # Step 3: Convert the timestamp to a datetime and extract quarter information for seasonality
    reviews_df['timestamp'] = pd.to_datetime(reviews_df['timestamp'], unit='ms')
    # reviews_df['quarter'] = reviews_df['timestamp'].dt.to_period('Q')  # Group by quarter for seasonality
    reviews_df['month'] = reviews_df['timestamp'].dt.to_period('M')  # Group by quarter for seasonality

    # Step 4: Merge the filtered reviews with the metadata on 'parent_asin'
    merged_df = pd.merge(reviews_df, meta_df, on='parent_asin', how='left')
    print('reviews_df.size',reviews_df.shape,'meta_df.size',meta_df.shape,'merged_df.size',merged_df.shape)

    del reviews_df
    del meta_df

    # Step 5: Stratified sampling by category, rating, and quarter
    sampling_percentage = 0.01  # Adjust this percentage based on your needs
    stratified_sample = merged_df.groupby(['categories', 'rating', 'month'], group_keys=False).apply(
        lambda x: x.sample(frac=sampling_percentage, random_state=42))

    original_distribution = calculate_distribution(merged_df)
    # Step 6: Return the sampled data for this year
    return stratified_sample,original_distribution

def compare_distributions(original_distribution, sampled_distribution):
    
    # Merge the two distributions for comparison
    comparison_df = pd.merge(original_distribution, sampled_distribution, 
                             on=['categories', 'rating', 'month'], 
                             how='outer', 
                             suffixes=('_original', '_sampled'))

    # Fill NaNs with 0 for cases where one dataset has no data for a combination
    comparison_df.fillna(0, inplace=True)
    # print(comparison_df.columns)
    # Calculate the ratio of the sample to the original data
    comparison_df['sample_to_original_ratio'] = comparison_df['review_count_sampled'] / comparison_df['review_count_original']
    
    return comparison_df


In [3]:
sampled_2018,distribution_2018 = sample_by_year(2018)

reviews_df.size (5811300, 10) meta_df.size (3024319, 24) merged_df.size (5811300, 33)


In [12]:
sampled_2018.to_csv('sampled_2018.csv',index=False)
distribution_2018.to_csv('distribution_2018.csv',index=False)
sampled_2018.head()

Unnamed: 0,rating,title_x,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,month,...,style,pattern,care_instructions,unit_count,dimensions,num_items,item_weight,best_sellers_rank,discontinued,date_first_available
236575,5.0,"Very nice set, is very temperamental but made ...",Really nice and cute. I bought these to put on...,B00M2IAPXK,B00M2IAPXK,AGLI372CF3E6HPQD2MDP4AVTRPWA,2018-01-12 23:07:11.420,19,1,2018-01,...,Modern,Solid,,,"11.75""W x 3""H",,1.75 pounds,"{""Kitchen & Dining"": 20540, ""Chip & Dip Sets"":...",No,"July 24, 2014"
4730138,5.0,Really like it,"I really like it, always receive good comments...",B01BGQ82LU,B01BGQ82LU,AFLFKHAFQH5K6VDDTFNPVPYMIA7A,2018-12-15 20:13:58.764,0,1,2018-12,...,Classic,,Hand Wash Only,,"10""W x 7""H",,1 pounds,"{""Kitchen & Dining"": 330853, ""Chip & Dip Sets""...",,"February 4, 2016"
5765355,1.0,Frayed material the first week! Not a good pro...,Junk it started to fray the same week I bought...,B018CWUO3K,B018CWUO3K,AHDSHS5LWJDCYJOP4WK23EOBS4SA,2018-01-29 19:39:33.495,0,1,2018-01,...,Paw Patrol,,,,3.8 x 2.6 x 16.6 inches,,1.3 Pounds,"{""Home & Kitchen"": 952995, ""Bath Rugs"": 2842}",,
2482293,1.0,Small over priced crappy product. Would never buy,Small over priced crappy product. Would never ...,B071LQZ7J7,B09K4WX5ND,AF6RRW7SK5TWJF63ZPFAHUNXLFDA,2018-01-21 21:42:14.843,2,1,2018-01,...,,Solid,,,32 x 20 x 1 inches,,1.28 pounds,"{""Home & Kitchen"": 2411, ""Bath Rugs"": 28}",No,
3067345,1.0,One Star,Returned,B00LRYF0XA,B00LRYF0XA,AEB2PFSF5AXMYWFL5AU4WW3TGCIA,2018-02-07 18:14:13.772,0,1,2018-02,...,Casual,Solid,Machine Wash,,"46""L x 30""W",,4 Pounds,{},No,"July 13, 2014"


In [13]:
sampled_2018.shape,distribution_2018.shape

((51935, 33), (87694, 4))

In [14]:
sampled_2019,distribution_2019 = sample_by_year(2019)
sampled_2019.to_csv('sampled_2019.csv',index=False)
distribution_2019.to_csv('distribution_2019.csv',index=False)

sampled_2019.head()

reviews_df.size (8181169, 10) meta_df.size (3024319, 24) merged_df.size (8181169, 33)


Unnamed: 0,rating,title_x,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,month,...,style,pattern,care_instructions,unit_count,dimensions,num_items,item_weight,best_sellers_rank,discontinued,date_first_available
2773793,5.0,Got what I expected!,Bought this for my mom. She absolutely loves it.,B0029MW3CA,B0029MW3CA,AFJAPPVOKDLM2S3GR2INUVEGNEKQ,2019-01-28 00:23:29.429,0,1,2019-01,...,Classic,Floral,Machine Wash,,"14.25""W x 2.5""H",,4.65 pounds,"{""Kitchen & Dining"": 192114, ""Chip & Dip Sets""...",No,"September 18, 2009"
5954080,5.0,Nice tray,Very nice!,B016KCAK10,B016KCAK10,AFVZKXBW4X4FPNLTLSDW7IA7GQFA,2019-03-07 20:39:13.659,0,1,2019-03,...,NCAA Melamine Chip and Dip Tray,,,,,,0.2 Pounds,"{""Sports & Outdoors"": 990791, ""Chip & Dip Sets...",,"October 12, 2015"
31439,5.0,Quality product.,Beautiful quality crystal bowl. Would purchase...,B01NAH35VE,B01NAH35VE,AGIMBQXCFOCI3MJ2TM6NYNY7ZKGQ,2019-04-18 13:37:46.107,0,1,2019-04,...,Classic,,,,"11.8""W x 3""H",,8.18 pounds,"{""Kitchen & Dining"": 264087, ""Chip & Dip Sets""...",Yes,"December 8, 2016"
5636037,5.0,great purchase!,"used for a family gathering, very attractive ,...",B07F5MDG1Y,B07F5MDG1Y,AEEUO4R2F2USIZGIYQL3CNEP2BBA,2019-05-07 10:44:26.182,0,1,2019-05,...,,Solid,,,,,6.54 pounds,{},No,"June 30, 2018"
2115051,5.0,Perfect!,Perfect!,B07DKRDNG9,B07DKRDNG9,AHKM2BE44RWGPZSSKU4D4HIYBZUA,2019-06-27 20:44:54.375,1,1,2019-06,...,Traditional,Floral,,,"7""W x 2""H",,1 pounds,"{""Kitchen & Dining"": 269093, ""Chip & Dip Sets""...",No,"June 6, 2018"


In [22]:
comparison_df_2018 = compare_distributions(distribution_2018,calculate_distribution(sampled_2018))
comparison_df_2019 = compare_distributions(distribution_2019,calculate_distribution(sampled_2019))


Index(['categories', 'rating', 'month', 'review_count_original',
       'review_count_sampled'],
      dtype='object')
Index(['categories', 'rating', 'month', 'review_count_original',
       'review_count_sampled'],
      dtype='object')


In [29]:
comparison_df_2018.sort_values(by=['review_count_sampled','sample_to_original_ratio'],ascending=False).head(50)

Unnamed: 0,categories,rating,month,review_count_original,review_count_sampled,sample_to_original_ratio
87693,[],5.0,2018-12,17290,173.0,0.010006
87682,[],5.0,2018-01,16870,169.0,0.010018
87684,[],5.0,2018-03,15053,151.0,0.010031
87688,[],5.0,2018-07,15082,151.0,0.010012
87689,[],5.0,2018-08,14849,148.0,0.009967
87687,[],5.0,2018-06,14452,145.0,0.010033
87691,[],5.0,2018-10,13903,139.0,0.009998
87685,[],5.0,2018-04,13941,139.0,0.009971
87683,[],5.0,2018-02,13368,134.0,0.010024
87692,[],5.0,2018-11,13428,134.0,0.009979


In [30]:
comparison_df_2019.sort_values(by=['review_count_sampled','sample_to_original_ratio'],ascending=False).head(50)


Unnamed: 0,categories,rating,month,review_count_original,review_count_sampled,sample_to_original_ratio
88210,[],5.0,2019-12,29277,293.0,0.010008
88205,[],5.0,2019-07,26860,269.0,0.010015
88204,[],5.0,2019-06,22233,222.0,0.009985
88206,[],5.0,2019-08,21390,214.0,0.010005
88209,[],5.0,2019-11,19523,195.0,0.009988
88199,[],5.0,2019-01,19049,190.0,0.009974
88203,[],5.0,2019-05,18902,189.0,0.009999
88208,[],5.0,2019-10,18819,188.0,0.00999
88207,[],5.0,2019-09,18410,184.0,0.009995
88201,[],5.0,2019-03,16550,166.0,0.01003


In [2]:
# sampled_2018 = pd.read_csv('sampled_2018.csv')
# sampled_2019 = pd.read_csv('sampled_2019.csv')


In [4]:
sampled_2018['rating'].value_counts(),sampled_2019['rating'].value_counts()

(rating
 5.0    38009
 4.0     5278
 1.0     4429
 3.0     2553
 2.0     1666
 Name: count, dtype: int64,
 rating
 5.0    57244
 4.0     6973
 1.0     6057
 3.0     3235
 2.0     2192
 Name: count, dtype: int64)

In [6]:
combined_sample = pd.concat([sampled_2018, sampled_2019])
combined_sample.to_csv('combined_sample.csv',index=False)
combined_sample.head()


Unnamed: 0,rating,title_x,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,month,...,style,pattern,care_instructions,unit_count,dimensions,num_items,item_weight,best_sellers_rank,discontinued,date_first_available
0,1.0,One Star,Cute hat but must be a child's size. Runs very...,B018AUB3GQ,B018AUB3GQ,AF2NUUY77O3CG4TFZOQILNSBGZPA,2018-01-28 02:07:13.534,0,1,2018-01,...,,,,,5 x 3 x 4 inches; 2 Ounces,,,{},No,"August 26, 2016"
1,1.0,DO NOT BUY FROM THIS COMPANY!,I wish that I could give 0 stars to this produ...,B00UPS94NQ,B00UPS94NQ,AH5F6M64O2JXBF6IQJUUIFJOBULQ,2018-01-19 13:28:01.542,4,1,2018-01,...,Contemporary,,,,,,,{},,"March 14, 2015"
2,1.0,One Star,childs apron,B06VWV9B8Z,B06VWV9B8Z,AGC7QXYV5K3VFYDMPJBRMOMXOUCA,2018-02-01 18:19:14.301,1,1,2018-02,...,,,,,,,2.4 Ounces,{},No,"June 11, 2019"
3,1.0,I am so disappointed this bag will barely hold...,I am so disappointed this bag will barely hold...,B0170L9AA2,B0170L9AA2,AE7JKEXUDX4DHF4SQ4HKJQQPCSIQ,2018-03-13 23:39:27.857,0,1,2018-03,...,,,,,,,,{},,"April 13, 2016"
4,1.0,... cover which is only primary colors and not...,This is NOT a beach towel but a silky cover wh...,B06XRXPKZY,B06XRXPKZY,AHQWQXKP6B3EYF7MICPNUCSGYC3A,2018-04-16 00:34:46.581,1,1,2018-04,...,,,,,,,,{},No,"March 21, 2017"


In [7]:
combined_sample.shape

(136521, 33)

In [None]:
# train, temp = train_test_split(combined_sample, test_size=0.3, stratify=combined_sample[['main_category', 'rating', 'quarter']])
# validation, test = train_test_split(temp, test_size=0.5, stratify=temp[['main_category', 'rating', 'quarter']])

# train.to_csv('train_data.csv', index=False)
# validation.to_csv('validation_data.csv', index=False)
# test.to_csv('test_data.csv', index=False)

