In [1]:
from multiprocessing_scripts import sentiment_analysis, emoji_count, emoji_index
from sklearn.model_selection import train_test_split
import pandas as pd
import sqlite3
import emoji

#### Utilities

In [2]:
def load_from_database(
        db: str,
        table: str,
        columns: str,
):
    connection = sqlite3.connect('../database/' + db + '.db')
    df = pd.read_sql_query(
        str('SELECT ' + columns + ' FROM ' + table), con=connection)
    connection.close()
    return df


def save_to_database(
        db: str,
        table: str,
        df: pd.DataFrame
):
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        df.to_sql(name=table, con=connection, if_exists='replace')
    except Exception as e:
        raise e
    print('DataFrame has been saved successfully to: ' + db)

### Removing NAs and Duplicates REVIEW

In [2]:
con_rev = sqlite3.connect('../database/review.db')
df_rev = pd.read_sql_query("SELECT * from review", con_rev)

In [None]:
df_rev['note_length'] = df_rev.apply(lambda row: len(str(row.note).split()), axis=1)

In [11]:
print(len(df_rev))
df_rev.dropna(how='any', inplace=True)
df_rev.note.dropna(how='any', inplace=True)
# Remove ratings with empty notes
df_rev = df_rev[df_rev.note_length > 0]
df_rev.id.drop_duplicates(inplace=True)
print(len(df_rev))

8518578
8518578


In [7]:
df_rev.note = df_rev.note.str.lower()
df_rev.note = df_rev.note.str.replace('vivino', '')
df_rev.note = df_rev.note.str.replace('\n', '')

In [8]:
df_rev.sort_values(by='id', inplace=True)
if 'index' in df_rev.columns:
    df_rev.drop(columns='index', inplace=True)
df_rev.to_sql('review', con=con_rev, if_exists='replace')

8518578

### Transform Emojis to CLDR short name

In [3]:
con_rev = sqlite3.connect('../database/review.db')
df_rev = pd.read_sql_query("SELECT * from review", con_rev)

In [5]:
emoji_list = list(emoji.EMOJI_DATA.keys())
emoji_list = '|'.join(emoji_list)
# *-symbol causes an error with regex -> remove
emoji_list = emoji_list.replace('*', '')

In [6]:
emoji_indexes = emoji_index.process(df=df_rev, emoji_list=emoji_list)

In [3]:
def replace_emojis(df: pd.DataFrame, indexes):
    for index in indexes:
        note: str = df.loc[index, 'note']
        emojis = emoji.distinct_emoji_list(note)
        for var_emoji in emojis:
            emoji_name = emoji.demojize(var_emoji).replace(':', '').replace('_', ' ')
            note = note.replace(var_emoji, emoji_name + ' ')
        df.loc[index, 'note'] = note
    return df

In [8]:
df_rev_no_emoji = replace_emojis(df=df_rev, indexes=emoji_indexes)

In [63]:
con_rev_no_emojis = sqlite3.connect('../database/review_no_emojis.db')
df_rev_no_emoji.sort_values(by='id', inplace=True)
if 'index' in df_rev_no_emoji.columns:
    df_rev_no_emoji.drop(columns='index', inplace=True)
df_rev_no_emoji.to_sql('review_no_emojis', con=con_rev_no_emojis, if_exists='replace')

8518578

### Spelling correction REVIEW-NO-EMOJIS

In [2]:
con_rev_sentiment = sqlite3.connect('../database/review_en_sentiment.db')
df_rev_sentiment = pd.read_sql_query("SELECT id, note FROM review_en_sentiment", con_rev_sentiment)
con_rev_sentiment.close()
df_rev_sentiment.head(n=1)

Unnamed: 0,id,note
0,3975,excellent full bodied wine


In [None]:
from neuspell import BertChecker

bc = BertChecker()
bc.from_pretrained()

In [8]:
note_sample = df_rev_sentiment.note.sample(n=20)

for note in note_sample:
    print(
        'Original: ' + note + '\n' +
        'Corrected: ' + bc.correct(note) + '\n'
    )

Original: my 1st white burgundy.  i liked it but nothing too special.  it reminded me of a lighter sauvignon blanc.
Corrected: my 1st white burgundy . i liked it but nothing too special . it reminded me of a lighter sauvignon blanc .

Original: (4.0)100% romorantin (probably cross of pn&gouais, exclusively in cour-cheverny aoc). biodynamic.med intense nose, very fresh and lively, mix of acacia, green vegetables, citrus fruits and minerals.light+ bodied, dry, very mineral, vigorous, flavors of silex, green herbs, some lime and honey.great finish, long, highly zippy with grapefruit zest.reportedly close to chablis, but that one with more fruity impact. 
Corrected: ( 4 . 0 ) 100 % romorantin ( probably cross of pink & gouais , exclusively in four - cheverny acid ) . biodynamic . med intense nose , very fresh and lively , mix of acacia , green vegetables , citrus fruits and minerals . light + bodied , dry , very mineral , vigorous , flavors of sales , green herbs , some lime and honey . gr

In [4]:
from tqdm import tqdm
from datetime import datetime

df = df_rev_sentiment
start = datetime.now()

for i in tqdm(df.index):
    df.loc[i, 'note'] = bc.correct(df.loc[i, 'note']).lower()

print(datetime.now() - start)

con_rev_sentiment_sc = sqlite3.connect('../database/review_en_sentiment_sc.db')
df.to_sql('review_en_sentiment_sc', con=con_rev_sentiment_sc, if_exists='replace')

loading vocab from path:/Users/leonbecker/neuspell/neuspell/../data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of parameters in the model: 185211810
Loading model params from checkpoint dir: /Users/leonbecker/neuspell/neuspell/../data/checkpoints/subwordbert-probwordnoise


100%|██████████| 3976851/3976851 [66:48:20<00:00, 16.54it/s]    


2 days, 18:48:20.852025


3976851

In [6]:
con_rev_sentiment = sqlite3.connect('../database/review_en_sentiment.db')
df_rev_sentiment = pd.read_sql_query("SELECT * FROM review_en_sentiment", con_rev_sentiment)
con_rev_sentiment.close()
df_rev_sentiment.head(n=1)

Unnamed: 0,index,id,wine_id,rating,note,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment
0,0,3975,1101258,4.0,excellent full bodied wine,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549


In [7]:
df_sc = pd.merge(df_rev_sentiment, df, on='id')
df_sc.head()

Unnamed: 0,index,id,wine_id,rating,note_x,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment,note_y
0,0,3975,1101258,4.0,excellent full bodied wine,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549,excellent full bodied wine
1,1,4202,76378,5.0,the real price at phillipson is around 1299 dk...,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en,0.576068,the real price at phillipson is around 1299 dk...
2,2,4261,1231210,5.0,"clean, crisp and full of nuance and elegance, ...",2011-08-11T08:06:39.000Z,3.0,1.0,images.vivino.com/labels/1312622458_6592.jpg,53601,17,en,0.982237,"clean , crisp and full of nuance and elegance ..."
3,3,4264,1178663,4.0,"crisp and clean champagne, a certain fave amon...",2011-08-11T09:28:07.000Z,25.0,2.0,images.vivino.com/labels/1312966707_6129.jpg,53601,22,en,0.576195,"crisp and clean champagne , a certain face amo..."
4,4,4354,17998,4.0,it is not a champagne type wine! it is a red w...,2011-08-12T16:53:19.000Z,0.0,1.0,images.vivino.com/labels/1311842730_8268.jpg,47873,12,en,0.736812,it is not a champagne type wine ! it is a red ...


In [14]:
df_sc.drop(columns=['note_x'], inplace=True)
df_sc.rename(columns={'note_y': 'note'}, inplace=True)
df_sc.head()

Unnamed: 0,index,id,wine_id,rating,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment,note
0,0,3975,1101258,4.0,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549,excellent full bodied wine
1,1,4202,76378,5.0,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en,0.576068,the real price at phillipson is around 1299 dk...
2,2,4261,1231210,5.0,2011-08-11T08:06:39.000Z,3.0,1.0,images.vivino.com/labels/1312622458_6592.jpg,53601,17,en,0.982237,"clean , crisp and full of nuance and elegance ..."
3,3,4264,1178663,4.0,2011-08-11T09:28:07.000Z,25.0,2.0,images.vivino.com/labels/1312966707_6129.jpg,53601,22,en,0.576195,"crisp and clean champagne , a certain face amo..."
4,4,4354,17998,4.0,2011-08-12T16:53:19.000Z,0.0,1.0,images.vivino.com/labels/1311842730_8268.jpg,47873,12,en,0.736812,it is not a champagne type wine ! it is a red ...


In [25]:
con_rev_sentiment_sc = sqlite3.connect('../database/review_en_sentiment_sc.db')
if 'index' in df_sc.columns:
    df_sc.drop(columns='index', inplace=True)
df_sc.drop_duplicates(subset='id', inplace=True)
df_sc.to_sql('review_en_sentiment_sc', con=con_rev_sentiment_sc, if_exists='replace')

3976746

### Sentiment analysis

In [None]:
con_rev = sqlite3.connect('../database/review_en.db')
df_rev = pd.read_sql_query("SELECT id, note from review_en", con_rev)
results = sentiment_analysis.process(df_rev)
df_results = pd.concat(results)
del df_rev
con_reviews = sqlite3.connect('../database/review_en.db')
df_reviews = pd.read_sql_query("SELECT * from review_en", con_reviews)
df_reviews.drop(columns=['index'], inplace=True)
df_final = pd.merge(df_reviews, df_results, on='id')

con_rev_en_sent = sqlite3.connect('../database/review_en_sentiment.db')
cur_rev_en_sent = con_rev_en_sent.cursor()
cur_rev_en_sent.execute('CREATE TABLE IF NOT EXISTS review_en_sentiment('
                        'id, wine_id, rating, note, created_at, likes_count, comments_count, scan_image_path, user_id, note_length, len_code, sentiment)')
df_final.to_sql('review_en_sentiment', con=con_rev_en_sent, if_exists='replace')

In [4]:
con_rev_en = sqlite3.connect('../database/review_en.db')
df_rev_en = pd.read_sql_query("SELECT note from review_en", con_rev_en)
df_rev_en.head()

Unnamed: 0,note
0,excellent full bodied wine
1,the real price at phillipson is around 1299 dk...
2,"clean, crisp and full of nuance and elegance, ..."
3,"crisp and clean champagne, a certain fave amon..."
4,it is not a champagne type wine! it is a red w...


### Emoji count REVIEW

In [3]:
emoji_list = list(emoji.EMOJI_DATA.keys())
emoji_list = '|'.join(emoji_list)
# *-symbol causes an error with regex -> remove
emoji_list = emoji_list.replace('*', '')
emoji_count = emoji_count.process(df_rev, emoji_list)

In [6]:
(emoji_count / len(df_rev)) * 100

4.602235255696431

### Emoji count REVIEW_EN

In [5]:
emoji_list = list(emoji.EMOJI_DATA.keys())
emoji_list = '|'.join(emoji_list)
# *-symbol causes an error with regex -> remove
emoji_list = emoji_list.replace('*', '')
emoji_count_en = emoji_count.process(
    df=df_rev_en,
    emoji_list=emoji_list
)
(emoji_count_en / len(df_rev_en)) * 100

4.18031568748694

### Create datasets containing Wine and User IDs with at least 5 occurrences

In [4]:
df_rev_en_sent = load_from_database(db='review_en_sentiment', table='review_en_sentiment', columns='user_id, wine_id')

In [5]:
var_min = 5
var_max = 1000
five_to_x_user = df_rev_en_sent[df_rev_en_sent['wine_id'].isin(df_rev_en_sent['wine_id'].value_counts()[
                                                     (df_rev_en_sent['wine_id'].value_counts() >= var_min)  &
                                                     (df_rev_en_sent['wine_id'].value_counts() <= var_max)
                                                 ].index)]['wine_id']
df_temp = df_rev_en_sent[df_rev_en_sent['wine_id'].isin(five_to_x_user)]
minimum_5_wines = df_temp[df_temp['wine_id'].isin(df_temp['wine_id'].value_counts()[df_temp['wine_id'].value_counts() >= var_min].index)]['wine_id']

In [6]:
five_to_x_user.drop_duplicates(inplace=True)
minimum_5_wines.drop_duplicates( inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  five_to_x_user.drop_duplicates(inplace=True)


In [6]:
save_to_database(db='five_to_100', table='user', df=five_to_x_user)
save_to_database(db='five_to_100', table='wine', df=minimum_5_wines)

DataFrame has been saved successfully to: five_to_100
DataFrame has been saved successfully to: five_to_100


### Training, Test & Validation Dataset

#### REVIEW_EN (no spelling correction)

In [11]:
df_rev_en_sent = load_from_database(db='review_en_sentiment', table='review_en_sentiment', columns='*')
df_rev_en_sent.head(n=2)

Unnamed: 0,index,id,wine_id,rating,note,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment
0,0,3975,1101258,4.0,excellent full bodied wine,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549
1,1,4202,76378,5.0,the real price at phillipson is around 1299 dk...,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en,0.576068


In [8]:
# five_to_100_user = load_from_database(db='five_to_100', table='user', columns='*')
# minimum_5_wines = load_from_database(db='five_to_100', table='wine', columns='*')

In [12]:
df_rev_en_sent = df_rev_en_sent[
    (df_rev_en_sent['wine_id'].isin(five_to_x_user)) &
    (df_rev_en_sent['wine_id'].isin(minimum_5_wines))
    ]

In [18]:
df_rev_en_sent

Unnamed: 0,index,id,wine_id,rating,note,likes_count,comments_count,user_id,note_length,len_code,sentiment
0,0,3975,1101258,4.0,excellent full bodied wine,2.0,1.0,32547,4,en,0.938549
1,1,4202,76378,5.0,the real price at phillipson is around 1299 dk...,0.0,1.0,48500,39,en,0.576068
2,2,4261,1231210,5.0,"clean, crisp and full of nuance and elegance, ...",3.0,1.0,53601,17,en,0.982237
4,4,4354,17998,4.0,it is not a champagne type wine! it is a red w...,0.0,1.0,47873,12,en,0.736812
6,6,4393,1213,5.0,"nothing to do with wine, nothing to do with vi...",3.0,1.0,53601,43,en,0.771274
...,...,...,...,...,...,...,...,...,...,...,...
3976844,3976844,257812082,1523914,4.4,"nez rond, gourmand",0.0,0.0,4787537,3,en,0.790421
3976847,3976847,257819303,1665120,3.8,"i did not aerate the wine, but allowed the aro...",0.0,0.0,32988909,81,en,0.822034
3976848,3976848,257820106,1841279,3.0,on the sweeter side for a chenin blanc with no...,0.0,0.0,36708473,14,en,0.843707
3976849,3976849,257821363,2374586,4.4,"grand cru. well made. bright, delightful, wi...",2.0,1.0,1967368,23,en,0.977111


In [14]:
df_rev_en_sent.drop(columns=['created_at', 'scan_image_path'], inplace=True)

In [15]:
df_rev_en_sent.head()

Unnamed: 0,index,id,wine_id,rating,note,likes_count,comments_count,user_id,note_length,len_code,sentiment
0,0,3975,1101258,4.0,excellent full bodied wine,2.0,1.0,32547,4,en,0.938549
1,1,4202,76378,5.0,the real price at phillipson is around 1299 dk...,0.0,1.0,48500,39,en,0.576068
2,2,4261,1231210,5.0,"clean, crisp and full of nuance and elegance, ...",3.0,1.0,53601,17,en,0.982237
4,4,4354,17998,4.0,it is not a champagne type wine! it is a red w...,0.0,1.0,47873,12,en,0.736812
6,6,4393,1213,5.0,"nothing to do with wine, nothing to do with vi...",3.0,1.0,53601,43,en,0.771274


In [16]:
test, train = train_test_split(df_rev_en_sent, test_size=0.8, random_state=26)

In [17]:
save_to_database(db='review_en_train', table='review_en_train', df=train)
save_to_database(db='review_en_test', table='review_en_test', df=test)

DataFrame has been saved successfully to: review_en_train
DataFrame has been saved successfully to: review_en_test


#### REVIEW_EN_SC (with spelling correction)

In [11]:
df_rev_en_sent_sc = load_from_database(db='review_en_sentiment_sc', table='review_en_sentiment_sc', columns='*')
df_rev_en_sent_sc.head(n=2)

Unnamed: 0,index,id,wine_id,rating,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment,note
0,0,3975,1101258,4.0,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549,excellent full bodied wine
1,1,4202,76378,5.0,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en,0.576068,the real price at phillipson is around 1299 dk...


In [12]:
df_rev_en_sent_sc = df_rev_en_sent_sc[df_rev_en_sent_sc['wine_id'].isin(five_to_x_user)]

In [13]:
df_rev_en_sent_sc

Unnamed: 0,index,id,wine_id,rating,created_at,likes_count,comments_count,scan_image_path,user_id,note_length,len_code,sentiment,note
0,0,3975,1101258,4.0,2011-07-19T21:21:22.000Z,2.0,1.0,images.vivino.com/labels/1309631550_2382.jpg,32547,4,en,0.938549,excellent full bodied wine
1,1,4202,76378,5.0,2011-08-06T20:52:53.000Z,0.0,1.0,images.vivino.com/labels/1311959128_1819.jpg,48500,39,en,0.576068,the real price at phillipson is around 1299 dk...
2,2,4261,1231210,5.0,2011-08-11T08:06:39.000Z,3.0,1.0,images.vivino.com/labels/1312622458_6592.jpg,53601,17,en,0.982237,"clean , crisp and full of nuance and elegance ..."
4,4,4354,17998,4.0,2011-08-12T16:53:19.000Z,0.0,1.0,images.vivino.com/labels/1311842730_8268.jpg,47873,12,en,0.736812,it is not a champagne type wine ! it is a red ...
6,6,4393,1213,5.0,2011-08-13T17:51:31.000Z,3.0,1.0,images.vivino.com/labels/1312976199_3560.jpg,53601,43,en,0.771274,"nothing to do with wine , nothing to do with v..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3976739,3977264,257812082,1523914,4.4,2022-10-18T19:25:15.000Z,0.0,0.0,images.vivino.com/labels/J6_8eKx1TdqLh_wVjSiLV...,4787537,3,en,0.790421,"next round , gourmand"
3976742,3977267,257819303,1665120,3.8,2022-10-18T21:00:39.000Z,0.0,0.0,images.vivino.com/labels/mtFOflgRROaBV0c_g7db2...,32988909,81,en,0.822034,"I did not aerate the wine , but allowed the ar..."
3976743,3977268,257820106,1841279,3.0,2022-10-18T21:16:24.000Z,0.0,0.0,images.vivino.com/thumbs/SMjVOSwnQP-UdzNW5ARg2...,36708473,14,en,0.843707,on the sweeter side of a chenin blanc with not...
3976744,3977269,257821363,2374586,4.4,2022-10-18T21:43:26.000Z,2.0,1.0,images.vivino.com/labels/jr6Ktb2gStGMQjxhrsf2c...,1967368,23,en,0.977111,"grand cru . well made . bright , delightful , ..."


In [14]:
df_rev_en_sent_sc.drop(columns=['scan_image_path', 'created_at'], inplace=True)

In [15]:
df_rev_en_sent_sc['wine_id'].value_counts()

3117515     1000
1174700      999
4388933      998
82690        998
1128250      997
            ... 
6878339        5
3631406        5
9824165        5
7166672        5
10831702       5
Name: wine_id, Length: 10177, dtype: int64

### Cold start data set

In [8]:
df_rev_en_sent = load_from_database(db='review_en_sentiment', table='review_en_sentiment', columns='*')
test, train = train_test_split(df_rev_en_sent, test_size=0.8, random_state=26)
del df_rev_en_sent

In [9]:
_min = 1
_max = 5
five_to_x = train[train['user_id'].isin(train['user_id'].value_counts()[
                                            (train['user_id'].value_counts() >= _min) &
                                            (train['user_id'].value_counts() <= _max)
                                            ].index)]['user_id']
five_to_x.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  five_to_x.drop_duplicates(inplace=True)


In [10]:
train = train[train['user_id'].isin(five_to_x)]

In [12]:
train.drop(columns=['created_at', 'scan_image_path', 'index'], inplace=True)
print(len(train))
train.head()

In [15]:
save_to_database(db='review_en_train_cold', table='review_en_train_cold', df=test)

DataFrame has been saved successfully to: review_en_train_cold


### Word count

In [3]:
df_rev_en_train = load_from_database(db='review_en_train', table='review_en_train', columns='*')
df_rev_en_test = load_from_database(db='review_en_test', table='review_en_test', columns='*')

In [27]:
high_word_train = df_rev_en_train['note_length'].quantile(q=0.6)
high_word_test = df_rev_en_test['note_length'].quantile(q=0.6)

In [28]:
df_rev_en_train_high_word = df_rev_en_train[df_rev_en_train['note_length'] >= high_word_train]
df_rev_en_test_high_word = df_rev_en_test[df_rev_en_test['note_length'] >= high_word_test]

In [29]:
len(list(set(df_rev_en_test_high_word['user_id']) & set(df_rev_en_train_high_word['user_id'])))

39964

In [30]:
save_to_database(db='review_en_train_high_word', table='review_en_train_high_word', df=df_rev_en_train_high_word)
save_to_database(db='review_en_test_high_word', table='review_en_test_high_word', df=df_rev_en_test_high_word)

DataFrame has been saved successfully to: review_en_train_high_word
DataFrame has been saved successfully to: review_en_test_high_word


In [31]:
low_word_train = df_rev_en_train['note_length'].quantile(q=0.4)
low_word_test = df_rev_en_test['note_length'].quantile(q=0.4)

In [34]:
df_rev_en_train_low_word = df_rev_en_train[df_rev_en_train['note_length'] <= low_word_train]
df_rev_en_test_low_word = df_rev_en_test[df_rev_en_test['note_length'] <= low_word_test]

In [35]:
len(list(set(df_rev_en_train_low_word['user_id']) & set(df_rev_en_test_low_word['user_id'])))

52232

In [36]:
save_to_database(db='review_en_train_low_word', table='review_en_train_low_word', df=df_rev_en_train_low_word)
save_to_database(db='review_en_test_low_word', table='review_en_test_low_word', df=df_rev_en_test_low_word)

DataFrame has been saved successfully to: review_en_train_low_word
DataFrame has been saved successfully to: review_en_test_low_word
