In [1]:
import time
import pandas as pd

import config

from bots.twitter import TwitterBot
from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df, save_to_text

2023-04-02 15:39:01,114 - INFO     | config     | Loading environment variables


# Extracting Tweets

In [2]:
# initialize twitter bot
twitter_bot = TwitterBot(bearer=config.TWTR_BEARER_TOKEN,
                         api=config.TWTR_API,
                         api_secret=config.TWTR_API_SECRET,
                         access=config.TWTR_ACCESS_TOKEN,
                         access_secret=config.TWTR_ACCESS_TOKEN_SECRET)

In [3]:
apparel = [
    'context:67.839543390668673024 -is:retweet lang:en',    # context: apparel
    'context:131.1248316002317643776 -is:retweet lang:en',  # context: athletic apparel
    'context:65.1256236649253449729 -is:retweet lang:en',   # context: fashion and beauty
    'context:131.1095391406816784384 -is:retweet lang:en',  # context: shopping
    'context:67.1486758812849635329 -is:retweet lang:en',   # context: retail sales
    'context:131.1407812892909473795 -is:retweet lang:en',  # context: ecommerce
    'context:165.1407812892909473795 -is:retweet lang:en',  # context: ecommerce
]

In [4]:
cars = [
    'context:46.781974597251964928 -is:retweet lang:en',    # context: car rental/sharing
    'context:45.781972125171060736 -is:retweet lang:en',    # context: auto
    'context:46.781972125179518977 -is:retweet lang:en',    # context: auto manufacturer
    'context:45.1196845866138533888 -is:retweet lang:en',   # context: automobile brands
    'context:131.1196845866138533888 -is:retweet lang:en',  # context: automobile brands
    'context:66.1527619563302420480 -is:retweet lang:en',   # context: cars
    'context:66.847528646185070592 -is:retweet lang:en',    # context: luxury cars
    'context:66.1177213956856799232 -is:retweet lang:en',   # context: used cars
]

In [5]:
beauty = [
    'context:65.850395585941086209 -is:retweet lang:en',    # context: beauty
    'context:131.1352035407127429120 -is:retweet lang:en',  # context: beauty industry
    'context:46.781974597222604800 -is:retweet lang:en',    # context: beauty/hygiene
    'context:131.1357424577974992897 -is:retweet lang:en',  # context: makeup collaborations
    'context:131.1354133238709542912 -is:retweet lang:en',  # context: new makeup
    'context:67.1400874108556955657 -is:retweet lang:en',   # context: eye makeup
    'context:66.854761964874088448 -is:retweet lang:en',    # context: hair care
    'context:66.855107826351357952 -is:retweet lang:en',    # context: skin care
]

In [17]:
dfs = []
for query in beauty:
    tweets = twitter_bot.get_recent_tweets(query=query, limit=5000)
    df = pd.DataFrame.from_dict(tweets)
    dfs.append(df)
    time.sleep(60)

In [18]:
tweets = pd.concat(dfs, ignore_index=True)
tweets = tweets.loc[~tweets['id'].duplicated()].copy()

In [19]:
print(len(tweets))

25697


In [20]:
save_to_parquet(data_dir=config.RAW_DATA_DIR, df=tweets, name='tweets_beauty_25k')

2023-04-02 17:50:14,494 - INFO     | helpers.data_helpers | tweets_beauty_25k.parquet saved.


# Dataset Creation

In [21]:
import glob

files = glob.glob(f'{config.RAW_DATA_DIR}/*_202342.parquet')
files

['raw_data/tweets_apparel_28k_202342.parquet',
 'raw_data/tweets_cars_19k_202342.parquet',
 'raw_data/tweets_beauty_25k_202342.parquet']

In [22]:
# combining dataframes
dfs_list = []
for file in files:
    df = pd.read_parquet(file)
    dfs_list.append(df)
dfs = pd.concat(dfs_list, ignore_index=True)

In [23]:
print(f'Total Tweets before duplicates: {len(dfs)}')
dfs = dfs.loc[~dfs['id'].duplicated()].copy()
print(f'Total Tweets after duplicates: {len(dfs)}')

Total Tweets before duplicates: 73637
Total Tweets after duplicates: 72949


In [24]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72949 entries, 0 to 73636
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   id          72949 non-null  int64              
 1   created_at  72949 non-null  datetime64[ns, UTC]
 2   text        72949 non-null  object             
 3   entities    72949 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 2.8+ MB


In [25]:
# converting dates
dfs['created_at'] = pd.to_datetime(dfs['created_at'].dt.strftime('%Y-%m-%d'))

In [26]:
# cleaning text
dfs['text'] = dfs['text'].map(lambda x: clean_text(x))

In [27]:
tweets_df = clean_tweets_df(df=dfs, text_col='text', n=2)
save_to_parquet(data_dir=config.DATA_DIR, df=tweets_df, name='all_tweets')
print(f'Total tweets after removing short Tweets: {len(tweets_df)}')

2023-04-02 17:50:52,810 - INFO     | helpers.data_helpers | all_tweets.parquet saved.


Total tweets after removing short Tweets: 72370


In [28]:
tweets_df.head()

Unnamed: 0,id,created_at,entities,full_text
0,1642531363663347712,2023-04-02,"[Retail industry, Everyday style, H&M , Fashio...",\nTWEET: takes individuals such feelings ounas...
1,1642531363285868547,2023-04-02,"[Retail industry, Everyday style, H&M , Fashio...","\nTWEET: taken up the rubicon, and can be insp..."
2,1642531359712264193,2023-04-02,"[Retail industry, Everyday style, H&M , Fashio...",\nTWEET: resentment like some even referred to...
3,1642531359267725313,2023-04-02,"[Fashion accessories, Fashion, General Fashion...",\nTWEET: save 65.0% on select products from st...
4,1642531358617509890,2023-04-02,"[Retail industry, Everyday style, H&M , Fashio...","\nTWEET: school, not would either be jerked of..."


### Shuffling and Slicing

In [38]:
tweets_df = pd.read_parquet(f'{config.DATA_DIR}/all_tweets_2023327.parquet')

In [39]:
tweets_df = tweets_df.sample(frac=1).reset_index(drop=True)

In [40]:
idx = int(len(tweets_df) * 0.7)
idx

54849

In [35]:
input_tweets = tweets_df[:idx]
eval_tweets = tweets_df[idx:]

In [37]:
save_to_text(df=input_tweets, col='full_text', out_dir=config.DATA_DIR, name='input_tweets_202342')
save_to_parquet(data_dir=config.DATA_DIR, df=eval_tweets, name='eval_tweets')

2023-04-02 17:51:50,215 - INFO     | helpers.data_helpers | Saved corpus to .txt file: input_tweets_202342.txt
2023-04-02 17:51:50,261 - INFO     | helpers.data_helpers | eval_tweets.parquet saved.
