In [1]:
import pandas as pd

#import tweepy

import config

from bots.twitter import TwitterBot
from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df, save_to_text

2023-03-30 14:58:55,394 - INFO     | config     | Loading environment variables
2023-03-30 14:58:55,398 - INFO     | config     | Directory raw_data already exists
2023-03-30 14:58:55,404 - INFO     | config     | Directory data already exists


# Extracting Tweets

In [47]:
# initialize twitter bot
twitter_bot = TwitterBot(bearer=config.TWTR_BEARER_TOKEN,
                         api=config.TWTR_API,
                         api_secret=config.TWTR_API_SECRET,
                         access=config.TWTR_ACCESS_TOKEN,
                         access_secret=config.TWTR_ACCESS_TOKEN_SECRET)

In [48]:
apparel = [
    'context:67.839543390668673024 -is:retweet lang:en',    # context: apparel
    'context:131.1248316002317643776 -is:retweet lang:en',  # context: athletic apparel
    'context:65.1256236649253449729 -is:retweet lang:en',   # context: fashion and beauty
    'context:131.1095391406816784384 -is:retweet lang:en',  # context: shopping
    'context:67.1486758812849635329 -is:retweet lang:en',   # context: retail sales
    'context:131.1407812892909473795 -is:retweet lang:en',  # context: ecommerce
    'context:165.1407812892909473795 -is:retweet lang:en',  # context: ecommerce
]

In [49]:
cars = [
    'context:46.781974597251964928 -is:retweet lang:en',    # context: car rental/sharing
    'context:45.781972125171060736 -is:retweet lang:en',    # context: auto
    'context:46.781972125179518977 -is:retweet lang:en',    # context: auto manufacturer
    'context:45.1196845866138533888 -is:retweet lang:en',   # context: automobile brands
    'context:131.1196845866138533888 -is:retweet lang:en',  # context: automobile brands
    'context:66.1527619563302420480 -is:retweet lang:en',   # context: cars
    'context:66.847528646185070592 -is:retweet lang:en',    # context: luxury cars
    'context:66.1177213956856799232 -is:retweet lang:en',   # context: used cars
]

In [50]:
beauty = [
    'context:65.850395585941086209 -is:retweet lang:en',    # context: beauty
    'context:131.1352035407127429120 -is:retweet lang:en',  # context: beauty industry
    'context:46.781974597222604800 -is:retweet lang:en',    # context: beauty/hygiene
    'context:131.1357424577974992897 -is:retweet lang:en',  # context: makeup collaborations
    'context:131.1354133238709542912 -is:retweet lang:en',  # context: new makeup
    'context:67.1400874108556955657 -is:retweet lang:en',   # context: eye makeup
    'context:66.854761964874088448 -is:retweet lang:en',    # context: hair care
    'context:66.855107826351357952 -is:retweet lang:en',    # context: skin care
]

In [71]:
dfs = []
for query in beauty:
    tweets = twitter_bot.get_recent_tweets(query=query, limit=5000)
    df = pd.DataFrame.from_dict(tweets)
    dfs.append(df)

In [72]:
tweets = pd.concat(dfs, ignore_index=True)
tweets = tweets.loc[~tweets['id'].duplicated()].copy()

In [73]:
print(len(tweets))

27383


In [74]:
save_to_parquet(data_dir=config.RAW_DATA_DIR, df=tweets, name='tweets_beauty_27k')

# Dataset Creation

In [75]:
import glob

files = glob.glob(f'{config.RAW_DATA_DIR}/*_2023328.parquet')
files

['raw_data/tweets_cars_20k_2023328.parquet',
 'raw_data/tweets_beauty_27k_2023328.parquet',
 'raw_data/tweets_apparel_27k_2023328.parquet']

In [76]:
# combining dataframes
dfs_list = []
for file in files:
    df = pd.read_parquet(file)
    dfs_list.append(df)
dfs = pd.concat(dfs_list, ignore_index=True)

In [77]:
print(f'Total Tweets before duplicates: {len(dfs)}')
dfs = dfs.loc[~dfs['id'].duplicated()].copy()
print(f'Total Tweets after duplicates: {len(dfs)}')

Total Tweets before duplicates: 75578
Total Tweets after duplicates: 74653


In [78]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74653 entries, 0 to 75577
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   id          74653 non-null  int64              
 1   created_at  74653 non-null  datetime64[ns, UTC]
 2   text        74653 non-null  object             
 3   entities    74653 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 2.8+ MB


In [79]:
# converting dates
dfs['created_at'] = pd.to_datetime(dfs['created_at'].dt.strftime('%Y-%m-%d'))

In [80]:
# cleaning text
dfs['text'] = dfs['text'].map(lambda x: clean_text(x))

In [81]:
tweets_df = clean_tweets_df(df=dfs, text_col='text', n=2)
save_to_parquet(data_dir=config.DATA_DIR, df=tweets_df, name='all_tweets')
print(f'Total tweets after removing short Tweets: {len(tweets_df)}')

Total tweets after removing short Tweets: 73847


In [82]:
tweets_df.head()

Unnamed: 0,id,created_at,entities,full_text
0,1640732683218087938,2023-03-28,"[Lyft, Travel & Transportation Business, Car R...",\nTWEET: don’t care about the drivers
1,1640732633880510466,2023-03-28,"[Lyft, Travel & Transportation Business, Car R...","\nTWEET: lyft founders to step down, former am..."
2,1640732621314326529,2023-03-28,"[Travel & Transportation Business, Car Rental/...",\nTWEET: i lost money in the uber and i even t...
3,1640732585729970179,2023-03-28,"[Technology, Travel & Transportation Business,...",\nTWEET: america walks is hosting a webinar on...
4,1640732551831535616,2023-03-28,"[Travel & Transportation Business, Car Rental/...","\nTWEET: well, at least this fucker found a pl..."


### Shuffling and Slicing

In [66]:
tweets_df

Unnamed: 0,id,created_at,entities,full_text
0,1640367128854564864,2023-03-27,"[Entertainment & Leisure Business, B2B, Baseba...",\nTWEET: hahdhdh thats why he is practicing i ...
1,1640367079076376579,2023-03-27,"[B2B, Services - B2B, Technology Business, Pro...",\nTWEET: that's right - we're using #ai to hel...
2,1640366751463374853,2023-03-27,"[Hasbro, Products - B2B, B2B]",\nTWEET: indy goes trick or treating as two fa...
3,1640366569673850881,2023-03-27,"[B2B, Services - B2B, Technology Business, Dat...",\nTWEET: forget the diy headaches when you cho...
4,1640366205818003462,2023-03-27,"[Samsung , NVIDIA, B2B, AMD, Samsung USA, Sams...",\nTWEET: the global semiconductor supply chain...
...,...,...,...,...
78351,1638450493650350081,2023-03-22,"[BMW, Automotive, Aircraft & Boat Business, Us...",\nTWEET: i seem to be coming across lots of ba...
78352,1638450452625801217,2023-03-22,"[BMW, BMW - X1, Automotive, Aircraft & Boat Bu...",\nTWEET: neil james is looking very happy with...
78353,1638450366898491392,2023-03-22,"[Travel & Transportation Business, Electric ve...",\nTWEET: nobody would buy a second hand ev the...
78354,1638450260556255232,2023-03-22,"[Automotive, Used Cars, Used Vehicle, Automoti...",\nTWEET: 🤡🤡🤡🤡for that gift her dad has to go ...


In [23]:
save_to_text(df=df_clean, col='full_text', out_dir=config.DATA_DIR, name='tweets_beauty_27k_2023327')

2023-03-27 17:58:38,408 - INFO     | helpers.data_helpers | Saved corpus to .txt file: tweets_beauty_27k_2023327.txt
