In [1]:
import pandas as pd

import tweepy

import config

from bots.twitter import TwitterBot
from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df, save_to_text

2023-03-27 17:41:13,585 - INFO     | config     | Loading environment variables
2023-03-27 17:41:13,587 - INFO     | config     | Directory raw_data already exists
2023-03-27 17:41:13,587 - INFO     | config     | Directory data already exists


# Extracting Tweets

In [2]:
# Create the project's Twitter client, handing it every credential that the
# `config` module exposes (bearer token, API key pair, access token pair).
twitter_bot = TwitterBot(
    bearer=config.TWTR_BEARER_TOKEN,
    api=config.TWTR_API,
    api_secret=config.TWTR_API_SECRET,
    access=config.TWTR_ACCESS_TOKEN,
    access_secret=config.TWTR_ACCESS_TOKEN_SECRET,
)

In [3]:
# Search queries for the apparel / shopping topic. Every query filters on one
# `context:` id and carries the same `-is:retweet lang:en` suffix; the trailing
# comment on each id is the label recorded for that context.
apparel = [
    f'context:{context_id} -is:retweet lang:en'
    for context_id in (
        '46.783337567798169600',    # products
        '67.839543390668673024',    # apparel
        '131.1248316002317643776',  # athletic apparel
        '65.1256236649253449729',   # fashion and beauty
        '131.1095391406816784384',  # shopping
        '67.1486758812849635329',   # retail sales
        '131.1407812892909473795',  # ecommerce
        '165.1407812892909473795',  # ecommerce
    )
]

In [26]:
# Search queries for the cars topic. Every query filters on one `context:` id
# and carries the same `-is:retweet lang:en` suffix; the trailing comment on
# each id is the label recorded for that context.
cars = [
    f'context:{context_id} -is:retweet lang:en'
    for context_id in (
        '46.781974597251964928',    # car rental/sharing
        '45.781972125171060736',    # auto
        '46.781972125179518977',    # auto manufacturer
        '45.1196845866138533888',   # automobile brands
        '131.1196845866138533888',  # automobile brands
        '66.1527619563302420480',   # cars
        '66.847528646185070592',    # luxury cars
        '66.1177213956856799232',   # used cars
    )
]

In [12]:
# Search queries for the beauty topic. Every query filters on one `context:` id
# and carries the same `-is:retweet lang:en` suffix; the trailing comment on
# each id is the label recorded for that context.
beauty = [
    f'context:{context_id} -is:retweet lang:en'
    for context_id in (
        '65.850395585941086209',    # beauty
        '131.1352035407127429120',  # beauty industry
        '46.781974597222604800',    # beauty/hygiene
        '131.1357424577974992897',  # makeup collaborations
        '131.1354133238709542912',  # new makeup
        '67.1400874108556955657',   # eye makeup
        '66.854761964874088448',    # hair care
        '66.855107826351357952',    # skin care
    )
]

In [None]:
# Fetch recent tweets for every query of one topic and collect one DataFrame
# per query in `dfs` (each call is made with limit=5000).
#
# The topic iterated here is `beauty`: the cells that follow save the result as
# 'tweets_beauty_27k' and reload/clean that same file, so looping over another
# list (this cell previously used `cars`) would store that topic's tweets under
# the beauty file name on a fresh Restart & Run All. To collect `apparel` or
# `cars` instead, change the list here AND the file names used further down.
dfs = []
for query in beauty:
    # Loop-local name kept distinct from `tweets`, which a later cell uses for
    # the combined DataFrame.
    query_tweets = twitter_bot.get_recent_tweets(query=query, limit=5000)
    query_df = pd.DataFrame.from_dict(query_tweets)
    print(len(query_df))  # rows fetched for this query
    dfs.append(query_df)

5000
5000
5000
559
5000
986
5000


In [None]:
# Stack the per-query frames into one frame with a fresh 0..n-1 index, then
# keep only the first row seen for each tweet `id` (presumably the same tweet
# can be returned by more than one query).
tweets = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
)

In [None]:
# Display the combined, de-duplicated tweets frame for a quick visual check.
tweets

In [16]:
# Persist the raw tweets as parquet under the raw-data directory.
# NOTE(review): the reload cell reads 'tweets_beauty_27k_2023327.parquet', so
# the helper presumably appends a date stamp to `name` — confirm in
# helpers.data_helpers.
save_to_parquet(
    df=tweets,
    name='tweets_beauty_27k',
    data_dir=config.RAW_DATA_DIR,
)

# Cleaning Tweets

In [17]:
# Reload the raw beauty tweets from the raw-data directory.
# NOTE(review): the date-stamped file name is hard-coded; a fresh run on a
# different day will presumably write a differently named file and this read
# would then miss it — confirm against save_to_parquet's naming.
raw_tweets_path = f'{config.RAW_DATA_DIR}/tweets_beauty_27k_2023327.parquet'
tweets_df = pd.read_parquet(raw_tweets_path)

In [18]:
# Reduce every timestamp to its calendar day: render it as 'YYYY-MM-DD' text,
# then parse that text back into datetimes (time-of-day is dropped).
day_strings = tweets_df['created_at'].dt.strftime('%Y-%m-%d')
tweets_df['created_at'] = pd.to_datetime(day_strings)

In [19]:
# Print the frame summary (row count, columns, non-null counts, dtypes) to
# verify the reload and the `created_at` conversion.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27252 entries, 0 to 27251
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          27252 non-null  int64         
 1   created_at  27252 non-null  datetime64[ns]
 2   text        27252 non-null  object        
 3   entities    27252 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 851.8+ KB


In [20]:
# Run each tweet's text through the project's `clean_text` helper and write the
# result back to the same column.
# NOTE(review): this overwrites `text` in place, so re-running the cell cleans
# already-cleaned text — harmless only if clean_text is idempotent; confirm.
tweets_df['text'] = tweets_df['text'].map(clean_text)

In [21]:
# Build the cleaned frame from the `text` column via the project helper.
# NOTE(review): the meaning of `n=2` is not visible in this notebook — confirm
# in helpers.data_helpers.clean_tweets_df.
df_clean = clean_tweets_df(
    df=tweets_df,
    n=2,
    text_col='text',
)

In [22]:
# Display the cleaned frame to inspect the resulting columns and sample rows.
df_clean

Unnamed: 0,id,created_at,entities,full_text
0,1640381611551105024,2023-03-27,"[Beauty, Beauty, Cosmetic & Personal Care Busi...",\nTWEET: oh wait
1,1640381610603212800,2023-03-27,"[Beauty, Beauty, Cosmetic & Personal Care Busi...",\nTWEET: i'm gonna start tweeting more. be war...
2,1640381609735241732,2023-03-27,"[Beauty, Cosmetic & Personal Care Business, Be...",\nTWEET: But has she fixed her attitude? 😂👉🏽🧠🖕🏽
3,1640381601325432833,2023-03-27,"[Beauty, Cosmetic & Personal Care Business, Be...",\nTWEET: love to see it🫶🏾
4,1640381601187086338,2023-03-27,"[Pop, Music, Selena Gomez, Beauty, Cosmetic & ...",\nTWEET: I still can't believe i will finally ...
...,...,...,...,...
26880,1640074912697139200,2023-03-26,"[Fashion & beauty, Skin care, Beauty, Beauty, ...","\nTWEET: The internet was right, this really i..."
26881,1640074601744027651,2023-03-26,"[Sunscreen, Skin care, Beauty, Cosmetic & Pers...",\nTWEET: i use retinol serum and lotion as we...
26882,1640074601500753921,2023-03-26,"[Organic beauty, Wellness and health, Skin car...",\nTWEET: NewLife Naturals USDA Certified Organ...
26883,1640074600162754561,2023-03-26,"[Sunscreen, Skin care, Beauty, Cosmetic & Pers...","\nTWEET: I switched up my skincare regimen, I’..."


In [23]:
# Write the `full_text` column of the cleaned frame out as a text corpus in the
# processed-data directory.
save_to_text(
    df=df_clean,
    col='full_text',
    name='tweets_beauty_27k_2023327',
    out_dir=config.DATA_DIR,
)

2023-03-27 17:58:38,408 - INFO     | helpers.data_helpers | Saved corpus to .txt file: tweets_beauty_27k_2023327.txt
