In [1]:
import pandas as pd

import tweepy

import config

from bots.twitter import TwitterBot
from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df, save_to_text

2023-03-27 16:42:54,095 - INFO     | config     | Loading environment variables
2023-03-27 16:42:54,096 - INFO     | config     | Directory raw_data already exists
2023-03-27 16:42:54,096 - INFO     | config     | Directory data already exists


# Extracting Tweets

In [2]:
# Gather every Twitter credential from the project config in one place,
# then hand them to the bot as keyword arguments.
twitter_credentials = {
    'bearer': config.TWTR_BEARER_TOKEN,
    'api': config.TWTR_API,
    'api_secret': config.TWTR_API_SECRET,
    'access': config.TWTR_ACCESS_TOKEN,
    'access_secret': config.TWTR_ACCESS_TOKEN_SECRET,
}
twitter_bot = TwitterBot(**twitter_credentials)

In [3]:
# Filters appended to every search: the `-is:retweet` and `lang:en` operators.
_QUERY_SUFFIX = '-is:retweet lang:en'

# Context ids to search for, one per topic (label on the right).
_CONTEXT_IDS = (
    '46.783337567798169600',     # products
    '67.839543390668673024',     # apparel
    '131.1248316002317643776',   # athletic apparel
    '65.1256236649253449729',    # fashion and beauty
    '131.1095391406816784384',   # shopping
    '67.1486758812849635329',    # retail sales
    '131.1407812892909473795',   # ecommerce
    '165.1407812892909473795',   # ecommerce (same trailing id as the line above, different prefix)
)

# One search string per context id, in the order listed above.
queries = [f'context:{context_id} {_QUERY_SUFFIX}' for context_id in _CONTEXT_IDS]

In [4]:
# Build one DataFrame per search query; they are combined in a later cell.
dfs = []
for search_query in queries:
    # Ask the bot for at most 2000 recent tweets matching this query.
    batch = twitter_bot.get_recent_tweets(query=search_query, limit=2000)
    dfs.append(pd.DataFrame.from_dict(batch))

In [5]:
# Stack the per-query frames, then keep only the first row seen for each
# tweet id (the same tweet can match more than one query).
tweets = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
)

In [6]:
# Display the combined, de-duplicated tweets (columns: id, created_at, text, entities).
tweets

Unnamed: 0,id,created_at,text,entities
0,1640363807070380032,2023-03-27 14:43:02+00:00,Hot take... #Cloud is a network. ☁️\n\nWatch t...,"{Juniper Networks, Products - B2B, B2B, Servic..."
1,1640363780550053898,2023-03-27 14:42:56+00:00,"@its_syncx @AMD Yikes, that gpu is a REALLY ro...","{Products - B2B, B2B, AMD}"
2,1640363753253535746,2023-03-27 14:42:49+00:00,AMD just wipes the floor with Intel CPUs. Why ...,"{Products - B2B, B2B, AMD}"
3,1640363645375770628,2023-03-27 14:42:24+00:00,@cfunk1 @RPG_Piseog @MikeyTheKid @newbiedm You...,"{Hasbro, Products - B2B, B2B}"
4,1640363437741031426,2023-03-27 14:41:34+00:00,Looking for your next opportunity? @JuniperNet...,"{Careers, B2B, Services - B2B, Products - B2B,..."
...,...,...,...,...
15243,1640359539374706689,2023-03-27 14:26:05+00:00,1 1/2 TCW F SI1 Round Cut Earth Mined Certifie...,"{Online - Retail, eCommerce industry, Industri..."
15244,1640359539290636291,2023-03-27 14:26:05+00:00,Moss Green Linen Curtain 2 Panel Green Linen N...,"{Online - Retail, eCommerce industry, Industri..."
15245,1640359539248873478,2023-03-27 14:26:05+00:00,J170-1 Meissen Franzen Collaboration Square Pl...,"{Online - Retail, eCommerce industry, Industri..."
15246,1640359539123052549,2023-03-27 14:26:05+00:00,Mystery Booster Retail Edition 2020 Draft Boos...,"{Online - Retail, eCommerce industry, Gaming B..."


In [7]:
# Persist the de-duplicated tweets under the raw-data directory.
# NOTE(review): the helper appears to add a suffix to `name` - the next
# section reads back 'tweets_2023327.parquet'; confirm the naming scheme.
save_to_parquet(
    df=tweets,
    data_dir=config.RAW_DATA_DIR,
    name='tweets',
)

# Cleaning Tweets

In [8]:
# Location of the raw tweets snapshot inside the raw-data directory.
# NOTE(review): the `2023327` suffix is hard-coded, so this cell only works
# while that exact file exists - update it when re-running the extraction.
raw_tweets_path = f'{config.RAW_DATA_DIR}/tweets_2023327.parquet'

# Load the snapshot back from disk so cleaning starts from the saved data.
tweets_df = pd.read_parquet(raw_tweets_path)

In [9]:
# Reduce each timestamp to its calendar day: format as 'YYYY-MM-DD' text,
# then parse that text back into datetimes (time-of-day is dropped).
created_day_text = tweets_df['created_at'].dt.strftime('%Y-%m-%d')
tweets_df['created_at'] = pd.to_datetime(created_day_text)

In [10]:
# Print column dtypes and non-null counts to confirm the frame loaded as expected.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13741 entries, 0 to 13740
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          13741 non-null  int64         
 1   created_at  13741 non-null  datetime64[ns]
 2   text        13741 non-null  object        
 3   entities    13741 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 429.5+ KB


In [11]:
# Run the project's clean_text helper over every tweet body.
# The function is passed to `map` directly; wrapping it in a lambda that only
# forwards its argument adds a call layer without changing the result.
tweets_df['text'] = tweets_df['text'].map(clean_text)

In [12]:
# Produce the cleaned frame from the text-normalised tweets.
# NOTE(review): the meaning of `n=2` is defined in helpers.data_helpers and is
# not visible in this notebook - confirm and document it here.
df_clean = clean_tweets_df(
    df=tweets_df,
    text_col='text',
    n=2,
)

In [13]:
# Display the cleaned frame (columns: id, created_at, entities, full_text).
df_clean

Unnamed: 0,id,created_at,entities,full_text
0,1640363807070380032,2023-03-27,"[Juniper Networks, Products - B2B, B2B, Servic...",\nTWEET: Hot take... #Cloud is a network. ☁️ W...
1,1640363780550053898,2023-03-27,"[Products - B2B, B2B, AMD]","\nTWEET: Yikes, that gpu is a REALLY rough cho..."
2,1640363753253535746,2023-03-27,"[Products - B2B, B2B, AMD]",\nTWEET: AMD just wipes the floor with Intel C...
3,1640363645375770628,2023-03-27,"[Hasbro, Products - B2B, B2B]",\nTWEET: You have a lot to be sad about but no...
4,1640363437741031426,2023-03-27,"[Careers, B2B, Services - B2B, Products - B2B,...",\nTWEET: Looking for your next opportunity?is ...
...,...,...,...,...
13610,1640359539374706689,2023-03-27,"[Online - Retail, eCommerce industry, Industri...",\nTWEET: 1 1/2 TCW F SI1 Round Cut Earth Mined...
13611,1640359539290636291,2023-03-27,"[Online - Retail, eCommerce industry, Industri...",\nTWEET: Moss Green Linen Curtain 2 Panel Gree...
13612,1640359539248873478,2023-03-27,"[Online - Retail, eCommerce industry, Industri...",\nTWEET: J170-1 Meissen Franzen Collaboration ...
13613,1640359539123052549,2023-03-27,"[Online - Retail, eCommerce industry, Gaming B...",\nTWEET: Mystery Booster Retail Edition 2020 D...


In [15]:
# Write the `full_text` column of the cleaned tweets out as a .txt corpus
# in the processed-data directory.
# NOTE(review): the date-like suffix in `name` is hard-coded; update it when
# the extraction is re-run.
save_to_text(
    df=df_clean,
    col='full_text',
    out_dir=config.DATA_DIR,
    name='tweets_apparel_2023327',
)

2023-03-27 16:48:51,811 - INFO     | helpers.data_helpers | Saved corpus to .txt file: tweets_apparel_2023327.txt
