In [1]:
import pandas as pd

import tweepy

import config

from bots.twitter import TwitterBot
from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df, save_to_text

2023-03-20 17:32:20,260 - INFO     | config     | Loading environment variables
2023-03-20 17:32:20,268 - INFO     | config     | Directory raw_data already exists
2023-03-20 17:32:20,268 - INFO     | config     | Directory data already exists


# Extracting Tweets

In [2]:
# Build the Twitter client; every credential is read from the config module
# rather than being written into the notebook.
twitter_bot = TwitterBot(
    bearer=config.TWTR_BEARER_TOKEN,
    api=config.TWTR_API,
    api_secret=config.TWTR_API_SECRET,
    access=config.TWTR_ACCESS_TOKEN,
    access_secret=config.TWTR_ACCESS_TOKEN_SECRET,
)

In [69]:
# Filters appended to every search: the same '-is:retweet lang:en' suffix
# is shared by all queries.
common_filters = '-is:retweet lang:en'

# Context identifiers to search for, labelled with the topic they stand for.
context_ids = [
    '46.783337567798169600',     # products
    '67.839543390668673024',     # apparel
    '131.1248316002317643776',   # athletic apparel
    '65.1256236649253449729',    # fashion and beauty
    '131.1095391406816784384',   # shopping
    '67.1486758812849635329',    # retail sales
    '131.1407812892909473795',   # ecommerce (prefix 131)
    '165.1407812892909473795',   # ecommerce (same id, prefix 165)
]

# One search string per context: 'context:<id> <filters>'.
queries = [f'context:{context_id} {common_filters}' for context_id in context_ids]

In [70]:
# Value passed as `limit` to every get_recent_tweets call.
TWEETS_PER_QUERY = 1000

# Fetch the recent tweets for each search query and wrap each result in its
# own DataFrame. A comprehension keeps the per-query temporaries local, so the
# name `tweets` is not first bound to a raw API result here and then rebound
# to the combined DataFrame in the following cell.
dfs = [
    pd.DataFrame.from_dict(
        twitter_bot.get_recent_tweets(query=search_query, limit=TWEETS_PER_QUERY)
    )
    for search_query in queries
]

In [71]:
# Stack the per-query frames into one table, then keep only the first
# occurrence of each tweet id (the same tweet can match several queries).
tweets = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
)

In [72]:
# Preview the de-duplicated tweets; as the last expression in the cell the
# frame is rendered as the cell output.
tweets

Unnamed: 0,id,created_at,text,entities
0,1637851614487760897,2023-03-20 16:20:29+00:00,@WardCapMgmt @AMD @nvidia Oh ok! I am a long t...,"{B2B, AMD, Products - B2B, NVIDIA}"
1,1637851526482780164,2023-03-20 16:20:08+00:00,"FBN Holdings Plc, Union Bank, Honeywell Flour ...","{B2B, Honeywell, Services - B2B, Products - B2B}"
2,1637851409822650368,2023-03-20 16:19:40+00:00,"@TaozenTaiji Hi Taozen, Thank you for reachng ...","{Services - B2B, Products - B2B, B2B, S&P 500,..."
3,1637851374359662593,2023-03-20 16:19:32+00:00,"MY MACHINE DOES NOT SUPPORT THIS INSTRUCTION, ...","{B2B, AMD, Products - B2B}"
4,1637851181715185664,2023-03-20 16:18:46+00:00,Baffles me that they put the effort into doing...,"{B2B, Hasbro, Products - B2B}"
...,...,...,...,...
7995,1637849681106735104,2023-03-20 16:12:48+00:00,Check out this item in my Etsy shop https://t....,"{Industries, Online - Retail, eCommerce indust..."
7996,1637849678233600002,2023-03-20 16:12:47+00:00,Check out this listing I just added to my #Pos...,"{Industries, Holidays, Poshmark, Cultural even..."
7997,1637849673854763008,2023-03-20 16:12:46+00:00,Check out this listing I just added to my #Pos...,"{Industries, Poshmark, Online - Retail, eComme..."
7998,1637849671904423936,2023-03-20 16:12:46+00:00,Check out this listing I just added to my #Pos...,"{Industries, Poshmark, Dance, Online - Retail,..."


In [73]:
# Persist the de-duplicated tweets under the raw-data directory.
# NOTE(review): the cleaning section reads 'tweets_2023320.parquet', so the
# helper presumably appends a date stamp to `name` - confirm in helpers.data_helpers.
save_to_parquet(
    df=tweets,
    name='tweets',
    data_dir=config.RAW_DATA_DIR,
)

# Cleaning Tweets

In [2]:
# Load a stored tweets snapshot from the raw-data directory.
# NOTE(review): presumably the file written by save_to_parquet in the
# extraction section; the date-stamped filename is hard-coded - confirm.
raw_tweets_file = f'{config.RAW_DATA_DIR}/tweets_2023320.parquet'
tweets_df = pd.read_parquet(raw_tweets_file)

In [3]:
# Reduce each timestamp to its calendar date: drop the timezone (keeping the
# wall-clock time), then truncate the time-of-day to midnight. This stays in
# datetime dtype the whole way instead of formatting every value to a string
# and parsing it back.
tweets_df['created_at'] = tweets_df['created_at'].dt.tz_localize(None).dt.normalize()

In [4]:
# Summarise the frame (row count, dtypes, non-null counts) after the
# created_at conversion.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7360 entries, 0 to 7359
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          7360 non-null   int64         
 1   created_at  7360 non-null   datetime64[ns]
 2   text        7360 non-null   object        
 3   entities    7360 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 230.1+ KB


In [5]:
# Apply clean_text to every tweet body. The function is passed directly to
# map; wrapping it in a lambda added nothing.
tweets_df['text'] = tweets_df['text'].map(clean_text)

In [6]:
# Run the project-level cleaning helper on the 'text' column.
# NOTE(review): the meaning of n=2 is not visible from this notebook -
# confirm against helpers.data_helpers.clean_tweets_df.
df_clean = clean_tweets_df(
    df=tweets_df,
    text_col='text',
    n=2,
)

In [7]:
# Inspect the cleaned frame returned by clean_tweets_df.
df_clean

Unnamed: 0,id,created_at,entities,full_text
0,1637851614487760897,2023-03-20,"[B2B, AMD, Products - B2B, NVIDIA]",\nTWEET: Oh ok! I am a long term investor
1,1637851526482780164,2023-03-20,"[B2B, Honeywell, Services - B2B, Products - B2B]","\nTWEET: FBN Holdings Plc, Union Bank, Honeywe..."
2,1637851409822650368,2023-03-20,"[Services - B2B, Products - B2B, B2B, S&P 500,...","\nTWEET: Hi Taozen, Thank you for reachng out..."
3,1637851374359662593,2023-03-20,"[B2B, AMD, Products - B2B]",\nTWEET: MY MACHINE DOES NOT SUPPORT THIS INST...
4,1637851181715185664,2023-03-20,"[B2B, Hasbro, Products - B2B]",\nTWEET: Baffles me that they put the effort i...
...,...,...,...,...
7294,1637849681106735104,2023-03-20,"[Industries, Online - Retail, eCommerce indust...",\nTWEET: Check out this item in my Etsy shop
7295,1637849678233600002,2023-03-20,"[Industries, Holidays, Poshmark, Cultural even...",\nTWEET: Check out this listing I just added t...
7296,1637849673854763008,2023-03-20,"[Industries, Poshmark, Online - Retail, eComme...",\nTWEET: Check out this listing I just added t...
7297,1637849671904423936,2023-03-20,"[Industries, Poshmark, Dance, Online - Retail,...",\nTWEET: Check out this listing I just added t...
