In [1]:
from pathlib import Path

import pandas as pd

import tweepy
import praw

import config

from bots.twitter import TwitterBot
from bots.reddit import RedditBot

from helpers.data_helpers import save_to_parquet, clean_text, clean_tweets_df

2023-03-20 16:10:27,050 - INFO     | config     | Loading environment variables


## Twitter

In [None]:
# Build the project's Twitter client wrapper. Every credential is read from
# `config`, so no secret is written into the notebook itself.
twitter_bot = TwitterBot(
    bearer=config.TWTR_BEARER_TOKEN,
    api=config.TWTR_API,
    api_secret=config.TWTR_API_SECRET,
    access=config.TWTR_ACCESS_TOKEN,
    access_secret=config.TWTR_ACCESS_TOKEN_SECRET,
)

In [None]:
# Search query handed to `twitter_bot.get_recent_tweets`, assembled from
# space-separated filters.
# Alternative kept for reference: 'entity:"Automotive" -is:retweet lang:en'
# Original author note: "context: brand | entity: products".
# NOTE(review): the `context:` value looks like a "<domain id>.<entity id>"
# pair — confirm which taxonomy entry it selects.
query = ' '.join((
    'context:46.783337567798169600',  # context-annotation filter
    '-is:retweet',                    # leave retweets out
    'lang:en',                        # English-language tweets only
))

In [None]:
# Fetch recent tweets matching `query`.
# NOTE(review): 5000 looks like a cap on the number of tweets returned (the
# frame reloaded later has exactly 5000 rows) — confirm against
# TwitterBot.get_recent_tweets.
tweets = twitter_bot.get_recent_tweets(
    query=query,
    limit=5000,
)

In [None]:
# Tabulate the fetched tweets. With the default orient, `DataFrame.from_dict`
# simply forwards to the constructor, so the constructor is called directly.
tweets_df = pd.DataFrame(tweets)
tweets_df

In [None]:
# Persist the raw pull so the cleaning section can reload it from disk instead
# of querying the API again.
save_to_parquet(
    data_dir=config.RAW_DATA_DIR,
    df=tweets_df,
    name='tweets_brand_products',
)

## Data Cleaning

In [2]:
# Reload the raw tweets saved earlier so this section runs without touching the
# Twitter API.
# NOTE(review): the `_2023_3_20` suffix is hard-coded; presumably it is the
# date stamp `save_to_parquet` appends to the name — confirm, and update it
# when a newer pull should be cleaned.
df = pd.read_parquet(f'{config.RAW_DATA_DIR}/tweets_brand_products_2023_3_20.parquet')

# Keep only the calendar date: drop any timezone (wall-clock time preserved)
# and reset the time to midnight. This replaces the former
# datetime -> string -> datetime round trip with the vectorised accessors and
# yields the same timezone-naive midnight timestamps.
df['created_at'] = df['created_at'].dt.tz_localize(None).dt.normalize()

In [3]:
# Sanity-check the reloaded frame: row count, per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          5000 non-null   int64         
 1   created_at  5000 non-null   datetime64[ns]
 2   text        5000 non-null   object        
 3   entities    5000 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 156.4+ KB


In [4]:
# Preview the first rows as loaded, before the text-cleaning cells below run.
df.head()

Unnamed: 0,id,created_at,text,entities
0,1637818273336270849,2023-03-20,AMD Allegedly Testing Hybrid Processor with Ze...,"[AMD, B2B, Products - B2B]"
1,1637818157493809152,2023-03-20,Gold Box Deal of the Day: Up to 50% off Play-D...,"[B2B, Hasbro, Products - B2B]"
2,1637818089470394369,2023-03-20,Incredible https://t.co/10PfVocngO,"[Apple, Cryptocoins, TV/Movies Related - Enter..."
3,1637818068549226497,2023-03-20,@Ryohei247 I know! My favorite of the Zord Asc...,"[B2B, Hasbro, Products - B2B]"
4,1637817990401114113,2023-03-20,We also dropped this on the vlog today #marvel...,"[Entertainment franchises, Entertainment, Hasb..."


In [5]:
# Run the project helper `clean_text` over every tweet body. The helper is
# passed directly; wrapping it in a lambda adds nothing.
# NOTE(review): this overwrites `df['text']` in place, so the preview rendered
# above no longer reflects the frame once this cell has run.
df['text'] = df['text'].map(clean_text)

In [6]:
# Produce the cleaned frame with the project helper. Judging by the rendered
# output, some rows are dropped and `text` is replaced by a `full_text` column.
# NOTE(review): the meaning of `n=2` is not visible here (presumably a minimum
# length threshold) — confirm in helpers.data_helpers.clean_tweets_df.
df_clean = clean_tweets_df(
    df=df,
    text_col='text',
    n=2,
)
df_clean

Unnamed: 0,id,created_at,entities,full_text
0,1637818273336270849,2023-03-20,"[AMD, B2B, Products - B2B]",\nTWEET:\nAMD Allegedly Testing Hybrid Process...
1,1637818157493809152,2023-03-20,"[B2B, Hasbro, Products - B2B]",\nTWEET:\nGold Box Deal of the Day: Up to 50% ...
3,1637818068549226497,2023-03-20,"[B2B, Hasbro, Products - B2B]",\nTWEET:\n I know! My favorite of the Zord Asc...
4,1637817990401114113,2023-03-20,"[Entertainment franchises, Entertainment, Hasb...",\nTWEET:\nWe also dropped this on the vlog tod...
5,1637817988828090370,2023-03-20,"[Services - B2B, Xerox, Visual arts, B2B, Art,...","\nTWEET:\nBarbara T. Smith | Xerox, Coffin, Di..."
...,...,...,...,...
4995,1636796784566468612,2023-03-17,"[AMD, B2B, Products - B2B]","\nTWEET:\nMon, Wed, Fri - he's an intel shill...."
4996,1636796545327562756,2023-03-17,"[AMD, B2B, Intel, Products - B2B]",\nTWEET:\nintel's better memory management is ...
4997,1636796528411959297,2023-03-17,"[AMD, B2B, Intel, Products - B2B]","\nTWEET:\nIntel Processor are kings,... AMD ar..."
4998,1636796364163264512,2023-03-17,"[B2B, Hasbro, Products - B2B]",\nTWEET:\n It’s definitely a stylistic choice ...


In [None]:
# One element per tweet: that tweet's collection of entity labels.
ent = df['entities'].tolist()

In [None]:
from itertools import chain

In [None]:
# Flatten the per-tweet entity collections into one long list (duplicates kept).
flatten_ent = list(chain.from_iterable(ent))

# Unique entity labels. Sorted so the order is reproducible: iterating a bare
# set of strings can change order from one kernel session to the next because
# string hashing is randomised per process.
all_ents = sorted(set(flatten_ent))

In [7]:
def save_text(df, col, out_dir, name):
    """Join one text column into a single string and write it to ``<out_dir>/<name>.txt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to export.
    col : str
        Column whose values are concatenated, separated by single spaces.
    out_dir : str or os.PathLike
        Destination directory; created (including parents) when missing.
    name : str
        Output file name without the ``.txt`` extension.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    out_path = Path(out_dir) / f'{name}.txt'
    # A missing directory previously surfaced as FileNotFoundError.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Skip missing values and coerce the rest to str so a stray NaN/None in the
    # column cannot make the join raise TypeError.
    all_text = ' '.join(df[col].dropna().astype(str))

    # Explicit UTF-8: tweet text contains emoji and typographic quotes, which
    # the platform default encoding cannot always represent.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(all_text)

In [8]:
# Dump every cleaned tweet into a single text file.
# NOTE(review): the output directory is the literal 'raw_data' while the other
# cells use config.RAW_DATA_DIR — confirm they point at the same place.
save_text(
    df=df_clean,
    col='full_text',
    out_dir='raw_data',
    name='test',
)

## Reddit

In [None]:
# Build the project's Reddit client wrapper; all app credentials and the user
# agent are read from `config`.
reddit_bot = RedditBot(
    client_id=config.REDDIT_APP_ID,
    client_secret=config.REDDIT_SECRET,
    redirect_url=config.REDDIT_REDIRECT_URL,
    user_agent=config.REDDIT_USER_AGENT,
)

In [None]:
# Scratch probe of the all-time top posts in r/MachineLearning.
# The draft used to end with a bare `d['']` lookup, which raised KeyError on
# the first iteration and stopped "Run All"; that unfinished statement has
# been removed.
# NOTE(review): `d` is overwritten on every pass, so only the last post's id
# survives. The TalesFromTheCustomer cell that follows builds one dict per
# post; consider deleting this cell entirely.
d = {}
for sub in reddit_bot.client.subreddit('MachineLearning').top(time_filter='all', limit=50):
    d['id'] = sub.id

In [None]:
# One record per all-time top post in r/TalesFromTheCustomer (up to 50):
# the submission id, plus title and body joined by a newline.
t = [
    {'id': post.id, 'text': post.title + '\n' + post.selftext}
    for post in reddit_bot.client.subreddit('TalesFromTheCustomer').top(time_filter='all', limit=50)
]

In [None]:
# Collect the comments of the FIRST collected post only. The slice makes that
# limit explicit (it was previously an unconditional `break` at the end of the
# loop body); widen it to process more posts.
# NOTE(review): presumably kept to one post because expanding a full comment
# tree is slow — confirm before widening.
comments = []
for post in t[:1]:
    submission = reddit_bot.client.submission(post['id'])

    # Expand the comment tree before flattening it.
    # NOTE(review): assuming `reddit_bot.client` is a praw.Reddit instance,
    # limit=None asks for every "more comments" placeholder to be resolved.
    submission.comments.replace_more(limit=None)
    comments.extend(submission.comments.list())

In [None]:
from datetime import datetime

In [None]:
# Scratch check: today's day of the month as an unpadded string.
# NOTE(review): presumably exploring how the date suffix in saved file names is
# built — confirm, or delete this cell.
str(datetime.now().day)