In [1]:
# Load the blackcellmagic IPython extension into this kernel session.
%load_ext blackcellmagic

In [2]:
#!~/environs/env1/bin/python

import csv
import warnings
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from psaw import PushshiftAPI

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from preprocessors.twitter_preprocessor import *
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
# Create the PushshiftAPI client; the search cell below calls
# `api.search_submissions` on it to fetch Reddit submissions.
api = PushshiftAPI()

In [5]:
# Collection window: the WINDOW_DAYS days leading up to "now".
# The timestamps are created timezone-aware in UTC so that `.timestamp()`
# (used when building the query) yields true epoch seconds on any machine.
# pandas interprets a *naive* Timestamp as UTC when converting to epoch
# seconds, so a naive local `now()` would shift the whole window by the
# machine's UTC offset.
WINDOW_DAYS = 90
CURRENT_TIME = pd.Timestamp.now(tz="UTC")
START_TIME = CURRENT_TIME - pd.Timedelta(days=WINDOW_DAYS)
END_TIME = CURRENT_TIME
print(START_TIME)
print(END_TIME)

2022-02-13 11:17:26.727292
2022-05-14 11:17:26.727292


In [6]:
# Ask Pushshift for r/news submissions bounded by START_TIME (`after`) and
# END_TIME (`before`), requesting only the listed fields and at most 100000 hits.
submission_fields = ["id", "url", "author", "title", "subreddit", "selftext"]
results = api.search_submissions(
    subreddit="news",
    after=int(START_TIME.timestamp()),
    before=int(END_TIME.timestamp()),
    filter=submission_fields,
    limit=100000,
)

In [7]:
# Pull the `d_` attribute out of every result, then show how many were fetched.
submissions = list(item.d_ for item in results)
len(submissions)

63744

In [8]:
# Collect the fields needed downstream into parallel lists, one list per column.
data_dict = {"id": [], "author": [], "full_text": [], "created": []}

for submission in submissions:
    # `.get` keeps a record that lacks one of the keys from aborting the whole
    # loop with a KeyError; it is skipped like any other incomplete record.
    post_id = submission.get("id")
    author = submission.get("author")
    title = submission.get("title")
    # NOTE(review): psaw's `created` appears to be `created_utc` shifted by the
    # local UTC offset, while the later `to_datetime` helper interprets the
    # value as UTC. Prefer the raw `created_utc` when it is present and fall
    # back to `created` otherwise — confirm against the psaw documentation.
    created = submission.get("created_utc") or submission.get("created")

    if post_id and author and title and created:
        data_dict["id"].append(str(post_id))
        data_dict["author"].append(str(author))
        data_dict["full_text"].append(str(title))
        data_dict["created"].append(int(created))

In [9]:
# Turn the collected column lists into a DataFrame, print its
# (rows, columns) shape and preview the first ten rows.
df = pd.DataFrame.from_dict(data_dict)
print(df.shape)
df.iloc[:10]

(63744, 4)


Unnamed: 0,id,author,full_text,created
0,updh9o,LutaLiChi_91,Thiết kế nội thất văn phòng hiện đại thông min...,1652512592
1,updfpr,khabarinshorts,Doctor Strange 2 Box Office: Film surpasses Th...,1652512394
2,updfc3,NeedleworkerOnly7266,Free Fire New Patch Update event with Free Mag...,1652512343
3,updedk,solovivo,Buy furniture dressers online at best price,1652512219
4,upddg8,Health-99,NEET PG Admit Card 2022 Released|how to downlo...,1652512101
5,updddm,AdditionStriking6755,Animal behaviour knowledgeable Karen Wild disc...,1652512091
6,updcqc,LutaLiChi_91,Các loại lá thuốc xông hơi giải cảm có ngay tạ...,1652512009
7,updbvo,solovivo,Buy garden furniture online at affordable price,1652511899
8,updaxs,The_Dispatch,PHOTO STORY: Platinum Jubilee Decoration in Lo...,1652511776
9,upda6w,solovivo,Buy dining tables online in Switzerland,1652511685


In [10]:
# Preprocess every title with lemmatization enabled.
# The preprocessor is built once and reused for all rows instead of being
# constructed anew inside the lambda for each title.
# NOTE(review): assumes `preprocess_tweet` keeps no per-call state on the
# instance, so sharing one TwitterPreprocessor is equivalent — confirm.
lemmatizing_preprocessor = TwitterPreprocessor()
preprocessed_text_lemmatized = df["full_text"].progress_apply(
    lambda text: lemmatizing_preprocessor.preprocess_tweet(text, lemmatize=True)
)

100%|██████████| 63744/63744 [06:26<00:00, 164.74it/s]


In [11]:
# Preprocess every title without lemmatization.
# The preprocessor is built once and reused for all rows instead of being
# constructed anew inside the lambda for each title.
# NOTE(review): assumes `preprocess_tweet` keeps no per-call state on the
# instance, so sharing one TwitterPreprocessor is equivalent — confirm.
plain_preprocessor = TwitterPreprocessor()
preprocessed_text = df["full_text"].progress_apply(
    lambda text: plain_preprocessor.preprocess_tweet(text, lemmatize=False)
)

100%|██████████| 63744/63744 [04:16<00:00, 248.91it/s]


In [13]:
# Place the two preprocessed columns directly after `full_text`
# (column positions 3 and 4).
# `DataFrame.insert` raises ValueError when the column already exists, so on
# a re-run of this cell the existing column is overwritten in place instead.
new_columns = [
    ("prep_text", preprocessed_text),
    ("prep_text_lemmatized", preprocessed_text_lemmatized),
]
for position, (column, values) in enumerate(new_columns, start=3):
    if column in df.columns:
        df[column] = values
    else:
        df.insert(position, column, values)
df.head(10)

Unnamed: 0,id,author,full_text,prep_text,prep_text_lemmatized,created
0,updh9o,LutaLiChi_91,Thiết kế nội thất văn phòng hiện đại thông min...,minh chi,minh chi,1652512592
1,updfpr,khabarinshorts,Doctor Strange 2 Box Office: Film surpasses Th...,doctor strange 2 box office film surpasses the...,doctor strange 2 box office film surpasses the...,1652512394
2,updfc3,NeedleworkerOnly7266,Free Fire New Patch Update event with Free Mag...,free fire new patch update event with free mag...,free fire new patch update event with free mag...,1652512343
3,updedk,solovivo,Buy furniture dressers online at best price,buy furniture dressers online at best price,buy furniture dresser online at best price,1652512219
4,upddg8,Health-99,NEET PG Admit Card 2022 Released|how to downlo...,neet pg admit card 2022 to download neet pg ad...,neet pg admit card 2022 to download neet pg ad...,1652512101
5,updddm,AdditionStriking6755,Animal behaviour knowledgeable Karen Wild disc...,animal behaviour knowledgeable karen wild disc...,animal behaviour knowledgeable karen wild disc...,1652512091
6,updcqc,LutaLiChi_91,Các loại lá thuốc xông hơi giải cảm có ngay tạ...,ngay,ngay,1652512009
7,updbvo,solovivo,Buy garden furniture online at affordable price,buy garden furniture online at affordable price,buy garden furniture online at affordable price,1652511899
8,updaxs,The_Dispatch,PHOTO STORY: Platinum Jubilee Decoration in Lo...,photo story platinum jubilee decoration in london,photo story platinum jubilee decoration in london,1652511776
9,upda6w,solovivo,Buy dining tables online in Switzerland,buy dining tables online in switzerland,buy din table online in switzerland,1652511685


In [14]:
# Smallest and largest `created` values in the frame, printed side by side.
created_values = df["created"]
t1, t2 = created_values.min(), created_values.max()
print(t1, t2)

1644743887 1652512592


In [22]:
# helper functions
def to_datetime(unix: int) -> str:
    """Format a Unix epoch timestamp as a UTC ``DD-MM-YYYY HH:MM:SS`` string.

    Args:
        unix: Seconds since the Unix epoch.

    Returns:
        The corresponding UTC date and time, e.g. ``"14-05-2022 07:16:32"``
        for ``1652512592``.
    """
    # `datetime.utcfromtimestamp` is deprecated as of Python 3.12; the
    # timezone-aware call below produces the same UTC wall-clock fields.
    # `strftime` already returns a str, so no extra `str()` wrapper is needed.
    return datetime.fromtimestamp(unix, tz=timezone.utc).strftime("%d-%m-%Y %H:%M:%S")

In [21]:
# Replace the epoch-seconds `created` column with formatted date strings.
# The dtype guard makes the cell safe to re-run: once the column holds
# strings, passing them through `to_datetime` again would raise a TypeError.
if pd.api.types.is_numeric_dtype(df["created"]):
    df["created"] = df["created"].progress_apply(to_datetime)

100%|██████████| 63744/63744 [00:00<00:00, 94251.16it/s] 


In [28]:
# Score every lemmatized title with NLTK's SentimentIntensityAnalyzer and keep
# only the "compound" entry of the returned scores.
sia = SentimentIntensityAnalyzer()
sentiment_polarity = df["prep_text_lemmatized"].progress_apply(
    lambda text: sia.polarity_scores(text)["compound"]
)
# `DataFrame.insert` raises ValueError if "polarity" already exists, so a
# re-run of this cell overwrites the existing column instead of inserting it.
if "polarity" in df.columns:
    df["polarity"] = sentiment_polarity
else:
    df.insert(5, "polarity", sentiment_polarity)

100%|██████████| 63744/63744 [00:10<00:00, 5884.09it/s]


In [29]:
# Show the frame's (rows, columns) shape, then preview the first ten rows.
display(df.shape)
df.head(10)

(63744, 7)

Unnamed: 0,id,author,full_text,prep_text,prep_text_lemmatized,polarity,created
0,updh9o,LutaLiChi_91,Thiết kế nội thất văn phòng hiện đại thông min...,minh chi,minh chi,0.0,14-05-2022 07:16:32
1,updfpr,khabarinshorts,Doctor Strange 2 Box Office: Film surpasses Th...,doctor strange 2 box office film surpasses the...,doctor strange 2 box office film surpasses the...,-0.2732,14-05-2022 07:13:14
2,updfc3,NeedleworkerOnly7266,Free Fire New Patch Update event with Free Mag...,free fire new patch update event with free mag...,free fire new patch update event with free mag...,0.6369,14-05-2022 07:12:23
3,updedk,solovivo,Buy furniture dressers online at best price,buy furniture dressers online at best price,buy furniture dresser online at best price,0.6369,14-05-2022 07:10:19
4,upddg8,Health-99,NEET PG Admit Card 2022 Released|how to downlo...,neet pg admit card 2022 to download neet pg ad...,neet pg admit card 2022 to download neet pg ad...,0.3818,14-05-2022 07:08:21
5,updddm,AdditionStriking6755,Animal behaviour knowledgeable Karen Wild disc...,animal behaviour knowledgeable karen wild disc...,animal behaviour knowledgeable karen wild disc...,0.3182,14-05-2022 07:08:11
6,updcqc,LutaLiChi_91,Các loại lá thuốc xông hơi giải cảm có ngay tạ...,ngay,ngay,0.0,14-05-2022 07:06:49
7,updbvo,solovivo,Buy garden furniture online at affordable price,buy garden furniture online at affordable price,buy garden furniture online at affordable price,0.0,14-05-2022 07:04:59
8,updaxs,The_Dispatch,PHOTO STORY: Platinum Jubilee Decoration in Lo...,photo story platinum jubilee decoration in london,photo story platinum jubilee decoration in london,0.0,14-05-2022 07:02:56
9,upda6w,solovivo,Buy dining tables online in Switzerland,buy dining tables online in switzerland,buy din table online in switzerland,0.0,14-05-2022 07:01:25


In [52]:
# Titles whose compound polarity is strictly above the positive threshold.
POSITIVE_THRESHOLD = 0.8
text_columns = ["full_text", "prep_text", "prep_text_lemmatized"]
df_positive = (
    # Single `.loc[rows, cols]` selection avoids chained indexing followed by
    # in-place mutation of what may be a copy.
    df.loc[df["polarity"] > POSITIVE_THRESHOLD, text_columns]
    # All selected columns take part in the comparison; `ignore_index=True`
    # already renumbers the rows, so no separate reset_index is needed.
    .drop_duplicates(ignore_index=True)
    # Empty (NaN) `sentiment` column — presumably filled in later; confirm.
    # An explicit NaN avoids `pd.Series()` without a dtype, whose default
    # dtype differs between pandas versions.
    .assign(sentiment=np.nan)
)
print(df_positive.shape)
df_positive.head()

(693, 4)


Unnamed: 0,full_text,prep_text,prep_text_lemmatized,sentiment
0,Socrates Quote || Quotations || Quotes || Beau...,socrates quote quotations quotes beautiful wor...,socrates quote quotation quote beautiful word ...,
1,Wise for all best online banking for all busin...,wise for all best online banking for all busin...,wise for all best online banking for all busin...,
2,"""Breast is best"" Seyi Shay says as she shares ...",breast is best seyi shay says as she shares br...,breast be best seyi shay say a she share breas...,
3,'Until I met you I thought soulmate was just a...,until i met you i thought soulmate was just a ...,until i met you i thought soulmate be just a m...,
4,NEW SТOCKS АVAILABLE - 1000 CARDS LEFТ! Аlmoѕt...,new 100 cards 13 million premium golden were w...,new 100 card 13 million premium golden be what...,


In [53]:
# Titles whose compound polarity is strictly below the negative threshold.
NEGATIVE_THRESHOLD = -0.85
text_columns = ["full_text", "prep_text", "prep_text_lemmatized"]
df_negative = (
    # Single `.loc[rows, cols]` selection avoids chained indexing followed by
    # in-place mutation of what may be a copy.
    df.loc[df["polarity"] < NEGATIVE_THRESHOLD, text_columns]
    # All selected columns take part in the comparison; `ignore_index=True`
    # already renumbers the rows, so no separate reset_index is needed.
    .drop_duplicates(ignore_index=True)
    # Empty (NaN) `sentiment` column — presumably filled in later; confirm.
    # An explicit NaN avoids `pd.Series()` without a dtype, whose default
    # dtype differs between pandas versions.
    .assign(sentiment=np.nan)
)
print(df_negative.shape)
df_negative.head()

(1167, 4)


Unnamed: 0,full_text,prep_text,prep_text_lemmatized,sentiment
0,"Building fire kills 27 in New Delhi, police ar...",building fire kills 27 in new delhi police arr...,building fire kill 27 in new delhi police arre...,
1,Ukraine collects Russian dead as war rages on ...,ukraine collects russian dead as war rages on ...,ukraine collect russian dead a war rage on mul...,
2,Germany bans vigil in memory of journalist kil...,germany bans vigil in memory of journalist kil...,germany ban vigil in memory of journalist kill...,
3,"Putin ""very sick"" with cancer, other health is...",putin very sick with cancer other health issue...,putin very sick with cancer other health issue...,
4,Lawsuit claims Bastrop County deputies ‘tortur...,lawsuit claims bastrop county deputies torture...,lawsuit claim bastrop county deputy torture mu...,


In [54]:
# Write the positive subset to positive_reddit.csv without the index column,
# quoting every field.
df_positive.to_csv("positive_reddit.csv", index=False, quoting=csv.QUOTE_ALL)

In [None]:
# Write the negative subset to negative_reddit.csv without the index column,
# quoting every field.
df_negative.to_csv("negative_reddit.csv", index=False, quoting=csv.QUOTE_ALL)