### Collecting the data using the Pushshift API

We used the `pmaw` Python wrapper to collect the data through the Pushshift API.

In [None]:
import time
import pandas as pd
import datetime as dt
from pmaw import PushshiftAPI

# Pushshift client from the pmaw wrapper, created with its default settings;
# the data-collection cells below issue their searches through it.
api = PushshiftAPI()

In [None]:
# Lift pandas' display truncation for both rows and columns so that wide
# frames are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

First, we collected the posts from the last month to identify the most engaging subreddits (using MapReduce); then, based on these subreddits, we collected the comments dataset.

In [None]:
# Query window for the submissions: March 2021, expressed as epoch seconds.
# The datetimes are pinned to UTC explicitly: calling .timestamp() on a naive
# datetime interprets it in the machine's local timezone, which would shift the
# window depending on where the notebook is run.
before = int(dt.datetime(2021, 4, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
after = int(dt.datetime(2021, 3, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())

In [None]:
# Download the submissions of every candidate subreddit inside the
# [after, before) window and combine them into a single frame.
# NOTE(review): `unique_subreddits` is not assigned in any cell shown here —
# confirm it is defined before this cell runs.
res = []     # subreddits visited so far, used for progress reporting
frames = []  # one DataFrame per subreddit, concatenated once after the loop

for subreddit in unique_subreddits:
    res.append(subreddit)
    print('Number of visited Subreddits is', len(res))
    submissions = api.search_submissions(subreddit=subreddit, limit=None, before=before, after=after)
    print(f'Retrieved {len(submissions)} submissions from Pushshift')
    frames.append(pd.DataFrame(submissions))

# A single concat replaces the per-iteration DataFrame.append, which re-copied
# the accumulated frame on every pass and no longer exists in pandas >= 2.0.
# pd.concat raises on an empty list, hence the explicit empty-frame fallback.
test_df = pd.concat(frames) if frames else pd.DataFrame()
print(f'Shape of Final DF is: {test_df.shape}')

In [None]:
# Write the collected submissions to disk: header row, every column, no index.
test_df.to_csv('./submissions.csv', header=True, index=False, columns=test_df.columns.tolist())

Due to the large size of submissions.csv (5GB) and of the original dataset from Kaggle (15GB), we used pandas' chunked reading (`read_csv` with `chunksize`) to analyze the structure of both datasets.

In [None]:
# Save a small sample of submissions.csv so its structure can be inspected
# without loading the whole file.
for chunk in pd.read_csv('submissions.csv', chunksize=10):
    chunk.to_csv('./part.csv', header=True, index=False, columns=list(chunk.axes[1]))
    # Every iteration writes to the same path, so continuing the loop would
    # parse the entire file only to keep its last chunk. One chunk is enough
    # for a structural sample.
    break

The identified top engaging subreddits are:

['AskReddit', 'wallstreetbets', 'teenagers', 'news', 'funny', 'relationship_advice', 'unpopularopinion', 'CryptoCurrency', 'pokemon', 'aww']


For each of these subreddits we collected the comments from the last 6 months. We then shuffled the dataframe so that, when it is later split into training and testing sets, each subreddit is equally distributed across both sets.

In [None]:
# Query window for the comments: 1 Oct 2020 to 1 Apr 2021, as epoch seconds.
# The datetimes are pinned to UTC explicitly: calling .timestamp() on a naive
# datetime interprets it in the machine's local timezone, which would shift the
# window depending on where the notebook is run.
before = int(dt.datetime(2021, 4, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
after = int(dt.datetime(2020, 10, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())

In [None]:
# Download the comments of a single subreddit that fall between `after` and
# `before`, and load them into a DataFrame.
# NOTE(review): `limit` is not assigned in any cell shown here — confirm it is
# defined before this cell runs.
subreddit = 'AskReddit'

comments = api.search_comments(
    subreddit=subreddit,
    limit=limit,
    before=before,
    after=after,
)
print(f'Retrieved {len(comments)} comments from Pushshift')

# `test_df` and `comments_df` refer to the same frame.
test_df = comments_df = pd.DataFrame(comments)
print(f'Shape of Final DF is: {test_df.shape}')

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement) and
# report how long it took.
# NOTE(review): `data` is first assigned in a later cell (the read_csv of
# shuffled_comments.csv) — confirm which frame is meant to be shuffled here.
start_time = time.time()
# A fixed seed makes the shuffle, and any train/test split derived from it,
# reproducible across runs.
shuffled = data.sample(frac=1, random_state=42)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Write the shuffled comments to disk: header row, every column, no index.
shuffled.to_csv('./shuffled_comments.csv', header=True, index=False, columns=shuffled.columns.tolist())

In [None]:
import time
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Reload the shuffled comments from disk into `data` and print the elapsed
# wall-clock time of the read.
start_time = time.time()
data = pd.read_csv('shuffled_comments.csv')
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def _sentiment_class_from_compound(compound):
    """Map a compound sentiment score to a class: 1=HP, 2=MP, 3=N, 4=MN, 5=HN."""
    if compound > 0.6:
        return 1
    if compound > 0.25:
        return 2
    if compound > -0.25:
        return 3
    if compound > -0.6:
        return 4
    return 5


def Preprocessing(data, analyzer=None):
    """Clean the comments frame and derive time and sentiment features.

    Steps:
      1. Drop the metadata columns listed below, then every row that still
         contains a missing value.
      2. Parse ``created_utc`` (epoch seconds) into ``created_loc_time`` and
         derive ``hour``, ``dayofweek`` (ISO: Monday=1 .. Sunday=7), ``day``,
         ``month`` and ``year`` from it.
      3. Score each ``body`` with the sentiment analyzer and store the class
         of its compound score in ``sentiment_class``.
      4. Drop the raw ``body`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``created_utc`` and ``body`` columns. The caller's
        frame is not modified.
    analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method returning a mapping
        with a ``'compound'`` entry. Defaults to a new
        ``SentimentIntensityAnalyzer``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns and without ``body``.

    Raises
    ------
    KeyError
        If ``created_utc`` or ``body`` is missing from ``data``.
    """
    unused_columns = [
        'all_awardings', 'associated_award', 'author_flair_background_color', 'author_flair_css_class',
        'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color',
        'author_flair_type', 'author_fullname', 'author_patreon_flair', 'awarders',
        'collapsed_because_crowd_control', 'comment_type', 'gildings', 'is_submitter', 'locked',
        'permalink', 'retrieved_on', 'send_replies', 'subreddit_id', 'top_awarded_type', 'total_awards_received',
        'treatment_tags', 'author_cakeday', 'distinguished', 'edited', 'media_metadata',
    ]
    # errors='ignore': a frame that lacks some of these columns is still
    # processed instead of failing with KeyError.
    data = data.drop(columns=unused_columns, errors='ignore').dropna()

    # No timezone conversion is applied: despite the column name, the parsed
    # values are naive timestamps counted from the Unix epoch.
    created = pd.to_datetime(data['created_utc'], unit='s')
    data = data.assign(
        created_loc_time=created,
        hour=created.dt.hour,
        dayofweek=created.dt.dayofweek + 1,  # pandas uses Monday=0; shift to ISO numbering
        day=created.dt.day,
        month=created.dt.month,
        year=created.dt.year,
    )

    sid = analyzer if analyzer is not None else SentimentIntensityAnalyzer()

    def sentiment_class_definition(comment):
        scores = sid.polarity_scores(str(comment))
        return _sentiment_class_from_compound(scores['compound'])

    # Series.map keeps the frame's own index, so every label lands on the row
    # it was computed from even after dropna() has left gaps in the index.
    data = data.assign(sentiment_class=data['body'].map(sentiment_class_definition))

    return data.drop(columns='body')

In [None]:
# Run the preprocessing and report how long it took.
start_time = time.time()
# Preprocessing returns a new frame rather than modifying `data`, so the
# result has to be kept; a bare call would discard it.
preprocessed = Preprocessing(data)
print("--- %s seconds ---" % (time.time() - start_time))