In [1]:
import numpy as np
import pandas as pd
import re
import os 

import praw
import emoji
from concurrent.futures import ThreadPoolExecutor, as_completed

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
COMMON_WORDS = set(stopwords.words("english"))

In [3]:
class Get_subreddit():
    def __init__(self, subreddit):
        reddit = praw.Reddit(
            client_id= os.environ['CLIENT_ID_REDDIT'], 
            client_secret= os.environ['CLIENT_SECRET_REDDIT'], 
            user_agent="hsabbar",
        )
        self.hot = reddit.subreddit(subreddit).hot(limit=500)
        self.max_comments = 100
        self.data = []
        self.columns = ['title','id','score', 'date', 'author', 'content', 'num_comments', 'comments'] 
        
    def get_data(self, submission):
        data_tmp = []
        comment_data = []
        data_tmp = [submission.title, submission.id, submission.score,submission.created_utc, submission.author, 
                     submission.selftext, len(submission.comments)]

        submission.comments.replace_more(limit=self.max_comments)
        comment_data = [top_level_comment.body for top_level_comment in submission.comments[:self.max_comments]]
        

        """
        for top_level_comment in submission.comments[:self.max_comments]: 
            comment_data.append(top_level_comment.body)
        if len(comment_data) < self.max_comments:
            comment_data = comment_data + [None] * (self.max_comments - len(comment_data))
        """
        return data_tmp + [' '.join(comment_data)]
    
    def get_data_multi_thread(self):
        processes = []
        with ThreadPoolExecutor(max_workers=62) as executor:
            for submission in self.hot:
                processes.append(executor.submit(self.get_data, submission))

        self.data = [task.result() for task in as_completed(processes)]
        
        #print(self.data)
        return pd.DataFrame(self.data, columns = self.columns)

In [80]:
%%time
reddit_data = Get_subreddit("Bitcoin")
df = reddit_data.get_data_multi_thread()
df.to_csv("subReddit-data-Bitcion.csv", sep=';', encoding='utf-8')
#df.head()

Wall time: 3.01 s


In [4]:
df2 = pd.read_csv("subReddit-data-Bitcion.csv", sep=';')
df2.head()

Unnamed: 0.1,Unnamed: 0,title,id,score,date,author,content,num_comments,comments
0,0,Best hardware wallet?,nla5z5,16,1622011000.0,Negative_Comedian870,Hey guys. \n\nSeeing lots of people on the Led...,16,"I personaly use trezor, but i think the only p..."
1,1,"IT Giant Globant Joins Buys $500,000 Worth of ...",nkmhp2,36,1621939000.0,VAMPXIII,,10,People here acting like 10 BTC is a small quan...
2,2,Cryptocurrency Had a Dire Weekend. Why?,nl3nd2,1,1621988000.0,SnooSketches3367,,1,Novagrass ? talked about 5 factors on utube's ...
3,3,Wash-sale regulation doesn't apply to Bitcoin ...,nlra50,1,1622063000.0,Wishy_washy_Though,"""Bitcoin crash opens door to a tax loophole fo...",1,"Bob buys 1 BTC at $60K. Later, Bob sells 1 BTC..."
4,4,Bitcoin Surges Over $40k After Bitcoin Mining ...,nlkf4q,0,1622045000.0,crypto_pub,,5,Yea no Bitcoin would test 40k no matter the la...


In [5]:
class Data_processing():
    def __init__(self, df):
        self.df = df
        self.columns_text = ['title', 'content', 'comments'] #+ [f'comment{i}' for i in range(1, 101)]
        self.del_url = re.compile(r'https(.*?)(\s|$)')
    
    
    def my_stopwords(self, text):
        text_tokens = word_tokenize(text)
        data = [word for word in text_tokens if word not in COMMON_WORDS and len(word) > 1]
        return " ".join(data)

    
        
    def nlp_pipeline(self, text):
        if text is not None :
            
            text = str(text).lower()
            text = text.replace('\n', ' ').replace('\r', '')
            text = ' '.join(text.split())
            text = re.sub(r"[A-Za-z\.]*[0-9]+[A-Za-z%°\.]*", "", text)
            text = re.sub(r"(\s\-\s|-$)", "", text)
            text = re.sub(r"\&\S*\s", " ", text)
            text = re.sub(r"['&@¨=“”’‘+—^.;:-_~!-/\{\}*=\%?,#<>\"()\[\]]", "", text)
            
            for url in self.del_url.findall(text):
                text = re.sub(f'https{url[0]}', "", text)
            text = emoji.get_emoji_regexp().sub("", text) 
            
            text = self.my_stopwords(text)
            return text
        else :
            pass
    
    def clean_data(self):
        for col_txt in self.columns_text :
            #print(col_txt)
            self.df[col_txt] = self.df[col_txt].apply(lambda x:  self.nlp_pipeline(x))
        
        return self.df

In [6]:
%%time
data_proc = Data_processing(df2)
df2 = data_proc.clean_data()
df2.head()

Wall time: 3.91 s


Unnamed: 0.1,Unnamed: 0,title,id,score,date,author,content,num_comments,comments
0,0,best hardware wallet,nla5z5,16,1622011000.0,Negative_Comedian870,hey guys seeing lots people ledger sub btc emp...,16,personaly use trezor think problem ledger late...
1,1,giant globant joins buys worth bitcoin,nkmhp2,36,1621939000.0,VAMPXIII,,10,people acting like btc small quantity see comp...
2,2,cryptocurrency dire weekend,nl3nd2,1,1621988000.0,SnooSketches3367,,1,novagrass talked factors utubes kitco
3,3,washsale regulation doesnt apply bitcoin crypt...,nlra50,1,1622063000.0,Wishy_washy_Though,bitcoin crash opens door tax loophole investors,1,bob buys btc later bob sells btc happened capi...
4,4,bitcoin surges bitcoin mining council launch,nlkf4q,0,1622045000.0,crypto_pub,,5,yea bitcoin would test matter launch stupid co...
