In [1]:
import pandas as pd
import numpy as np
# misc
import datetime as dt
from pprint import pprint
from itertools import chain
import redditcleaner
import re

# sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
# visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 8) # default plot size
import seaborn as sns
sns.set(style='whitegrid', palette='Dark2')
from wordcloud import WordCloud
# Fetch the NLTK resources this notebook relies on; the saved output below shows
# each package reported as already up-to-date on the author's machine.
nltk.download('vader_lexicon') # lexicon data for the VADER SentimentIntensityAnalyzer imported above
nltk.download('punkt') # tokenizer models (for word_tokenize)
nltk.download('stopwords') # stop-word lists read by preprocess() via stopwords.words("english")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rameshbabum\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rameshbabum\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rameshbabum\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the posts table. The path is relative, so the CSV must sit in the
# notebook's working directory.
Data = pd.read_csv("Ethereum_with_dates_test.csv")

In [3]:
# Discard, in place, every row whose body is exactly the '[removed]' placeholder.
Data.drop(index=Data.index[Data['selftext'].eq('[removed]')], inplace=True)

In [5]:
# Keep only the first row for each distinct post body. drop_duplicates returns a
# new frame, so Data itself is left unchanged.
Ethereum_duplicates_removed = Data.drop_duplicates(subset=['selftext'],keep='first')

In [6]:
# Display only: reset_index() returns a new frame and the result is not assigned,
# so Ethereum_duplicates_removed keeps its original row labels after this cell.
Ethereum_duplicates_removed.reset_index()

Unnamed: 0,index,created_utc,title,selftext,Date
0,0,1.637880e+09,Ethereum Facebook Group acquired - How is that...,I'm a member of the largest public Ethereum gr...,2021-11-25
1,1,1.637880e+09,CRO erc20 is safe?,"Hello i have some CRO on an hardware wallet, b...",2021-11-25
2,2,1.637878e+09,Write off question!,"Hi everyone, \n\n\nI'm a sole proprietor in H...",2021-11-25
3,3,1.637878e+09,MOONX | Low Cap |3m MC | Utility coin for Mark...,🔥MOONX\n\n&amp;#x200B;\n\nMoonX is a Utility T...,2021-11-25
4,4,1.637878e+09,Macy’s launching Polygon powered commemorative...,Macy’s dropped their first ever NFT collection...,2021-11-25
...,...,...,...,...,...
27763,27763,1.619821e+09,Market Cap For Crytpos?,I've been trying to get my head around how hig...,2021-04-30
27764,27764,1.619820e+09,Reasons why Cardano is more valuable than prot...,Cardano has the biggest market capitalization ...,2021-04-30
27765,27765,1.619820e+09,Ethereum or Bitcoin?,Hello! I’m relatively new to the cryptocurrenc...,2021-04-30
27766,27766,1.619820e+09,Crypto for Beginners - What Everyone Should Know,Hey people of the /r/CC subreddit:\n\nSo I dec...,2021-04-30


In [7]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the rest of
# the session. NOTE(review): this hides the warning rather than fixing its cause;
# the frames below are derived via drop_duplicates() and then mutated in place.
pd.options.mode.chained_assignment = None

In [8]:
# Remove rows with a missing post body. inplace=True mutates the existing frame
# object instead of creating a new one.
Ethereum_duplicates_removed.dropna(subset=['selftext'], inplace=True)

In [10]:
# Remove rows with a missing timestamp (mutates the frame in place).
# created_utc is later passed row by row to convertdate().
Ethereum_duplicates_removed.dropna(subset=['created_utc'], inplace=True)

In [12]:
# Alias, not a copy: both names refer to the same DataFrame object, so the column
# assignments made through `Ethereum` below also change Ethereum_duplicates_removed
# (until `Ethereum` is rebound to a new frame later in the notebook).
Ethereum = Ethereum_duplicates_removed

In [13]:
# Inspect the frame after the duplicate and missing-value filtering above.
Ethereum

Unnamed: 0,created_utc,title,selftext,Date
0,1.637880e+09,Ethereum Facebook Group acquired - How is that...,I'm a member of the largest public Ethereum gr...,2021-11-25
1,1.637880e+09,CRO erc20 is safe?,"Hello i have some CRO on an hardware wallet, b...",2021-11-25
2,1.637878e+09,Write off question!,"Hi everyone, \n\n\nI'm a sole proprietor in H...",2021-11-25
3,1.637878e+09,MOONX | Low Cap |3m MC | Utility coin for Mark...,🔥MOONX\n\n&amp;#x200B;\n\nMoonX is a Utility T...,2021-11-25
4,1.637878e+09,Macy’s launching Polygon powered commemorative...,Macy’s dropped their first ever NFT collection...,2021-11-25
...,...,...,...,...
27763,1.619821e+09,Market Cap For Crytpos?,I've been trying to get my head around how hig...,2021-04-30
27764,1.619820e+09,Reasons why Cardano is more valuable than prot...,Cardano has the biggest market capitalization ...,2021-04-30
27765,1.619820e+09,Ethereum or Bitcoin?,Hello! I’m relatively new to the cryptocurrenc...,2021-04-30
27766,1.619820e+09,Crypto for Beginners - What Everyone Should Know,Hey people of the /r/CC subreddit:\n\nSo I dec...,2021-04-30


In [14]:
# Coerce every post body to str, then run it through redditcleaner.clean
# (third-party helper; presumably strips Reddit-specific markup - see its docs).
Ethereum['selftext'] = Ethereum['selftext'].apply(str).map(redditcleaner.clean)

In [15]:
def clean(text):
    """Strip social-media markup and selected punctuation from ``text``.

    The substitutions run in this order:

    1. ``@mention`` handles (``@`` followed by ASCII letters/digits),
    2. ``#`` characters (the hashtag word itself is kept),
    3. the retweet marker ``RT`` followed by whitespace,
    4. ``http://`` / ``https://`` URLs, up to the next whitespace,
    5. the punctuation characters ``! , * ) @ # % ( & $ _ ? . ^ = : /``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        can leave leading or doubled spaces behind.
    """
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#', '', text)
    # \b anchors the marker to the start of a word; without it the pattern also
    # ate the tail of words ending in "RT" ("SMART contracts" -> "SMAcontracts").
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    # Raw string with a plain "/": the previous non-raw '\/' was an invalid
    # escape sequence that triggers a warning on recent Python versions.
    text = re.sub(r'[!,*)@#%(&$_?.^=:/]', '', text)
    return text

In [16]:
# Run the regex-based clean() on every post body, overwriting the column.
Ethereum['selftext'] = Ethereum['selftext'].apply(clean)

In [17]:
def remove_emojis(data):
    """Return ``data`` with every run of characters from the ranges below deleted.

    Several ranges overlap, and two of them are very wide: U+24C2-U+1F251 and
    U+10000-U+10FFFF together cover far more than emoji (for example the CJK
    ideographs at U+4E00 and above), so such text is removed as well.

    Args:
        data: The string to filter.

    Returns:
        The string with all matching characters removed; nothing is inserted
        in their place.
    """
    char_class_parts = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats subset
        "\U0001FA70-\U0001FAFF",
        "\U0001F900-\U0001F9FF",
        "\U00002702-\U000027B0",  # repeated entry, kept as in the original list
        "\U00002700-\U000027BF",  # dingbats block
        "\U00002600-\U000026FF",  # miscellaneous symbols
        "\U000024C2-\U0001F251",  # very wide span, includes CJK text
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    matcher = re.compile("[" + "".join(char_class_parts) + "]+", flags=re.UNICODE)
    return matcher.sub('', data)

In [18]:
# Strip emoji/symbol characters from every post body, overwriting the column.
Ethereum['selftext'] = Ethereum['selftext'].apply(remove_emojis)

In [19]:
# Display only: the re-indexed frame is shown but not assigned, so Ethereum's
# own index is unchanged by this cell.
Ethereum.reset_index()

Unnamed: 0,index,created_utc,title,selftext,Date
0,0,1.637880e+09,Ethereum Facebook Group acquired - How is that...,I'm a member of the largest public Ethereum gr...,2021-11-25
1,1,1.637880e+09,CRO erc20 is safe?,Hello i have some CRO on an hardware wallet bu...,2021-11-25
2,2,1.637878e+09,Write off question!,Hi everyone I'm a sole proprietor in Hawaii US...,2021-11-25
3,3,1.637878e+09,MOONX | Low Cap |3m MC | Utility coin for Mark...,MOONX MoonX is a Utility Token that will revi...,2021-11-25
4,4,1.637878e+09,Macy’s launching Polygon powered commemorative...,Macy’s dropped their first ever NFT collection...,2021-11-25
...,...,...,...,...,...
27763,27763,1.619821e+09,Market Cap For Crytpos?,I've been trying to get my head around how hig...,2021-04-30
27764,27764,1.619820e+09,Reasons why Cardano is more valuable than prot...,Cardano has the biggest market capitalization ...,2021-04-30
27765,27765,1.619820e+09,Ethereum or Bitcoin?,Hello I’m relatively new to the cryptocurrency...,2021-04-30
27766,27766,1.619820e+09,Crypto for Beginners - What Everyone Should Know,Hey people of the rCC subreddit So I decided I...,2021-04-30


In [21]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(data):
    """Normalise a text into a space-joined string of stemmed, non-stop-word tokens.

    Steps: lowercase the input, replace every character that is not an ASCII
    letter or digit with a space, split on whitespace, drop English stop
    words, and Porter-stem what remains.

    Args:
        data: The raw text (a ``str``).

    Returns:
        A single string of stemmed tokens separated by single spaces (not a
        list). It is the empty string when no token survives.
    """
    # convert to lowercase, then blank out everything that is not a letter or digit
    text = re.sub(r"[^a-zA-Z0-9]", " ", data.lower())
    # The stop-word list is loaded once and held as a set; the original code
    # re-evaluated stopwords.words("english") and scanned the list for every token.
    stop_words = _english_stopwords()
    # One stemmer per call instead of one per word.
    stemmer = PorterStemmer()
    stems = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return ' '.join(stems)


In [22]:
# Store the stemmed, stop-word-free version of each body in a new column.
# The column name's spelling ('cleaned_discription') is a runtime key that ends up
# in the exported CSV, so it must stay consistent with the cells below.
Ethereum['cleaned_discription'] = Ethereum['selftext'].apply(preprocess)

In [30]:
# Second de-duplication pass, this time on the normalised text, so posts that
# differ only in case/punctuation/stop words collapse to one row. This rebinds
# `Ethereum` to a new frame; Ethereum_duplicates_removed is no longer the same object.
Ethereum = Ethereum.drop_duplicates(subset=['cleaned_discription'],keep='first')

In [32]:
# Drop rows with a missing cleaned text (in place).
# NOTE(review): preprocess() always returns a str (possibly empty), so this
# presumably removes nothing; empty strings are not treated as missing - confirm intent.
Ethereum.dropna(subset=['cleaned_discription'], inplace=True)

In [35]:
# Display only: the result is not assigned, so Ethereum keeps the row labels it had
# before de-duplication (see the 'index' column in the output below).
Ethereum.reset_index()

Unnamed: 0,index,created_utc,title,selftext,Date,cleaned_discription
0,0,1.637880e+09,Ethereum Facebook Group acquired - How is that...,I'm a member of the largest public Ethereum gr...,2021-11-25,member largest public ethereum group facebook ...
1,1,1.637880e+09,CRO erc20 is safe?,Hello i have some CRO on an hardware wallet bu...,2021-11-25,hello cro hardwar wallet erc20 cro ethereum to...
2,2,1.637878e+09,Write off question!,Hi everyone I'm a sole proprietor in Hawaii US...,2021-11-25,hi everyon sole proprietor hawaii usa year une...
3,3,1.637878e+09,MOONX | Low Cap |3m MC | Utility coin for Mark...,MOONX MoonX is a Utility Token that will revi...,2021-11-25,moonx moonx util token revitalis bsc space bri...
4,4,1.637878e+09,Macy’s launching Polygon powered commemorative...,Macy’s dropped their first ever NFT collection...,2021-11-25,maci drop first ever nft collect consist 10 un...
...,...,...,...,...,...,...
25630,27762,1.619823e+09,Exposing reddit scammers and gaining knowledge!,uOfficialModerator0 and ucoinbasesupport dm'ed...,2021-04-30,uofficialmoderator0 ucoinbasesupport dm ed say...
25631,27763,1.619821e+09,Market Cap For Crytpos?,I've been trying to get my head around how hig...,2021-04-30,tri get head around high good boi might go loo...
25632,27765,1.619820e+09,Ethereum or Bitcoin?,Hello I’m relatively new to the cryptocurrency...,2021-04-30,hello rel new cryptocurr world invest smaller ...
25633,27766,1.619820e+09,Crypto for Beginners - What Everyone Should Know,Hey people of the rCC subreddit So I decided I...,2021-04-30,hey peopl rcc subreddit decid make post due hu...


In [29]:
# NOTE(review): the saved output below (11338) is not a DataFrame display, and this
# cell's execution count (29) is lower than that of the cells before it, so the
# output looks stale. Re-run with Restart & Run All to refresh it.
Ethereum

11338

In [33]:
from datetime import datetime

In [34]:
def convertdate(ts):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string in UTC.

    Args:
        ts: Seconds since the Unix epoch (int or float). Fractional seconds
            are dropped by the output format.

    Returns:
        The UTC date and time as a string, e.g. ``'1970-01-01 00:00:00'`` for ``0``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. The
    # timezone-aware call below yields the same wall-clock fields; `dt` is the
    # `datetime` module imported at the top of the notebook.
    return datetime.fromtimestamp(ts, tz=dt.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [35]:
# Overwrite the date-only 'Date' column with a full 'YYYY-MM-DD HH:MM:SS' UTC
# string derived from created_utc.
Ethereum['Date'] = Ethereum['created_utc'].apply(convertdate)

In [38]:
# Persist the cleaned frame. The index is written too (to_csv defaults to index=True),
# and it still holds the pre-deduplication row labels because no reset_index()
# result was ever assigned above.
Ethereum.to_csv('test_for_dates.csv')