-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reddit wallstreetbets
134 lines (99 loc) · 3.3 KB
/
Reddit wallstreetbets
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
!pip install praw
import praw
import pandas as pd
# Read-only Reddit client. The three credential strings are blank
# placeholders and must be filled in with the values of a registered
# Reddit app before this cell can talk to the API.
reddit_read_only = praw.Reddit(
    client_id="",
    client_secret="",
    user_agent="",
)
subreddit = reddit_read_only.subreddit('wallstreetbets')

# Collect today's top posts whose title mentions at least one "cashtag",
# i.e. a whitespace-separated word that starts with "$".
# One (title, url, score, comment count) tuple is stored per matching post.
top_posts = []
# `time_filter` is passed by keyword: newer PRAW releases deprecate passing
# it positionally.
for post in subreddit.top(time_filter='day'):
    if any(word.startswith('$') for word in post.title.split()):
        top_posts.append(
            (post.title, post.url, post.score, post.num_comments)
        )

df = pd.DataFrame(
    top_posts,
    columns=['title', 'url', 'score', 'total comments'],
)
df  # bare expression: shows the table when run as a notebook cell
df.to_csv('top_posts.csv')
from praw.models import MoreComments

# Work on the first post of the table built above.
url = df.url[0]
url  # bare expression: shows the URL when run as a notebook cell
submission = reddit_read_only.submission(url=url)

# Collect the body of every entry yielded by iterating submission.comments.
# MoreComments entries are skipped before `.body` is read; isinstance() is
# used so that subclasses of MoreComments are skipped as well.
post_comments = [
    comment.body
    for comment in submission.comments
    if not isinstance(comment, MoreComments)
]

comments_df = pd.DataFrame(post_comments, columns=['comment'])
comments_df.to_csv('comment_01.csv')
import nltk

# Fetch the NLTK data packages non-interactively.
# The argument-less nltk.download() call was removed: it opens the
# interactive downloader and waits for user input, which blocks unattended
# runs. Everything needed is requested explicitly instead.
nltk.download([
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "vader_lexicon",
    # Tokenizer models required by nltk.word_tokenize, which get_tokens
    # calls. "punkt" serves older NLTK releases and "punkt_tab" newer ones;
    # an id unknown to the installed release is reported and skipped.
    "punkt",
    "punkt_tab",
])
import pandas as pd
import warnings

# NOTE(review): this silences every warning raised from here on, not just
# the ones coming from the libraries used below.
warnings.filterwarnings('ignore')

# enter the csv file of the comment to analyze
# keep_default_na=False keeps comment bodies such as "NA", "null" or "nan"
# as text. With the default settings read_csv turns them into float NaN,
# and the string-based processing below (doc.lower(), sentiment scoring)
# would then fail on those rows.
df = pd.read_csv("comment_01.csv", keep_default_na=False)

# NOTE(review): this discards the first *comment* row -- the CSV header line
# is already consumed by read_csv. Confirm that skipping the first comment
# is intended.
df.drop(index=df.index[0], inplace=True)
df.head()
def get_tokens(doc):
    """Turn one comment into a list of normalised word stems.

    Processing order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep purely alphabetic tokens, drop English
    stop words, then Porter-stem what is left.

    Args:
        doc: The comment text (a string).

    Returns:
        A list of stemmed tokens in their original order; duplicates are
        kept so the result can be used for frequency counts.
    """
    # A set gives O(1) membership tests; the stop-word list would otherwise
    # be scanned linearly once per token.
    stopwords = set(nltk.corpus.stopwords.words("english"))
    porter = nltk.PorterStemmer()

    return [
        porter.stem(token)
        for token in nltk.word_tokenize(doc.lower())
        if token.isalpha() and token not in stopwords
    ]
# Tokenize every comment; each cell of the new column holds a token list.
df['comment_tokens'] = df["comment"].apply(get_tokens)

# Flatten the per-comment token lists into a single list, keeping order
# and duplicates.
all_comment_tokens = [
    token
    for comment_tokens in df["comment_tokens"]
    for token in comment_tokens
]

freq_comment = nltk.FreqDist(all_comment_tokens)

# (token, count) pairs, most frequent first
sorted_freq_comment = sorted(
    freq_comment.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_freq_comment  # bare expressions: shown when run as notebook cells
freq_comment
freq_comment.plot(30)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
def gen_wordcloud(all_tokens):
    """Draw a word cloud for the given tokens and show it with matplotlib.

    Args:
        all_tokens: Iterable of token strings. They are joined with single
            spaces and the resulting text is passed to
            ``WordCloud.generate``.
    """
    text = " ".join(all_tokens)
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    )
    wordcloud = cloud_builder.generate(text)

    # Square 8x8 figure, image only: no axes and no padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Draw the word cloud from the flattened list of comment tokens.
gen_wordcloud(all_comment_tokens)
from nltk.sentiment import SentimentIntensityAnalyzer
# One shared analyzer instance; gen_sentiment reads it as a global.
sia = SentimentIntensityAnalyzer()
def gen_sentiment(doc):
    """Return the "compound" sentiment score of a piece of text.

    Args:
        doc: Text passed to the module-level analyzer ``sia``.

    Returns:
        The value stored under the ``"compound"`` key of
        ``sia.polarity_scores(doc)``.
    """
    return sia.polarity_scores(doc)["compound"]
# Score every comment with its compound sentiment value.
df['summary_comment_score'] = df["comment"].apply(gen_sentiment)

# Split on the sign of the score. Comments scoring exactly 0 fall into
# neg_text_df, so that frame holds zero and negative scores together.
comment_scores = df["summary_comment_score"]
pos_text_df = df[comment_scores > 0]
neg_text_df = df[comment_scores <= 0]

# Print the size of each group: positive first, then the rest.
for group in (pos_text_df, neg_text_df):
    print(len(group))