In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [17]:
# Read the two CSV files into DataFrames; the paths are relative to the
# working directory the notebook is run from.
stock = pd.read_csv('file/stockmarket.csv')
crypto = pd.read_csv('file/crypto.csv')

### StockMarket subreddits

In [3]:
# Keep only the 'title' and 'subreddit' columns
stock = stock.loc[:, ['title', 'subreddit']]

In [4]:
def tokenize(words):
    """Split the string ``words`` into a list of word tokens.

    Tokens are the runs of word characters matched by the tokenizer's
    pattern; whatever lies between them (spaces, punctuation) is dropped.
    """
    splitter = RegexpTokenizer(pattern=r'\w+')
    return splitter.tokenize(words)

In [5]:
# Tokenize each title, then join its tokens back into one ', '-separated
# string, so the 'token' column holds str values rather than lists.
# NOTE(review): the sentiment scorer is later fed this joined string, not the
# raw title, so the title's original punctuation is gone — confirm intended.
stock['token'] = stock['title'].map(tokenize).apply(', '.join)

In [7]:
# Display the frame, which now includes the joined 'token' column
stock

Unnamed: 0,title,subreddit,token
0,I unveil the simple strategy that allows me an...,StockMarket,"I, unveil, the, simple, strategy, that, allows..."
1,OPT up today,StockMarket,"OPT, up, today"
2,Top 5 Crypto and how they work?,StockMarket,"Top, 5, Crypto, and, how, they, work"
3,Citius: Halt For Superiority On The Horizon (N...,StockMarket,"Citius, Halt, For, Superiority, On, The, Horiz..."
4,Testing,StockMarket,Testing
...,...,...,...
995,New Ape HODLR,StockMarket,"New, Ape, HODLR"
996,Karma,StockMarket,Karma
997,DONT LET THEM SHORT THIS MORGAGE REIT #IVR GRE...,StockMarket,"DONT, LET, THEM, SHORT, THIS, MORGAGE, REIT, I..."
998,$HITI es empresa de FUTURO!,StockMarket,"HITI, es, empresa, de, FUTURO"


In [8]:
# Instantiate the sentiment analyzer (NLTK's VADER implementation)
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — confirm the environment setup.
sia = SentimentIntensityAnalyzer()

In [9]:
# Give the titles a fresh 0..n-1 index so they line up row-for-row with the
# score frame built below
stock = stock.reset_index(drop=True)

# One polarity dictionary per tokenized title
sia_dicts = list(map(sia.polarity_scores, stock['token']))

# One row per title, one column per key of the polarity dictionaries
sia_df = pd.DataFrame(sia_dicts)

# Titles and their scores side by side
df_stock = pd.concat([stock, sia_df], axis=1)

In [10]:
# Summary statistics of the compound score for the StockMarket titles
df_stock[['compound']].describe()

Unnamed: 0,compound
count,1000.0
mean,0.120793
std,0.336119
min,-0.8442
25%,0.0
50%,0.0
75%,0.361375
max,0.9757


In [11]:
def polar(num, threshold=0.05):
    """Label a compound sentiment score as 'Positive', 'Negative' or 'Neutral'.

    Parameters
    ----------
    num : float
        Compound score to classify.
    threshold : float, optional
        Half-width of the neutral band around zero. Defaults to 0.05, the
        cut-off previously hard-coded in this function.

    Returns
    -------
    str
        'Positive' when ``num > threshold``, 'Negative' when
        ``num < -threshold``, otherwise 'Neutral'. Both comparisons are
        strict, so a score exactly on either boundary is 'Neutral'; NaN
        fails both comparisons and is therefore 'Neutral' as well.
    """
    if num > threshold:
        return 'Positive'
    if num < -threshold:
        return 'Negative'
    return 'Neutral'

In [12]:
# Label every title from its compound score
df_stock['polar'] = df_stock['compound'].map(polar)

In [13]:
# Display titles together with their sentiment scores and 'polar' label
df_stock

Unnamed: 0,title,subreddit,token,neg,neu,pos,compound,polar
0,I unveil the simple strategy that allows me an...,StockMarket,"I, unveil, the, simple, strategy, that, allows...",0.000,0.804,0.196,0.7456,Positive
1,OPT up today,StockMarket,"OPT, up, today",0.000,1.000,0.000,0.0000,Neutral
2,Top 5 Crypto and how they work?,StockMarket,"Top, 5, Crypto, and, how, they, work",0.000,0.769,0.231,0.2023,Positive
3,Citius: Halt For Superiority On The Horizon (N...,StockMarket,"Citius, Halt, For, Superiority, On, The, Horiz...",0.000,0.769,0.231,0.3400,Positive
4,Testing,StockMarket,Testing,0.000,1.000,0.000,0.0000,Neutral
...,...,...,...,...,...,...,...,...
995,New Ape HODLR,StockMarket,"New, Ape, HODLR",0.000,1.000,0.000,0.0000,Neutral
996,Karma,StockMarket,Karma,0.000,1.000,0.000,0.0000,Neutral
997,DONT LET THEM SHORT THIS MORGAGE REIT #IVR GRE...,StockMarket,"DONT, LET, THEM, SHORT, THIS, MORGAGE, REIT, I...",0.000,0.687,0.313,0.6249,Positive
998,$HITI es empresa de FUTURO!,StockMarket,"HITI, es, empresa, de, FUTURO",0.000,1.000,0.000,0.0000,Neutral


In [15]:
# Share of StockMarket titles per sentiment label (normalize=True returns
# fractions instead of raw counts)
df_stock['polar'].value_counts(normalize=True)

Neutral     0.521
Positive    0.351
Negative    0.128
Name: polar, dtype: float64

### CryptoCurrency subreddits

In [18]:
# Keep only the 'title' and 'subreddit' columns
crypto = crypto.loc[:, ['title', 'subreddit']]

In [19]:
# Tokenize each title, then join its tokens back into one ', '-separated
# string, so the 'token' column holds str values rather than lists.
crypto['token'] = crypto['title'].map(tokenize).apply(', '.join)

In [21]:
# Generate polarity dictionaries
sia_dicts = [sia.polarity_scores(token) for token in crypto['token']]

# Cast dictionaries to dataframe
sia_df = pd.DataFrame(sia_dicts)

# Reset the index of the crypto frame: sia_df carries a fresh 0..n-1 index,
# and the column-wise concat below aligns rows on index labels, so crypto
# must use the same labels.
crypto = crypto.reset_index(drop = True)

# concat title and score
df_crypto = pd.concat([crypto, sia_df], axis = 1)

In [23]:
# Summary statistics of the compound score for the crypto titles
df_crypto[['compound']].describe()

Unnamed: 0,compound
count,1000.0
mean,0.082168
std,0.308086
min,-0.9801
25%,0.0
50%,0.0
75%,0.2732
max,0.9403


In [24]:
# Label every title from its compound score
df_crypto['polar'] = df_crypto['compound'].map(polar)

In [25]:
# Share of crypto titles per sentiment label (normalize=True returns
# fractions instead of raw counts)
df_crypto['polar'].value_counts(normalize=True)

Neutral     0.550
Positive    0.314
Negative    0.136
Name: polar, dtype: float64

### The proportions of positive and negative posts are highly similar in the two subreddits (35.1% vs. 31.4% positive, 12.8% vs. 13.6% negative). The two subreddits may have very similar user groups, and their users may share a very similar way of writing titles.