In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from datetime import datetime
import collections
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag

from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
eng = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/datasets/cryptocurrency_sentiment/tweets_labelled_09042020_16072020.csv', sep=';').set_index('id')
eng.shape

(5000, 3)

In [6]:
eng.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 77522 to 301411
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   created_at  5000 non-null   object
 1   text        5000 non-null   object
 2   sentiment   1300 non-null   object
dtypes: object(3)
memory usage: 156.2+ KB


In [7]:
eng = eng[eng['sentiment'].notnull()]

In [8]:
ticker_pattern = re.compile(r'(^\$[A-Z]+|^\$ES_F)')
ht_pattern = re.compile(r'#\w+')

ticker_dic = collections.defaultdict(int)
ht_dic = collections.defaultdict(int)

for text in eng['text']:
    for word in text.split():
        if ticker_pattern.fullmatch(word) is not None:
            ticker_dic[word[1:]] += 1

            word = word.lower()
            if ht_pattern.fullmatch(word) is not None:
                ht_dic[word] += 1

In [9]:
charonly = re.compile(r'[^a-zA-Z\s]')
handle_pattern = re.compile(r'@\w+')
emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE)
url_pattern = re.compile(
    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
pic_pattern = re.compile('pic\.twitter\.com/.{10}')
special_code = re.compile(r'(&amp;|&gt;|&lt;)')
tag_pattern = re.compile(r'<.*?>')

STOPWORDS = set(stopwords.words('english')).union(
    {'rt', 'retweet', 'RT', 'Retweet', 'RETWEET'})

lemmatizer = WordNetLemmatizer()

def hashtag(phrase):
    return ht_pattern.sub(' ', phrase)

def remove_ticker(phrase):
    return ticker_pattern.sub('', phrase)
    
def specialcode(phrase):
    return special_code.sub(' ', phrase)

def emoji(phrase):
    return emoji_pattern.sub(' ', phrase)

def url(phrase):
    return url_pattern.sub('', phrase)

def pic(phrase):
    return pic_pattern.sub('', phrase)

def html_tag(phrase):
    return tag_pattern.sub(' ', phrase)

def handle(phrase):
    return handle_pattern.sub('', phrase)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    
    # DIS, ticker symbol of Disney, is interpreted as the plural of "DI" 
    # in WordCloud, so I converted it to Disney
    phrase = re.sub('DIS', 'Disney', phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"(he|He)\'s", "he is", phrase)
    phrase = re.sub(r"(she|She)\'s", "she is", phrase)
    phrase = re.sub(r"(it|It)\'s", "it is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"(\'ve|has)", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def onlychar(phrase):
    return charonly.sub('', phrase)

def remove_stopwords(phrase):
    return " ".join([word for word in str(phrase).split()\
                     if word not in STOPWORDS])

def tokenize_stem(phrase):   
    tokens = word_tokenize(phrase)
    stem_words =[]
    for token in tokens:
        word = lemmatizer.lemmatize(token)
        stem_words.append(word)        
    buf = ' '.join(stem_words)    
    return buf

In [10]:
def arrange_text(ds):
    ds['text2'] = ds['text'].apply(emoji)
    ds['text2'] = ds['text2'].apply(handle)
    ds['text2'] = ds['text2'].apply(specialcode)
    ds['text2'] = ds['text2'].apply(hashtag)
    ds['text2'] = ds['text2'].apply(url)
    ds['text2'] = ds['text2'].apply(pic)
    ds['text2'] = ds['text2'].apply(html_tag)
    ds['text2'] = ds['text2'].apply(onlychar)
    ds['text2'] = ds['text2'].apply(decontracted)
    ds['text2'] = ds['text2'].apply(onlychar)
    ds['text2'] = ds['text2'].apply(tokenize_stem)
    ds['text2'] = ds['text2'].apply(remove_stopwords)

In [11]:
arrange_text(eng)
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,Yo Enter WIN Monarch Tokens US Stock Market Cr...
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,surcharge fuel removed The surcharge Rs impose...
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,Net issuance increase fund fiscal program yiel...
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,How much Amazons traffic served Fastly Help u ...
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,AMD Ryzen desktop CPUs looking great track launch


In [12]:
eng = eng.replace({'sentiment': 'positive'}, {'sentiment': 0})
eng = eng.replace({'sentiment': 'neutral'}, {'sentiment': 1})
eng = eng.replace({'sentiment': 'negative'}, {'sentiment': 2})

eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch


In [13]:
eng['dic_s'] = ""
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2,dic_s
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...,
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...,
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...,
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...,
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch,


In [14]:
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('Y'):
        return wn.VERB
    return None

def clean_text(text):
    text = text.replace('<br />', "")
    return text

def swn_polarity(text):
    sentiment = 0.0
    tokens_count = 0

    text = clean_text(text)

    raw_sentences = sent_tokenize(text)
    for raw_sentence in raw_sentences:
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))

        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)

            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos = wn_tag)
            if not synsets:
                continue

            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())

            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1

    if not tokens_count:
        return 1

    if sentiment > 0:
        return 0

    elif sentiment == 0:
        return 1

    return 2

for index, row in eng.loc[:, :].iterrows():
    text = row['text2']
#    print(f'Text {index} : {text.strip()}')
 #   print('Sentiment :', row['sentiment'])
  #  print('Predicted Sentiment polarity : ', swn_polarity(text))
   # print('-'*60)
    eng['dic_s'][index] = swn_polarity(text)

In [15]:
eng['dic_s'].value_counts()

0    539
1    442
2    319
Name: dic_s, dtype: int64

In [16]:
eng.head(20)

Unnamed: 0_level_0,created_at,text,sentiment,text2,dic_s
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...,0
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...,1
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...,2
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...,0
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch,0
27027,2020-04-12 21:52:56+00:00,RT @QuantTrend: Reduce your portfolio RISK! GO...,0,Reduce portfolio RISK GOLD perfect tail HEDGE ...,0
472959,2020-06-09 05:23:06+00:00,$863.69 Million in Sales Expected for Spirit A...,0,Million Sales Expected Spirit AeroSystems Hold...,1
392845,2020-06-02 01:12:29+00:00,RT @ArjunKharpal: #Apple has cut the prices of...,2,cut price iPhone range China Its uncommon move...,0
313771,2020-05-07 04:58:41+00:00,RT @SMA_alpha: The #CDC U.S. New Case data has...,2,The US New Case data day lag saw another encou...,0
267894,2020-05-04 15:16:29+00:00,Where to Look for Dependable Dividends\nRead M...,1,Where Look Dependable Dividends Read More,0


In [17]:
true = 0

for index, row in eng.loc[:, :].iterrows():
    if eng['sentiment'][index] == eng['dic_s'][index]:
        true += 1

print(true)

641


In [18]:
print('accuracy = ', 641/1300)

accuracy =  0.4930769230769231


### 치명적 실수
긍정을 부정으로, 부정을 긍적으로

In [28]:
fatal_mistakes = 0

for index, row in eng.loc[:, :].iterrows():
    if (eng['sentiment'][index] - eng['dic_s'][index])**2 == 4:
        fatal_mistakes += 1

print(fatal_mistake)

209


In [32]:
print('fatal_mistakes = ', fatal_mistake/1300)

fatal_mistakes =  0.16076923076923078


### 약간의 실수
한단계씩 다를때

In [None]:
mistakes = 0

for index, row in eng.loc[:, :].iterrows():
    if (eng['sentiment'][index] - eng['dic_s'][index])**2 == 1:
        mistakes += 1

print(mistakes)

In [33]:
print('mistakes = ', mistakes/1300)

mistakes =  0.34615384615384615


In [31]:
true+fatal_mistakes+mistakes

1300