In [136]:
!pip install emoji --upgrade
!pip install nltk



In [117]:
import pandas as pd
import numpy as np
import re
import statistics
import emoji

emoji_effects = {'Positive': {},
                 'Negative': {}}

tweet_labels = {'Positive': [],
                'Negative': []}

dif_labels = {'Positive_tweet': [],
              'Negative_tweet': []}

open_labels = {'Positive_tweet': [],
               'Negative_tweet': []}

retweet_labels = {'Positive_tweet': [],
                  'Negative_tweet': []}

def tweet_already_exists(tweet_cmp):
    for tweet in tweet_labels['Positive']:
        if tweet == tweet_cmp:
            return True
    
    for tweet in tweet_labels['Negative']:
        if tweet == tweet_cmp:
            return True

    return False

def tweet_has_both_pos_and_neg_e(emojis_used):
    contain_pos = False
    contain_neg = False
    for e in emojis_used:
        if e['emoji'] in emoji_effects['Positive']:
            contain_pos = True
        if e['emoji'] in emoji_effects['Negative']:
            contain_neg = True
    
    if contain_pos and contain_neg:
        return True
    else:
        return False
    
def label_tweet(tweets, open, dif, retweets):
    index = 0
    marker1 = '\''
    marker2 = '\''
    regexPattern = marker1 + '(.+?)' + marker2
    first_list = re.findall(regexPattern, tweets)

    marker1 = '\"'
    marker2 = '\"'
    regexPattern = marker1 + '(.+?)' + marker2
    second_list = re.findall(regexPattern, tweets)
    tweets = first_list + second_list

    retweets = retweets.split(",")
    retweets[0] = retweets[0][1:]
    retweets[len(retweets) - 1] = retweets[len(retweets) - 1][:-1]

    #safe check
    if len(retweets) == len(tweets):
        for tweet in tweets:
            if tweet_already_exists(tweet):
                index+=1
                continue
            e_list = emoji.emoji_list(tweet)
            if tweet_has_both_pos_and_neg_e(e_list):
                index+=1
                continue
            for e in e_list:
                if e['emoji'] in emoji_effects['Positive']:
                    tweet_labels['Positive'].append(tweet)
                    dif_labels['Positive_tweet'].append(dif)
                    open_labels['Positive_tweet'].append(open)
                    retweet_labels['Positive_tweet'].append(retweets[index])
                elif e['emoji'] in emoji_effects['Negative']:
                    tweet_labels['Negative'].append(tweet)
                    dif_labels['Negative_tweet'].append(dif)
                    open_labels['Negative_tweet'].append(open)
                    retweet_labels['Negative_tweet'].append(retweets[index])
            index+=1

def assign_tweet_label(dataset):
    for index, row in dataset.iterrows():
        label_tweet(row['Tweets'], row['Close'], row['Dif'], row['Retweets'])

def remove_zero_count_emoji():
    emoji_effects['Positive'] = {k:v for k,v in emoji_effects['Positive'].items() if v != 0}
    emoji_effects['Negative'] = {k:v for k,v in emoji_effects['Negative'].items() if v != 0}

def sort_emoji_effects():
    sorted_pos = sorted(emoji_effects['Positive'].items(), key=lambda x:x[1], reverse=True)
    emoji_effects['Positive'] = {}
    for item in sorted_pos:
        emoji_effects['Positive'][item[0]] = item[1]

    sorted_neg = sorted(emoji_effects['Negative'].items(), key=lambda x:x[1], reverse=True)
    emoji_effects['Negative'] = {}
    for item in sorted_neg:
        emoji_effects['Negative'][item[0]] = item[1]

def assign_emoji_label(dataset):
    for index, row in dataset.iterrows():
        add_emoji_effect(row['Dif'], row['Tweets'])

    e_positive = set(emoji_effects['Positive'])
    e_negative = set(emoji_effects['Negative'])
    for e in e_positive.intersection(e_negative):
        # if this emoji has more positive effect, remove it from the negative dict. 
        # Effectively assigning it to be positive
        if emoji_effects['Positive'][e] >= emoji_effects['Negative'][e]:
            del emoji_effects['Negative'][e]
        elif emoji_effects['Positive'][e] < emoji_effects['Negative'][e]:
            del emoji_effects['Positive'][e]
    sort_emoji_effects()
    remove_zero_count_emoji()
            

def add_emoji_effect(stock_dif, tweets):
        for tweet in tweets:
            e_list = emoji.emoji_list(tweet)
            for e in e_list:
                if stock_dif >= 0:
                    if e['emoji'] not in emoji_effects['Positive'].keys():
                        emoji_effects['Positive'][e['emoji']] = 0
                    else:
                        emoji_effects['Positive'][e['emoji']] += 1
                elif stock_dif < 0:
                    if e['emoji'] not in emoji_effects['Negative'].keys():
                        emoji_effects['Negative'][e['emoji']] = 0
                    else:
                        emoji_effects['Negative'][e['emoji']] += 1

def get_all_retweets(dataset):
    ret = []
    for index, row in dataset.iterrows():
        retweets = row['Retweets'].split(",")
        retweets[0] = retweets[0][1:]
        retweets[len(retweets) - 1] = retweets[len(retweets) - 1][:-1]
        ret = ret + retweets
    return ret

def get_retweet_num_std(dataset):
    return statistics.stdev([eval(i) for i in get_all_retweets(dataset)])

def get_retweet_num_mean(dataset):
    return statistics.mean([eval(i) for i in get_all_retweets(dataset)])

def preprocess_data(dataset):
    assign_emoji_label(dataset)
    assign_tweet_label(dataset)

In [132]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

dataset = pd.read_csv("results.csv")
dif_features = []
open_features = []
labels = []

#steps for preprocessing dataset:
# 1. emoticons are labeled by looking at stock price changes. Then put into 2 groups, negative and positive
# 2. Tweets that contain these emoticons are then labeled
# 3. Remove duplicate tweets
# 4. Remove tweets that have both positive and negative emoticons
preprocess_data(dataset)

retweet_num_std = get_retweet_num_std(dataset)
retweet_num_mean = get_retweet_num_mean(dataset)

def within_one_std(num_retweets):
    ret = False
    if (num_retweets < (retweet_num_mean + retweet_num_std)) and (num_retweets > (retweet_num_mean - retweet_num_std)):
        ret = True
    return ret

def within_two_std(num_retweets):
    ret = False
    if (num_retweets < (retweet_num_mean + (retweet_num_std * 2))) and (num_retweets > (retweet_num_mean - (retweet_num_std * 2))):
        ret = True
    return ret

def is_outlier(num_retweets):
    ret = False
    if (num_retweets > (retweet_num_mean + retweet_num_std)) or (num_retweets < (retweet_num_mean - retweet_num_std)):
        ret = True
    return ret

index = 0
for tweet_label in tweet_labels['Positive']:
    if within_one_std(int(retweet_labels['Positive_tweet'][index])):
        labels.append(1)
    elif within_two_std(int(retweet_labels['Positive_tweet'][index])):
        labels.append(2)
    elif is_outlier(int(retweet_labels['Positive_tweet'][index])):
        labels.append(3)
    index+=1
index = 0
for tweet_label in tweet_labels['Negative']:
    if within_one_std(int(retweet_labels['Negative_tweet'][index])):
        labels.append(-1)
    elif within_two_std(int(retweet_labels['Negative_tweet'][index])):
        labels.append(-2)
    elif is_outlier(int(retweet_labels['Negative_tweet'][index])):
        labels.append(-3)
    index+=1

for dif in dif_labels['Positive_tweet']:
    dif_features.append(dif)
for dif in dif_labels['Negative_tweet']:
    dif_features.append(dif)
for open in open_labels['Positive_tweet']:
    open_features.append(open)
for open in open_labels['Negative_tweet']:
    open_features.append(open)

features_arr = np.arange(2 * (len(tweet_labels['Positive']) + len(tweet_labels['Negative']))).reshape(2, (len(tweet_labels['Positive']) + len(tweet_labels['Negative'])))
features_arr[0] = np.array(open_features)
features_arr[1] = np.array(dif_features)
features_arr = features_arr.T
labels = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(features_arr, labels)
estimator = []
estimator.append(('NB', 
                  GaussianNB()))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))

vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(X_train, y_train)
y_pred = vot_hard.predict(X_test)

accuracy_score = accuracy_score(y_test, y_pred)
print("Accuracy score: ", accuracy_score)

precision_score = precision_score(y_test, y_pred, average='weighted')
print("Precision score: ", precision_score)

Accuracy score:  0.7631578947368421
Precision score:  0.6775879519754293


  _warn_prf(average, modifier, msg_start, len(result))
