# Classical Machine Learning Approach

In [155]:
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [156]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /home/dan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [157]:
df_train = pd.read_csv("tweets_stock_clean.csv").set_index('tweet_id')
df_test = pd.read_csv("tweets_stocks-full_agreement.csv").set_index('tweet_id')

targets = ['TRU', 'DIS', 'JOY', 'SAD', 'ANT', 'SUR', 'ANG', 'FEA']

to_delete = ['NEUTRAL', 'conf_tru_dis', 'conf_joy_sad', 'conf_ant_sur',
       'conf_ang_fea', 'num_annot']

df_train.drop(columns=to_delete, inplace=True)
df_test.drop(columns=to_delete, inplace=True)

df_train

Unnamed: 0_level_0,text,TRU,DIS,JOY,SAD,ANT,SUR,ANG,FEA
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
446333972562591745l,enquanto isso #lame4 rs,0,0,-1,-1,0,0,0,0
446341582183464960l,"PETR4 subiu na bolsa 13,50. Muito bem, surpres...",1,0,1,0,0,1,0,0
448105739962548224l,"vai, oibr4. um troux... ops... investidor prec...",0,1,0,1,0,0,1,0
446250331123773440l,$LREN3 - Lojas Renner (lren-nm) - Declaracao E...,0,0,0,0,-1,-1,-2,-2
448130972039385089l,Barriga para dentro em uma semana - http://t.c...,-2,-2,-2,-2,-2,-2,0,0
...,...,...,...,...,...,...,...,...,...
456788707576532992l,$EMBR3 - Embraer (embr-nm) - Aviso Aos Acionis...,0,0,0,0,1,0,0,0
458688220092715008l,ABEV3: Oportunidade de compra (+ de 20% de alt...,1,0,0,0,1,0,0,0
444219554114195457l,BBAS3_Mensal !!! Alguém tem algum recado para ...,0,1,-2,-2,-2,-2,0,0
451468663569141760l,"Bradesco PN (BBDC4), Gráfico Semanal. Estudo t...",0,0,0,0,0,0,0,0


In [158]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

df_train['text'] = df_train['text'].apply(lambda text: clean_tweet(text))
df_test['text'] = df_test['text'].apply(lambda text: clean_tweet(text))

In [159]:
for target in targets:
    df_train[target] = df_train[target] > 0 
    df_test[target] = df_test[target] > 0

In [160]:
vectorizer = CountVectorizer(analyzer="word", stop_words=stopwords, ngram_range=(1,2))

tweets = df_train['text'].values
freq_tweets = vectorizer.fit_transform(tweets)

tweets = df_test['text'].values
test_tweets = vectorizer.transform(tweets)

In [161]:
model = LogisticRegression()
results = df_test[targets].copy()

for target in targets:
    model.fit(freq_tweets, df_train[target])
    results[target] = model.predict(test_tweets)

In [162]:
y_pred = np.array(results).flatten()
y_true = np.array(df_test[targets]).flatten()

In [163]:
accuracy_score(y_pred, y_true)

0.9288922155688623

In [164]:
precision_score(y_pred, y_true)

0.15425531914893617

In [165]:
recall_score(y_pred, y_true)

0.48333333333333334

In [166]:
f1_score(y_pred, y_true)

0.2338709677419355