In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import *
import matplotlib.pyplot as plt
from tokenizer import tokenizer

%matplotlib inline

PATH = '/home/epita/sim/sentiment_analysis/data/'

train_3 = f'{PATH}/data_train_3.csv'
test_3 = f'{PATH}/data_test_3.csv'
train_7 = f'{PATH}/data_train_7.csv'
train_16m_3 = f'{PATH}/training.1600000.processed.noemoticon.csv'

Using TensorFlow backend.


## One hot sample

In [16]:
text = 'The quick brown fox jumped over the lazy dog.'

words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)

result = one_hot(text, round(vocab_size*1.3))
print(result)

8
[6, 2, 7, 1, 4, 4, 6, 6, 7]


## Tweet Tokenizer sample

In [17]:
T = tokenizer.TweetTokenizer(preserve_case=False, preserve_url=False)

T.tokenize(text)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']

## Lemmatizer / Stemmer samples

In [22]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study


In [33]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn


tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

s = "This is a simple sentence that would be allowing us to try lemmatizing"

tokens = word_tokenize(s)
tokens_pos = pos_tag(tokens)
 
print(tokens_pos)


lems = [lemmatiser.lemmatize(word, tag_map[pos[0]]) for word, pos in tokens_pos]
print(lems)

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('simple', 'JJ'), ('sentence', 'NN'), ('that', 'WDT'), ('would', 'MD'), ('be', 'VB'), ('allowing', 'VBG'), ('us', 'PRP'), ('to', 'TO'), ('try', 'VB'), ('lemmatizing', 'VBG')]
['This', 'be', 'a', 'simple', 'sentence', 'that', 'would', 'be', 'allow', 'u', 'to', 'try', 'lemmatizing']


# Serious business

In [2]:
tweets = pd.read_csv(train_3, sep='\t', names=['ID', 'Sentiment', 'Tweet'])

tweets.shape

(50333, 3)

In [3]:
tweets.sample(5)

Unnamed: 0,ID,Sentiment,Tweet
2319,256539059999109121,negative,Buffalo Bills\u2019 Kyle Williams Misses Pract...
42105,639979942617436160,positive,@bowiescheekbone I just wrote a prayer to Bern...
35144,641038093928263680,neutral,"With an ace, Murray takes the 3rd set 7-6(2) ..."
3425,255108367037636611,neutral,@KolaptimusPrime He bet me 5 bucks the Lions w...
38821,676631249188839424,positive,"Kendrick is reinventing the rap industry, he m..."


In [4]:
tweets['Sentiment'] = tweets['Sentiment'].apply(lambda x: {'negative': -1, 'neutral': 0, 'positive': 1}[x])

In [5]:
tweets.sample(5)

Unnamed: 0,ID,Sentiment,Tweet
35,264224174153818113,0,I may exit off twitter and fb and thug with i...
42654,641278049208569856,1,Selling 1 ticket to Chris Brown this Thursday ...
38515,675816742258475008,0,Glamorous dinner options: clam chowder or a ch...
45850,638291604537630720,0,Chelsea and Juventus in Pogba Talks: Now with ...
10192,240977151531298816,1,$224.60. Not a bad payoff for a 2nd race Trif...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)

X = cv.fit_transform(tweets['Tweet'])
target = tweets['Sentiment']

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, target)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c, solver='saga', multi_class='auto')
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.6240464081373173
Accuracy for C=0.05: 0.6523363000635728
Accuracy for C=0.25: 0.6594882390336936
Accuracy for C=0.5: 0.6562301335028607
Accuracy for C=1: 0.6511443102352193


### Best and worst features

In [11]:
feature_to_coef = {word: coef for word, coef in zip(cv.get_feature_names(), lr.coef_[0])}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_positive)
    
print()
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_negative)

('gifford', -1.2377102595086205)
('seinfeld', -1.208227717966838)
('bout', -1.121596723479545)
('bless', -1.1170007079307602)
('greatest', -1.0976803765492371)

('worst', 2.4448066170437612)
('sucks', 2.221116192138116)
('stupid', 2.2174777929515206)
('fucked', 2.1713174119575873)
('fuck', 2.127179056507483)


## Learning (pseudo-code)

In [None]:
Sequential()
Embedding(len(word_index) + 1, EMBEDDING_PATH...?, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)
LSTM(32)
Dropout(0.2)
Dense(32, 'relu')
Dropout(0.2)
Dense(3, activation='softmax')

model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

model.fit(...)

In [None]:
# transfer:

model.layers.pop()
model.layers.pop()
Dense(150, 'relu')
Dense(64, 'relu')
Dense(7, 'softmax')
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

## Notes
* Télécharger 50 millions de tweets pour le word2vec
* Metric pour le script du prof: pearson
* Utiliser pearson au lieu d'accuracy
* L'année dernière le prof a eu 78% accuracy 

**Output attendu**: produire un fichier avec les mêmes noms de cartégories sur un fichier de test façon kaggle