In [1]:
import tensorflow as tf
from keras import backend as K

num_cores = 4
CPU = True
GPU = False

if GPU:
    num_GPU = 1
    num_CPU = 1
if CPU:
    num_CPU = 1
    num_GPU = 0

config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,\
        inter_op_parallelism_threads=num_cores, allow_soft_placement=True,\
        device_count = {'CPU' : num_CPU, 'GPU' : num_GPU})
session = tf.Session(config=config)
K.set_session(session)

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import *
import matplotlib.pyplot as plt
from tokenizer import tokenizer as tweet_tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import *
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.models import Sequential


%matplotlib inline

PATH = '/home/epita/sim/sentiment_analysis/data/'

train_3 = f'{PATH}/data_train_3.csv'
test_3 = f'{PATH}/data_test_3.csv'
train_7 = f'{PATH}/data_train_7.csv'
train_16m_3 = f'{PATH}/training.1600000.processed.noemoticon.csv'

## One hot sample

In [16]:
text = 'The quick brown fox jumped over the lazy dog.'

words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)

result = one_hot(text, round(vocab_size*1.3))
print(result)

8
[6, 2, 7, 1, 4, 4, 6, 6, 7]


## Tweet Tokenizer sample

In [60]:
T = tweet_tokenizer.TweetTokenizer(preserve_case=False, preserve_url=False)

text = 'The quick brown fox jumped over the lazy dog.'

T.tokenize(text)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']

## Lemmatizer / Stemmer samples

In [22]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study


In [33]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn


tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

s = "This is a simple sentence that would be allowing us to try lemmatizing"

tokens = word_tokenize(s)
tokens_pos = pos_tag(tokens)
 
print(tokens_pos)


lems = [lemmatiser.lemmatize(word, tag_map[pos[0]]) for word, pos in tokens_pos]
print(lems)

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('simple', 'JJ'), ('sentence', 'NN'), ('that', 'WDT'), ('would', 'MD'), ('be', 'VB'), ('allowing', 'VBG'), ('us', 'PRP'), ('to', 'TO'), ('try', 'VB'), ('lemmatizing', 'VBG')]
['This', 'be', 'a', 'simple', 'sentence', 'that', 'would', 'be', 'allow', 'u', 'to', 'try', 'lemmatizing']


# Serious business

In [3]:
tweets = pd.read_csv(train_3, sep='\t', names=['ID', 'Class', 'Tweet'])

tweets.shape

(50333, 3)

In [4]:
tweets.sample(5)

Unnamed: 0,ID,Class,Tweet
12085,229397284352299009,positive,Miss WV Teen USA 1ST runner up at Miss Teen US...
2106,264191178113708034,neutral,@ImmortalTech Dublin this Saturday... Gotta ge...
3731,263149282939527168,neutral,If you\u2019re calling this little thing right...
1827,263740861031010304,neutral,HT: Norwich 0-0 #Spurs. We\u2019ve edged the 1...
45312,638491743185297408,negative,OMG A Trump supporter on #CNN just suggested t...


In [8]:
from TP_transfer_learning_2018 import *
from TP_transfer_learning_2018.preprocessing import standardization


tweets['Sentiment'] = tweets['Class'].apply(lambda x: {'negative': 0, 'neutral': 1, 'positive': 2}[x])
tweets['Tweet'] = tweets['Tweet'].apply(lambda x: standardization(x))

In [9]:
display(tweets.sample(5))
tweets.sample(5)['Tweet'].iloc[0]

Unnamed: 0,ID,Class,Tweet,Sentiment
358,263251461268451331,neutral,wiki say 3rd marriage rt hahaha try understand...,1
29187,636697580571918336,neutral,t2 1o white sox challenge play 1st call field ...,1
48836,261600914807324673,positive,calle softy happy thursday bikers actually mak...,2
18517,638215956557271040,neutral,think might walk t-mobile get phone tomorrow d...,1
42167,641335091088728064,neutral,go bernie sander rally woodruff park 17th,1


'hey friend dst end sunday 11/4 give heads-up set clock back 1hr bed saturday evening ☺ ️'

In [10]:
train_tweets, sentiments = tweets['Tweet'], tweets['Sentiment']

all_tweets = train_tweets # + test_tweets
tokenizer = Tokenizer(filters=' ')
tokenizer.fit_on_texts(all_tweets)
word_index = tokenizer.word_index

In [11]:
train_sequences = tokenizer.texts_to_sequences(train_tweets)

sequences = train_sequences # + test_sequences
MAX_SEQUENCE_LENGTH = 0
for elt in sequences:
    if len(elt) > MAX_SEQUENCE_LENGTH:
        MAX_SEQUENCE_LENGTH = len(elt)

MAX_SEQUENCE_LENGTH

32

In [12]:
train_sequences = pad_sequences(train_sequences, MAX_SEQUENCE_LENGTH)
train_sequences.shape

(50333, 32)

In [13]:
googlenews_w2v = KeyedVectors.load_word2vec_format('data/embeddings/GoogleNews-vectors-negative300.bin', binary=True)

In [14]:
targets = to_categorical(sentiments, 3)
nb_words = len(word_index) + 1

EMBEDDING_DIM = 300
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

oov = []  # Out of vocabulary
oov.append((np.random.rand(EMBEDDING_DIM) * 2.0) - 1.0)
oov = oov / np.linalg.norm(oov)

print(oov.shape)

for word, i in word_index.items():
    if word in googlenews_w2v.vocab:
        embedding_matrix[i] = googlenews_w2v.word_vec(word)
    else:
        embedding_matrix[i] = oov
        
print(embedding_matrix.shape)

(1, 300)
(36968, 300)


In [15]:
X_train, X_val, y_train, y_val = train_test_split(train_sequences, targets, test_size=0.3)

print('training set: ' + str(len(X_train)) + ' samples')
print('validation set: ' + str(len(X_val)) + ' samples')

print('x_train:', X_train.shape)
print('y_train:', y_train.shape)

training set: 35233 samples
validation set: 15100 samples
x_train: (35233, 32)
y_train: (35233, 3)


In [16]:
model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False, name='embedding_layer'))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
          
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 32, 300)           11090400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                42624     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
Total params: 11,134,179
Trainable params: 43,779
Non-trainable params: 11,090,400
___________________________________________________________

In [17]:
model.fit(X_train, y_train, batch_size=50, validation_data=(X_val, y_val), epochs=6)

Train on 35233 samples, validate on 15100 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f51ccc4ab00>

In [18]:
model.fit(X_train, y_train, batch_size=50, validation_data=(X_val, y_val), epochs=6)

Train on 35233 samples, validate on 15100 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f52fcc82518>

### Logistic Regression

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)

X = cv.fit_transform(tweets['Tweet'])
target = tweets['Sentiment']

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, target)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c, solver='saga', multi_class='auto')
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.6240464081373173
Accuracy for C=0.05: 0.6523363000635728
Accuracy for C=0.25: 0.6594882390336936
Accuracy for C=0.5: 0.6562301335028607
Accuracy for C=1: 0.6511443102352193


#### Best and worst features

In [11]:
feature_to_coef = {word: coef for word, coef in zip(cv.get_feature_names(), lr.coef_[0])}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_positive)
    
print()
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_negative)

('gifford', -1.2377102595086205)
('seinfeld', -1.208227717966838)
('bout', -1.121596723479545)
('bless', -1.1170007079307602)
('greatest', -1.0976803765492371)

('worst', 2.4448066170437612)
('sucks', 2.221116192138116)
('stupid', 2.2174777929515206)
('fucked', 2.1713174119575873)
('fuck', 2.127179056507483)


## Learning (pseudo-code)

In [None]:
Sequential()
Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)
LSTM(32)
Dropout(0.2)
Dense(32, 'relu')
Dropout(0.2)
Dense(3, activation='softmax')

model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

model.fit(...)

In [None]:
# transfer:

model.layers.pop()
model.layers.pop()
Dense(150, 'relu')
Dense(64, 'relu')
Dense(7, 'softmax')
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

## Notes
* Télécharger 50 millions de tweets pour le word2vec
* Metric pour le script du prof: pearson
* Utiliser pearson au lieu d'accuracy
* L'année dernière le prof a eu 78% accuracy 

**Output attendu**: produire un fichier avec les mêmes noms de cartégories sur un fichier de test façon kaggle