# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [1]:
import io
import os
import numpy as np
import scipy

In [2]:
PATH_TO_DATA = "data/"

# 1) Monolingual (English) word embeddings 

In [3]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
        self.word2id = self.word2vec.keys()
        self.id2word = [k for k in self.word2id]
        self.embeddings = np.array([vec for vec in self.word2vec.values()], ndmin = 2)
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        # K most similar words: self.score  -  np.argsort
        # K most similar words: self.score  -  np.argsort
        scorelist=[]
        #size = len(self.id2word)
        #count = 0
        for wo in list(self.id2word):
            scorelist.append(self.score(w,wo))
            #count = count + 1
            #if count%1000 == 0:
            #    print(str(count)+'/'+str(size)+' processed')
        #idxs=sorted(scorelist)[::-1][:K+1]
        idxs=np.argsort(scorelist)[::-1][:K+1]
        most_similar_word = []
        for i in idxs:
            most_similar_word.append(list(self.word2vec.keys())[i])
        return most_similar_word
        
        return idxs

    def score(self, w1, w2):
        # cosine similarity: np.dot  -  np.linalg.norm
        idx1=list(self.id2word).index(w1)
        idx2=list(self.id2word).index(w2)
        v1=self.embeddings[idx1]
        v2=self.embeddings[idx2] 
        score=np.dot(v1,v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2))  # dot-product of normalized vector = cosine similarity
        return  score

In [5]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=55000)

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 55000 pretrained word vectors
cat dog 0.671683666279249
dog pet 0.6842064029669219
dogs cats 0.7074389328052403
paris france 0.7775108541288561
germany berlin 0.7420295235998392
['cat', 'cats', 'kitty', 'kitten', 'feline', 'kitties']
['dog', 'dogs', 'puppy', 'Dog', 'doggie', 'canine']
['dogs', 'dog', 'Dogs', 'doggies', 'canines', 'puppies']
['paris', 'france', 'Paris', 'london', 'berlin', 'europe']
['germany', 'europe', 'german', 'berlin', 'france', 'italy']


In [6]:
class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
    
    
    def encode(self,sentences, idf=False):
        # takes a list of sentences, outputs a numpy array of sentence embeddings
        # see TP1 for help
        self.sentemb = []
        # self.sentences = sentences
        for sent in sentences:
            if idf is False:
                mv=np.mean([w2v.word2vec[w] if w in w2v.word2vec else np.zeros((300,))for w in sent ], axis=0)
                self.sentemb.append(mv)
            else:
                # idf-weighted mean of word vectors
                mv=np.mean([w2v.word2vec[w]*idf[w] if w in w2v.word2vec else np.zeros((300,))for w in sent ], axis=0)
                self.sentemb.append(mv)
        
        return np.vstack(self.sentemb)

    def most_similar(self, s, sentences, idf=False, K=5):
        # get most similar sentences and **print** them
        keys = self.encode(sentences)
        query = self.encode([s])
        idx_s =sentences.index(s)
        similarities = keys*query/np.linalg.norm(keys)/np.sqrt(np.sum(keys**2, axis = 0))
        similarities = np.sum(similarities, axis = 1)
        idxs=np.argsort(similarities)[-K:]
        # return [sentences[i] for i in np.argsort(similarities)[-K:]]
        print('\n Top-%s similar sentences of \n"%s" : \n\n' % (K, ' '.join(sentences[idx_s])))
        for i, idx in enumerate(idxs):
            print('%s) %s' % (i + 1, ' '.join(sentences[idx])))
        
        
    def score(self, s1, s2, idf=False):
        # cosine similarity: use   np.dot  and  np.linalg.norm
        sentence_emb = self.encode(sentences)
        idx1 = sentences.index(s1)
        idx2 = sentences.index(s2)
        v1 = sentence_emb[idx1]
        v2 = sentence_emb[idx2]
        score = np.dot(v1, v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2))
        print('\n The score between "%s" and "%s" is %s' %(' '.join(sentences[idx1]),' '.join(sentences[idx2]), score))
    
  
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        idf={}
        for sent in sentences:
            for w in set(sent):
                idf[w]= max(1,np.log10(len(sentences) / (idf.get(w, 0) + 1)))
        return idf

In [7]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=5000)
s2v = BoV(w2v)

# Load sentences in "PATH_TO_DATA/sentences.txt"
sentences = []
with open('data/sentences.txt', 'r', encoding = 'utf-8') as file:
    for line in file:
        sentences += [line.split(' ')[:-1]]# we get rid of the \n
file.close()

# Build idf scores for each word
idf = {} if True else s2v.build_idf(sentences)

# You will be evaluated on the output of the following:
s2v.most_similar('' if not sentences else sentences[10], sentences)  # BoV-mean
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13])


idf = {}  
s2v.most_similar('' if not sentences else sentences[10], sentences, idf)  # BoV-idf
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf)

Loaded 5000 pretrained word vectors

 Top-5 similar sentences of 
"1 smiling african american boy ." : 


1) a boy jumps on another boy .
2) a boy skateboarding
3) teen boy playing billiards .
4) boy riding a horse .
5) boy plays baseball .

 The score between "1 man singing and 1 man playing a saxophone in a concert ." and "10 people venture out to go crosscountry skiing ." is 0.6089445116147131

 Top-5 similar sentences of 
"1 smiling african american boy ." : 


1) a boy jumps on another boy .
2) a boy skateboarding
3) teen boy playing billiards .
4) boy riding a horse .
5) boy plays baseball .

 The score between "1 man singing and 1 man playing a saxophone in a concert ." and "10 people venture out to go crosscountry skiing ." is 0.6089445116147131


# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [8]:
# 1 - Download and load 50k first vectors of
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec

def load_data_vec(url, max_count, output_file):
    r = requests.get(url, stream = True)
    data = {}
    counter =0
    with open(output_file, 'w', encoding = "utf-8") as output_file:
        for line in r.iter_lines():
            if counter >= max_count:
                break
            splits = str(line).split(" ")
            if len(splits) == 302:
                counter += 1
                line = str(line).replace("b'", "").replace("'b", "")
            
                output_file.write(str(line)+ "\n")
    output_file.close()
    return data



import requests

english_vec_url = "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec"
french_vec_url = "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec"
download = False
if download : french_vec = load_data_vec(french_vec_url, 50000, os.path.join(PATH_TO_DATA, "french.vec"))
if download : english_vec = load_data_vec(english_vec_url, 50000, os.path.join(PATH_TO_DATA, "english.vec"))
    
w2vfrench = Word2vec(os.path.join(PATH_TO_DATA, 'french.vec'), nmax=50000)
w2venglish = Word2vec(os.path.join(PATH_TO_DATA, 'english.vec'), nmax=50000)


Loaded 49999 pretrained word vectors
Loaded 49999 pretrained word vectors


In [9]:
# 2 - Get words that appear in both vocabs (= identical character strings)
#     Use it to create the matrix X and Y (of aligned embeddings for these words)

english_list =[]
french_list = []

english_list=list(w2venglish.word2id)
french_list=list(w2vfrench.word2id)
common_words = set(list(w2venglish.word2id))& set(list(w2vfrench.word2id))
common_words = [ x for x in iter(common_words)]


X_matrix = []
Y_matrix = []
for i in range(len(common_words)):
    X_matrix.append(w2vfrench.word2vec[common_words[i]])
    Y_matrix.append(w2venglish.word2vec[common_words[i]])
    
# return training matrices
X=np.array(X_matrix).T
Y=np.array(Y_matrix).T

In [10]:
# 3 - Solve the Procrustes using the scipy package and: scipy.linalg.svd() and get the optimal W
#     Now W*French_vector is in the same space as English_vector


U,s,V = np.linalg.svd(np.dot(Y,X.transpose()))
W = np.dot(U,V)

In [11]:
# 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
#     You will be evaluated on that part and the code above

def translation(word,source_matrix,translation_matrix, target_matrix, K = 5):
    target_matrix.embeddings=np.array(target_matrix.embeddings, ndmin=2)
    # create the vector source of the word we want to translate
    vector_source = source_matrix.word2vec[word].reshape(-1,)
    # translate with the translation matrix the vector source
    vector_target = np.dot(translation_matrix.T, vector_source).reshape(-1,)
    
    similarities = vector_target.T * target_matrix.embeddings/ np.linalg.norm(vector_target)/np.sqrt(np.sum(target_matrix.embeddings**2, axis = 0))
    similarities = np.sum(similarities, axis = 1)
    
    idxs=np.argsort(similarities)[-K:]

    print('Top-%s closest translations of \n"%s" are \n\n' % (K, word))
    for i, idx in enumerate(idxs):
        print('%s) %s' % (i + 1, target_matrix.id2word[idx]))
        
wordlist = ['cat', 'dog','queen','boy','language']
for wordtotranslate in wordlist:
    translation(wordtotranslate,w2venglish, W, w2vfrench)

Top-5 closest translations of 
"cat" are 


1) grizzly
2) chat
3) felis
4) canis
5) cat
Top-5 closest translations of 
"dog" are 


1) pig
2) dingo
3) chien
4) dog
5) hound
Top-5 closest translations of 
"queen" are 


1) princess
2) consort
3) reine
4) queen
5) \xe2\x94\x9c\xe2\x94\x80
Top-5 closest translations of 
"boy" are 


1) daughter
2) dogg
3) sailor
4) girl
5) boy
Top-5 closest translations of 
"language" are 


1) parl\xc3\xa9e
2) linguistics
3) linguistic
4) language
5) languages


If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [20]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

def load_sentences(path):
    sentences = []
    with open(path, 'r', encoding = 'utf-8') as file:
        for line in file:
            sentences += [line.split(' ')[:-1]]# we get rid of the \n
    file.close()
    return sentences

test = load_sentences(os.path.join(PATH_TO_DATA, "stsa.fine.test.X"))
dev = load_sentences(os.path.join(PATH_TO_DATA, "stsa.fine.dev"))
train = load_sentences(os.path.join(PATH_TO_DATA, "stsa.fine.train"))

In [25]:
# 2 - Encode sentences with the BoV model above

s2v = BoV(w2v)
#build idf
train_idf=s2v.build_idf(train)
dev_idf=s2v.build_idf(dev)
test_idf=s2v.build_idf(test)

#defining labels
ydev=[int(x[0]) for x in dev]
ytrain=[int(x[0]) for x in train]
xtrain=[x[1:] for x in train]
xdev=[x[1:] for x in dev]

#encode training set sentences weighted average
Xtrain_idf = s2v.encode(xtrain, train_idf)
Xdev_idf = s2v.encode(xdev, dev_idf)
Xtest_idf = s2v.encode(test, test_idf)

#encode training set sentences mean average word vector
Xtrain = s2v.encode(xtrain)
Xdev = s2v.encode(xdev)
Xtest = s2v.encode(test)

In [26]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)
from sklearn.linear_model import LogisticRegression

bestreg_val_score=0
best_score=0
for reg in [2**t for t in range(-2, 4, 1)]:
    clf = LogisticRegression(C=reg, random_state=1234)
    clf.fit(Xtrain, ytrain)
    if clf.score(Xdev, ydev)>best_score:
        best_score=clf.score(Xdev,ydev)
        bestreg_val_score = reg
print('Best valid score:',best_score)
print('Best L2-reg parameter:',bestreg_val_score)

#grid search for optimization
from sklearn.model_selection import GridSearchCV
def optiGrid (x,y,xdev,ydev):
    parameters = {'C':[1, 1.2, 1.25, 1.3, 1.4, 1.5, 1.6, 1.7,1.8] }
    gs = GridSearchCV(LogisticRegression(), parameters)
    gs.fit(Xtrain, ytrain)
    p=list(gs.best_params_.values())[0]
    clf = LogisticRegression(C=p, random_state=1234)
    model=clf.fit(x, y)
    score=model.score(xdev, ydev)
    
    print("=================================== best parameters ===============================")
    print(" best prameter for".format(x,y))
    print(p)
    print("======================================score devset=================================")
    print(score)

optiGrid(Xtrain,ytrain,Xdev, ydev)
optiGrid(Xtrain_idf,ytrain,Xdev_idf, ydev)

Best valid score: 0.37693006357856496
Best L2-reg parameter: 0.5
 best prameter for
1.25
0.37693006357856496
 best prameter for
1.25
0.368755676657584


In [27]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).


clf = LogisticRegression(C=1.25, random_state=1234)
clf.fit(Xtrain, ytrain)
pred=clf.predict(Xtest)

#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.
np.savetxt(os.path.join(PATH_TO_DATA, "logreg_bov_y_test_sst.txt"),pred,fmt='%s',newline=os.linesep)

In [41]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
import time
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


#fonction classifier
def test_classifier(X_train, y_train, X_dev, y_dev, classifier):
    print("")
    print("==================================================================================")
    classifier_name = str(type(classifier).__name__)
    print("Testing " + classifier_name)
    now = time.time()
    model = classifier.fit(X_train, y_train)
    print("Learing time {0}s".format(time.time() - now))
    now = time.time()
    s=model.score(X_dev, y_dev)
    print("Predicting time {0}s".format(time.time() - now))

    print("=================================== Results ======================================")
    print(" score " + str(s))

#multiple model
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

test_classifier(Xtrain, ytrain, Xdev, ydev, rf)
test_classifier(Xtrain, ytrain, Xdev, ydev,gbm)

#grid search
def optiGridGB (x,y):
    gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              #'max_features': [1.0, 0.3, 0.1] 
              }

    gb = GradientBoostingClassifier(n_estimators = 600)

    clf =GridSearchCV(gb,gb_grid_params,cv=2)
    clf.fit(x, y)

  
    
    print("=================================== best parameters ===============================")
    print(" best prameter for".format(x,y))
    print(clf.best_params_)


#optiGridGB(Xtrain,ytrain)
#optiGridGB(Xtrain_w,ytrain)

# Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)
#results gradient boosting used
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=600, subsample=0.8)
clf.fit(Xtrain, ytrain)
pred_gb=clf.predict(Xtest)

np.savetxt(os.path.join(PATH_TO_DATA, "gb_bov_y_test_sst.txt"),pred_gb,fmt='%s',newline=os.linesep)


Testing RandomForestClassifier
Learing time 1.5050480365753174s
Predicting time 0.0s
 score0.29155313351498635

Testing GradientBoostingClassifier
Learing time 148.03811192512512s
Predicting time 0.03125119209289551s
 score0.3723887375113533


In [88]:
clf = GradientBoostingClassifier()
model=clf.fit(Xtrain, ytrain)
pred_gb=model.predict(Xtest)

np.savetxt(os.path.join(PATH_TO_DATA, "gb_bov_y_test_sst.txt"),pred_gb,fmt='%s',newline=os.linesep)

# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [70]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
# fix random seed for reproducibility
np.random.seed(7)

In [78]:
# 1 - Load train/dev/test sets of SST
K.clear_session()

def load_sentences_txt(path):
    sentences = []
    with open(path, 'r', encoding = 'utf-8') as input_file:
        for line in input_file:
            sentences += [str(line).replace("'b", "")
                          .replace("\n", "")[2:]]
    input_file.close()
    return sentences


PATH_TO_DATA = "data/"
#64 sentences for one batch
test_lstm = load_sentences_txt(os.path.join(PATH_TO_DATA, "stsa.fine.test.X"))
dev_lstm = load_sentences_txt(os.path.join(PATH_TO_DATA, "stsa.fine.dev"))
train_lstm = load_sentences_txt(os.path.join(PATH_TO_DATA, "stsa.fine.train"))


In [79]:
# 2 - Transform text to integers using keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

vocab=[]
for sent in train_lstm :
    for word in sent.split(' '):
            vocab.append(word)
for sent in dev_lstm :
    for word in sent.split(' '):
            vocab.append(word)
            

#vocab size
vocab_size= len(set(vocab))
print(vocab_size)

# integer encode the document
train_encode = [one_hot(sent, round(vocab_size*1.3))for sent in train_lstm]
test_encode = [one_hot(sent, round(vocab_size*1.3))for sent in test_lstm]
dev_encode = [one_hot(sent, round(vocab_size*1.3))for sent in dev_lstm]

print(train_encode[0])
print(test_encode[0])
print(dev_encode[0])


17613
[3949, 2287, 17879, 18817, 1022, 6093, 16608, 5050, 18709, 7692, 18817, 18326, 13693, 18817, 4322, 4511, 7283]
[18148, 19132, 17537, 19455, 12624, 18709, 12784]
[883, 11977, 5214, 20857, 12659, 18326, 18778, 5045, 15781, 3949, 6117, 2765, 17224, 7255, 517, 13861, 21007]


**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [80]:
# 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/

X_train = sequence.pad_sequences(train_encode)
maxseqlen=X_train.shape[1]
X_train = sequence.pad_sequences(train_encode, maxlen=maxseqlen)
X_val = sequence.pad_sequences(dev_encode, maxlen=maxseqlen)
X_test = sequence.pad_sequences(test_encode, maxlen=maxseqlen)

#labels in dummies
y_val=to_categorical(ydev)
y_train=to_categorical(ytrain)

## 4.2 - Design and train your model

In [81]:
# 4 - Design your encoder + classifier using keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.


from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = 0  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

#adapted
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = len(vocab)  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(n_classes, activation='softmax'))#use softmax instead of sigmoid for a better accuracy




In [82]:
# 5 - Define your loss/optimizer/metrics

loss_classif     =  'binary_crossentropy' # find the right loss for multi-class classification
optimizer        =  'adam' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          5914784   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 5,939,941
Trainable params: 5,939,941
Non-trainable params: 0
_________________________________________________________________
None


In [59]:
# 6 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set


bs = 64
n_epochs = 4

history = model.fit(X_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(X_val, y_val))

#Plotting the evolution of train/dev results w.r.t the number of epochs
import matplotlib.pyplot as plt
import numpy
# list all data in history
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.subplot()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Train on 8544 samples, validate on 1101 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])


<matplotlib.figure.Figure at 0x1702f4ce0b8>

<matplotlib.figure.Figure at 0x170323535c0>

In [60]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/


scores = model.evaluate(X_val, y_val)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

prediction=model.predict(X_test)
np.savetxt(os.path.join(PATH_TO_DATA, "logreg_lstm_y_test_sst.txt"),prediction,fmt='%.18g',newline=os.linesep)


acc: 75.57%


In [77]:
K.clear_session()

## 4.3 -- innovate !

In [85]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..

K.clear_session()
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM

vocab=[]
for sent in train_lstm :
    for word in sent.split(' '):
            vocab.append(word)
for sent in dev_lstm :
    for word in sent.split(' '):
            vocab.append(word)
            
vocab_size = len(set(vocab))  # size of the vocabulary
n_classes  = 5
bs = 64
n_epochs = 6


# Convolution
kernel_size = 3
filters = 32
pool_size = 4
# create the model

print('Build model...')

model2 = Sequential()
model2.add(Embedding(vocab_size, embed_dim))
model2.add(Dropout(0.25))
model2.add(Conv1D(filters,
                 kernel_size,
                 padding='same',
                 activation='relu',
                 strides=1))
model2.add(MaxPooling1D(pool_size=pool_size))
model2.add(LSTM(nhid))
model2.add(Dense(5, activation= 'softmax'))
model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model2.fit(X_train, y_train,
          batch_size=bs,
          epochs=n_epochs,
          validation_data=(X_val, y_val))
print('Test score:', score)
print('Test accuracy:', acc)
scores = model2.evaluate(X_val, y_val)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.
np.savetxt(os.path.join(PATH_TO_DATA, "innovate_y_test_sst.txt"),pred,fmt='%s',newline=os.linesep)

Build model...
Train...
Train on 8544 samples, validate on 1101 samples
Epoch 1/6


InvalidArgumentError: indices[56,41] = 19197 is not in [0, 17613)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1/embeddings/read, embedding_1/Cast)]]

Caused by op 'embedding_1/Gather', defined at:
  File "C:\Users\izaou\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\izaou\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\izaou\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "C:\Users\izaou\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\izaou\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\izaou\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-85-81069fbe0029>", line 34, in <module>
    model2.add(Embedding(vocab_size, embed_dim))
  File "C:\Users\izaou\Anaconda3\lib\site-packages\keras\models.py", line 467, in add
    layer(x)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\keras\engine\topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\keras\layers\embeddings.py", line 138, in call
    out = K.gather(self.embeddings, inputs)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 1211, in gather
    return tf.gather(reference, indices)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tensorflow\python\ops\array_ops.py", line 2585, in gather
    params, indices, validate_indices=validate_indices, name=name)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 2334, in gather
    validate_indices=validate_indices, name=name)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\Users\izaou\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[56,41] = 19197 is not in [0, 17613)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1/embeddings/read, embedding_1/Cast)]]
