# Import

In [1]:
from typing import Callable, List, Any, Tuple

import path_imports

import numpy as np

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from src.read_corpus import read_corpus
from gensim.models import KeyedVectors
from src.preprocessing.regexp_tokenizer import RegexpTokenizer


[nltk_data] Downloading package wordnet to /home/secouss/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load dataset

### Read csv

In [15]:
df = read_corpus(categories=["debate"], presidents=["Barack Obama", "Donald Trump", "Joe Biden"])
df

100%|██████████| 29/29 [00:03<00:00,  8.49it/s]


Unnamed: 0,category,name,date,text
0,debate,Barack Obama,2012-10-03 09:00:00,"Well, thank you very much, Jim, for this oppor..."
1,debate,Barack Obama,2012-10-03 09:00:00,"Well, let me talk specifically about what I th..."
2,debate,Barack Obama,2012-10-03 09:00:00,"So all of this is possible. Now, in order for ..."
3,debate,Barack Obama,2012-10-03 09:00:00,"Well, I think — let’s talk about taxes, becaus..."
4,debate,Barack Obama,2012-10-03 09:00:00,When you add up all the loopholes and deductio...
...,...,...,...,...
2245,debate,Joe Biden,2020,"Every single solitary generation, the dial has..."
2247,debate,Joe Biden,2020,We’d better be able to do it again.
2249,debate,Joe Biden,2020,"Well, it could say I’m a lousy candidate, and ..."
2251,debate,Joe Biden,2020,"Yeah. And by the way, before I came up here, I..."


### Preprocess

In [17]:
# Add a sapce after each sentence
df["text"] = df["text"].str.replace(".", ". ", regex=False)

# all lower
df["text"] = df["text"].str.lower()

# show one word
df.iloc[0]["text"]

'well, thank you very much, jim, for this opportunity.   i want to thank governor romney and the university of denver for your hospitality.  there are a lot of points i want to make tonight, but the most important one is that 20 years ago i became the luckiest man on earth because michelle obama agreed to marry me.  and so i just want to wish, sweetie, you happy anniversary and let you know that a year from now we will not be celebrating it in front of 40 million people.  you know, four years ago we went through the worst financial crisis since the great depression.   millions of jobs were lost, the auto industry was on the brink of collapse.   the financial system had frozen up.  and because of the resilience and the determination of the american people, we’ve begun to fight our way back.   over the last 30 months, we’ve seen 5 million jobs in the private sector created.   the auto industry has come roaring back.   and housing has begun to rise.  but we all know that we’ve still got a

## Word2vec + Tokenizer 

In [18]:
import gensim.downloader
#model_word2vec_path = '../word2vec/word2vecBest.model'
#word2vec_model: Word2Vec = Word2Vec.load(model_word2vec_path)

word2vec_model : KeyedVectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [19]:
tokenizer: Callable[[str], List[str]] = RegexpTokenizer().lemma_tokenize

## Prepare input

### Tokenize each word

In [20]:
df["tokens"] = df['text'].apply(tokenizer)
df

Unnamed: 0,category,name,date,text,tokens
0,debate,Barack Obama,2012-10-03 09:00:00,"well, thank you very much, jim, for this oppor...","[well, ,, thank, you, very, much, ,, jim, ,, f..."
1,debate,Barack Obama,2012-10-03 09:00:00,"well, let me talk specifically about what i th...","[well, ,, let, me, talk, specifically, about, ..."
2,debate,Barack Obama,2012-10-03 09:00:00,"so all of this is possible. now, in order fo...","[so, all, of, this, is, possible, ., now, ,, i..."
3,debate,Barack Obama,2012-10-03 09:00:00,"well, i think — let’s talk about taxes, becaus...","[well, ,, i, think, —, let, ’, s, talk, about,..."
4,debate,Barack Obama,2012-10-03 09:00:00,when you add up all the loopholes and deductio...,"[when, you, add, up, all, the, loophole, and, ..."
...,...,...,...,...,...
2245,debate,Joe Biden,2020,"every single solitary generation, the dial has...","[every, single, solitary, generation, ,, the, ..."
2247,debate,Joe Biden,2020,we’d better be able to do it again.,"[we, ’, d, better, be, able, to, do, it, again..."
2249,debate,Joe Biden,2020,"well, it could say i’m a lousy candidate, and ...","[well, ,, it, could, say, i, ’, m, a, lousy, c..."
2251,debate,Joe Biden,2020,"yeah. and by the way, before i came up here,...","[yeah, ., and, by, the, way, ,, before, i, cam..."


### Split into n-grams

In [32]:
def extract_sequences(tokens, n) -> Tuple[List[List[Any]], List[Any]]:
    X = []
    y = []
    for i in range(len(tokens) - n):
        X.append(tokens[i:i+n])
        y.append(tokens[i+n])
    return X, y

NB_GRAM = 8
X = []
y = []
for tokens in df["tokens"]:
    v1, v2 = extract_sequences(tokens, NB_GRAM)
    X.extend(v1)
    y.extend(v2)
X = np.array(X)
y = np.array(y)

X[0], y[0], X.shape, y.shape

(array(['well', ',', 'thank', 'you', 'very', 'much', ',', 'jim'],
       dtype='<U18'),
 ',',
 (74041, 8),
 (74041,))

### Split into train and test dataset

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((59232, 8), (14809, 8), (59232,), (14809,))

### Vectorize words (for the input)

In [34]:
def vectorize_word(word : str) -> List[float]:
    if word in word2vec_model:
        return word2vec_model[word]
    return [0] * word2vec_model.vector_size

def encode_input(X):
    return np.array([[vectorize_word(e) for e in sample] for sample in X])
X_train_encoded = encode_input(X_train)
X_train_encoded.shape

(59232, 8, 300)

### Encore words to predict (output)

In [35]:
output_encoder = OneHotEncoder()
output_encoder.fit(y.reshape((-1, 1)))
def encode_y(y):
    return output_encoder.transform(
        y.reshape((-1, 1))
    ).todense()
y_train_encoded = encode_y(y_train)
y_train_encoded.shape

(59232, 4075)

In [36]:
NB_GRAM, word2vec_model.vector_size

(8, 300)

In [37]:
nb_word_voc = np.unique(y).shape[0]
nb_word_voc

4075

## Create and train our FNN

In [38]:
if True: # imports who are working at execution
    from keras.optimizers import Adam
    from keras.metrics import Precision, Recall
    from keras.layers import Dense, Embedding, Flatten, InputLayer
    from keras.models import Sequential
else: # imports that give information on packages
    from tensorflow.python.keras.metrics import Precision, Recall
    from tensorflow.python.keras.layers import Dense, Embedding, Flatten, InputLayer
    from tensorflow.python.keras.models import Sequential

### Create

In [39]:
model = Sequential([
    InputLayer(input_shape=(NB_GRAM, word2vec_model.vector_size)),
    Flatten(),
    Dense(units=256, activation='relu'),
    Dense(units=256, activation='relu'),
    Dense(units=nb_word_voc, activation='softmax')
])

opt = Adam(learning_rate=0.01)
# Compilation du modèle
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=[Precision(), Recall()])





### Train

In [42]:
# Entraînement du modèle
model.fit(X_train_encoded, y_train_encoded, epochs=30, batch_size=64, validation_data=(encode_input(X_test), encode_y(y_test)))

Epoch 1/30
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - loss: 4.9894 - precision_1: 0.9027 - recall_1: 0.0375 - val_loss: 6.0065 - val_precision_1: 0.8569 - val_recall_1: 0.0368
Epoch 2/30
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - loss: 4.9525 - precision_1: 0.9245 - recall_1: 0.0397 - val_loss: 5.9285 - val_precision_1: 0.8715 - val_recall_1: 0.0339
Epoch 3/30
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 20ms/step - loss: 4.9917 - precision_1: 0.9143 - recall_1: 0.0367 - val_loss: 6.1462 - val_precision_1: 0.7858 - val_recall_1: 0.0352
Epoch 4/30
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - loss: 4.9473 - precision_1: 0.9044 - recall_1: 0.0381 - val_loss: 5.8815 - val_precision_1: 0.8317 - val_recall_1: 0.0340
Epoch 5/30
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - loss: 4.9763 - precision_1: 0.8899 - recall_1: 0.0368 - val_lo

<keras.src.callbacks.history.History at 0x7402ec062800>

## Generate

In [50]:
index = np.random.randint(0, X_train.shape[0])
input1 : np.ndarray = X_train[index].copy()
all_words = input1.copy().tolist()
print(input1)
for k in range(50):
    res = np.zeros((nb_word_voc))
    predictions : np.ndarray = model.predict(encode_input([input1]), verbose = 0)
    index = predictions.argsort()[0][-np.random.randint(1, 3)]
    res[index] = 1
    new_word = output_encoder.inverse_transform([res])[0][0]
    print(new_word, end=" ")
    input1[0:-1] = input1[1:]
    input1[-1] = new_word
    all_words.append(new_word)


['because' 'he' 'kept' 'worrying' ',' 'in' 'my' 'view']
. and , i ’ s a a a the a the the the the a the a the the a the the a a a the the the a a the the the the a the the a the the the a the the the the the the a 

In [None]:
# stuck : ’ s not the kind of leadership that they do is to simply leave such care insurance for three month , we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure that we ’ ve got to make sure 
# repeat : ',' 'I' 'Romney' ',' 'I' 'Romney' ',' 'I' 'Romney' ',' 'I' 'Romney' ',' 'I' 'Romney'',' 'I' 'Romney' ',' 'I' 'Romney'
# répéter les mots trop fréquents : , when we ’ re not a the a year the . when we think are , a the year of , i make the a year . when they have make . i ’ ’ an a year . that i ’ d , , the , a . 
# ngram 4 --> grammaticalement bon