First, we import the text document and format its sentences.

In [None]:
# Read the whole corpus. The context manager closes the file handle as soon as
# the content has been read, even if reading fails.
with open('../word_vectors/sample.txt') as file:
    document = file.read()

# NOTE: preprocess is defined further down in this cell, so it has to be
# defined already (e.g. by a previous run) when this line executes.
lines = preprocess(document)

def preprocess(document):
    """
    Normalise a raw document and split it into cleaned sentences.

    The text is lower-cased, stripped of its surrounding white space and of
    all digits. Every run of non-word characters (punctuation, spaces, ...)
    is then collapsed into a single space. One sentence is produced per line
    of the input; words are NOT tokenized here.

    :param document: string document
    :return: list of cleaned lines (strings), one per line of the document.
    """
    document = document.lower().strip()  # Lowercase, then white space removal
    document = re.sub(r'\d+', '', document)  # Remove numbers

    # Split on the line breaks first and clean each sentence on its own. This
    # yields the same result as marking the line breaks with a sentinel
    # character, without depending on that character being absent from the
    # text. The pattern is a raw string so that '\W' is not read as a string
    # escape.
    non_word = re.compile(r'\W+')
    lines = [non_word.sub(' ', line) for line in document.split('\n')]
    print("Preprocessing is finished.")

    return lines

We also define the tokenization step, with optional stemming and lemmatization. In this example, we chose not to use them because
the evaluation set contains many conjugated words.

In [None]:
def tokenization(lines, stemming=False, lemmatization=False):
    """
    Tokenize every sentence, drop the English stop words and optionally stem
    and/or lemmatize the remaining words.

    The list is updated in place (each sentence string is replaced by its list
    of tokens) and is also returned.

    :param lines: A list of strings. Each element represents a sentence.
    :param stemming: if True, every token is passed through the Porter stemmer.
    :param lemmatization: if True, every token is passed through the WordNet
        lemmatizer (after stemming when both options are enabled).
    :return: List of lists of words. Each element represents a sentence.
    """
    stop_words = set(stopwords.words('english'))
    # The normalizers do not depend on the sentence: build them once, and only
    # when they are actually requested.
    stemmer = PorterStemmer() if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatization else None

    for i, sentence in enumerate(lines):
        # Tokenize on white space and remove the stop words
        tokens = [word for word in sentence.split() if word not in stop_words]
        if stemmer is not None:
            tokens = [stemmer.stem(word) for word in tokens]
        if lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        lines[i] = tokens
    print("Tokenization is finished.")
    return lines

# Tokenize the preprocessed sentences; stemming and lemmatization keep their
# default value (disabled).
lines = tokenization(lines)

For the word embedding, we use Word2Vec from the Gensim package. It provides a vector representation of every token.

In [None]:
# Train the Word2Vec embedding and persist it to disk.
# NOTE(review): `test` is not defined in the cells shown above; presumably it
# holds the tokenized sentences -- confirm before running.
model = Word2Vec(test, size=100, window=5, min_count=1, workers=4)
print("Training of the Word2Vec is finished.")
model.save("word2vec.model")


# Reload the model that was just saved; the prediction cells below use this
# `word2vec_model` handle.
word2vec_model = Word2Vec.load("word2vec.model")

The goal of our project is to replace unknown words with a suitable equivalent.
First, we have to create a training set that contains many unknown words.

In [None]:
def replace_with_unknown(lines):
    """
    Replace one random word per sentence with the 'unk' tag and record the
    word that was hidden.

    Only sentences with at least five words are modified, and the hidden word
    always has two words on each side of it. The input is left untouched: the
    tagged sentences are returned as new lists.

    :param lines: list of tokenized sentences (each one a list of words).
    :return: tuple (unk_lines, real_words): the sentences with the 'unk' tags,
             and the list of the words that were replaced (in sentence order).
    """
    real_words = []
    # Copy every sentence. A shallow copy of the outer list would share the
    # inner lists, so writing 'unk' would also overwrite the caller's data.
    unk_lines = [list(line) for line in lines]
    for line in unk_lines:
        if len(line) >= 5:  # We want the unknown word in the middle of five words
            # randint excludes its upper bound: index is in [2, len(line) - 3]
            index = np.random.randint(2, len(line) - 2)
            real_words.append(line[index])
            line[index] = 'unk'  # Replace it with the unknown tag
    print("Creation of the unknown dataset.")
    return unk_lines, real_words



# First of all, replace one word per sentence with the UNK tag.
# NOTE(review): `predict_unk` and `test` are not defined in the cells shown
# here -- presumably the module holding these functions and the tokenized test
# sentences; confirm.

unk_test, real_words = predict_unk.replace_with_unknown(test)

A first approach to predicting the unknown words is to take the four surrounding words and pick the word most similar to them. We call this method prediction with nearest neighbours:

In [None]:
def prediction_neighbor_similarities(lines_with_unknown, model):
    """
    Predict each unknown word from the similarities with its nearest neighbours.

    For every line containing 'unk', the two words before and the two words
    after the tag are used as a query, and the most similar word of the
    embedding is returned. The tag is expected to have two words on each side
    of it.

    :param lines_with_unknown: list of tokenized lines (lists of words).
    :param model: the Word2Vec model (or its word vectors).
    :return: list of the predicted words, one per line containing 'unk'.
    """
    # Query the word vectors through `.wv`: calling most_similar on the model
    # itself is deprecated. Fall back to the object itself when word vectors
    # are passed directly.
    vectors = getattr(model, 'wv', model)
    predicted_words = []
    for line in lines_with_unknown:
        if 'unk' not in line:
            continue
        index = line.index('unk')
        # Extract the words around the tag
        neighbours_words = [line[index + offset] for offset in (-2, -1, 1, 2)]
        most_similar = vectors.most_similar(positive=neighbours_words)[0][0]
        predicted_words.append(most_similar)

    return predicted_words

We can then apply this to our dataset and evaluate its performance:

In [None]:
# Hide one word per sentence and keep the hidden words as ground truth.
unk_lines, real_words = predict_unk.replace_with_unknown(lines)

# We now have the sentences with unknown words and the words they replace.

# Let's try the nearest-neighbour prediction on them.

predicted_words = predict_unk.prediction_neighbor_similarities(unk_lines, word2vec_model)

# Gather the ground truth and the predictions side by side.
df = pd.DataFrame()

df['Real Words'] = real_words
df['Nearest Neighbours'] = predicted_words

# Row by row, Word2Vec similarity between the real word and the predicted one.
df['Similarity'] = df.apply(lambda row: word2vec_model.wv.similarity(row['Real Words'], row['Nearest Neighbours']),
                            axis=1)

We then had the idea of enhancing the prediction with Part-of-Speech (POS) tags, which indicate whether a word is a verb, a noun, etc.

We believe this can help to make accurate predictions. Suppose we have the sentence:

"I want to unk football and basketball".

The most similar words might be "tennis" or "sports". But here, thanks to the context, we know that the unk word is a verb, so we can decide to replace the unknown word with a similar word that is a verb.

Rather than trying to enumerate all the grammar rules, we decided to use a neural network (LSTM) to capture the structure of the sentence.

For example, given "Noun" "to" "unk" "Noun" "Noun", the neural network will say that the POS of the "unk" word is most likely a verb (because it is preceded by "to").

We use the NLTK package to do an automatic POS tagging with the known words.

In [None]:
# First we need to create a list of all the POS tags

def assign_POS_tag(lines):
    """
    Replace every word of every line by its NLTK part-of-speech tag.

    :param lines: list of lines, each one being a list of token words
    :return: a new list holding, for every line, the list of the POS tags of
             its words (the input lines are not modified)
    """
    # nltk.pos_tag returns (word, tag) pairs: keep only the tags
    lines_POS = [[tag for _, tag in nltk.pos_tag(tokens)] for tokens in lines]
    print("Creation of the POS tags list is finished.")

    return lines_POS

# POS-tag every sentence of `lines`.
lines_POS = assign_POS_tag(lines)

We then create a simple neural network (with an LSTM layer) to capture the sequential dependencies of the sentence.
It outputs a probability distribution over the categories (there are 45 possible POS tags), and the loss function is the categorical cross-entropy.

In [3]:
def create_neural_network():
    """
    Build and compile the LSTM network that predicts a POS tag.

    The network reads 4 time-steps of 45 features each and outputs a softmax
    over 45 classes. It is compiled with the Adam optimizer and the
    categorical cross-entropy loss, and its summary is printed.

    :return: the compiled Keras Sequential model.
    """
    layers = [
        LSTM(32, input_shape=(4, 45)),  # 4 time-steps and 45 features
        Dense(64),
        Activation('tanh'),
        Dense(units=45),  # one unit per class
        Activation('softmax'),  # turn the outputs into probabilities
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    optimizer = adam(lr=0.001, decay=1e-6)
    network.compile(optimizer=optimizer,
                    loss="categorical_crossentropy",
                    metrics=['accuracy'])

    network.summary()
    print("Creation of the Neural Network is finished.")
    return network

# Build the Keras network used to predict the POS tag of an unknown word.

NN_model = create_neural_network()

We then create the X and Y sets to feed the neural network. Because the network only takes numbers, we first apply a
one-hot encoding to every word's POS tag.

Then we take the 4 words surrounding a target word to predict that fifth word (which sits in the middle of the window).

In [None]:
def one_hot_encoding(POS_tag, list_tags):
    """
    Encode a POS tag as a one-hot vector over the known tags.

    A tag that is missing from ``list_tags`` is mapped to a randomly drawn
    position (the last position is never drawn).

    :param POS_tag: a string
    :param list_tags: the list of all the possible tags
    :return: a numpy array of zeros with a single one.
    """
    n_tags = len(list_tags)
    hot_index = (list_tags.index(POS_tag) if POS_tag in list_tags
                 else np.random.randint(n_tags - 1))
    encoded = np.zeros(n_tags)
    encoded[hot_index] = 1  # Flag the selected class
    return encoded


def convert_int_data(lines):
    """
    Convert lines of POS tags into one-hot arrays for the neural network.

    For every line with at least five tags, one position with two neighbours
    on each side is drawn at random: the tag at that position is the target
    and its four neighbours are the input sequence. Shorter lines are skipped.

    :param lines: the lines of POS tags (one list of tags per sentence).
    :return: X and Y sets ready for the neural network. X stacks, per sample,
             the four one-hot encoded neighbours; Y the one-hot encoded target.
    """
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    list_tags = list(tagdict.keys())  # Get the list of all the tags.
    X, Y = [], []  # Creation of the array
    for line in lines:
        if len(line) < 5:  # We want the word in the middle of five words
            continue
        # randint excludes its upper bound: index is in [2, len(line) - 3].
        # It replaces the deprecated np.random.random_integers.
        index = np.random.randint(2, len(line) - 2)
        neighbours_tags = [line[index + offset] for offset in (-2, -1, 1, 2)]
        Y.append(one_hot_encoding(line[index], list_tags))  # Append the target
        # Append the 4 neighbouring tags, each one-hot encoded
        X.append([one_hot_encoding(tag, list_tags).tolist() for tag in neighbours_tags])

    return np.array(X), np.array(Y)

# Build the network inputs (X) and targets (Y) for the training and test sets.
# NOTE(review): `train_set` and `test_set` are not defined in the cells shown
# here -- presumably a split of the POS-tagged lines; confirm.
X_train, Y_train = convert_int_data(train_set)
X_test, Y_test = convert_int_data(test_set)

We can then fit and train the neural network:

In [None]:
# Fit the network on the training samples (fit is called with its default
# settings), then predict the class probabilities of the test samples.
NN_model.fit(X_train, Y_train)
print("Training of the Neural Network is finished.")
Y_pred = NN_model.predict(X_test)

The accuracy is then computed by comparing the predicted class with the real class:

In [None]:
def compute_accuracy(Y_test, Y_pred):
    """
    Compute the accuracy between the prediction and the actual classification of the word.

    :param Y_test: the real POS tags of the words, one-hot encoded (one row per sample).
    :param Y_pred: the predicted scores for every POS tag (one row per sample).
    :return: the fraction (between 0 and 1) of correct predictions; 0.0 when
             there is no prediction at all.
    """
    number_of_predictions = len(Y_pred)
    if number_of_predictions == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0

    number_correct_prediction = 0
    for i, prediction in enumerate(Y_pred):  # Y_test has the same length
        id_pred = np.argmax(prediction)  # Most probable class
        # Position of the (first) one in the one-hot encoded target
        id_test = np.flatnonzero(np.asarray(Y_test[i]) == 1.)[0]
        if id_test == id_pred:
            number_correct_prediction += 1

    return number_correct_prediction / number_of_predictions

# Share of the test samples whose predicted POS tag matches the real one.
accuracy = compute_accuracy(Y_test, Y_pred)

We believe that the accuracy is not very good because we have included too many categories (45), where we could restrict ourselves to fewer. Moreover, we lack data, and the structure of the neural network (architecture and hyperparameters) might not be the most appropriate.

However, we believe that this method is quite useful and can lead to good results if correctly trained.

We then do the prediction with neighbours again, this time including the POS tag.
We select the top 10 most similar words and check whether the POS of one of them corresponds to the POS predicted by the LSTM network. If so, we select that word; otherwise, we select the word with the highest similarity.

In [None]:
def prediction_neighbor_with_pos(lines_with_unknown, word2vec_model, NN_model):
    """
    Predict each unknown word from its nearest neighbours, using the POS tag
    predicted by the neural network to choose among the candidates.

    For every line containing 'unk', the 10 words most similar to the four
    surrounding words are retrieved. The network predicts the POS tag of the
    unknown word from the POS tags of these four neighbours. The first
    candidate carrying that tag is selected; if none does, the most similar
    candidate is selected. The 'unk' tag is expected to have two words on each
    side of it.

    :param lines_with_unknown: list of tokenized lines (lists of words).
    :param word2vec_model: the Word2Vec model (or its word vectors) used to get similar words.
    :param NN_model: the network used to predict the POS tag of the unknown word.
    :return: list of the predicted words, one per line containing 'unk'.
    """
    predicted_words = []  # List of all the predicted words
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    list_tags = list(tagdict.keys())  # Get the list of all the tags.
    # Query the word vectors through `.wv`: calling most_similar on the model
    # itself is deprecated. Fall back to the object itself when word vectors
    # are passed directly.
    vectors = getattr(word2vec_model, 'wv', word2vec_model)
    offsets = (-2, -1, 1, 2)  # Positions of the neighbours around the tag

    for line in lines_with_unknown:
        if 'unk' not in line:
            continue
        index = line.index('unk')
        neighbours_words = [line[index + offset] for offset in offsets]
        most_similar_list = vectors.most_similar(positive=neighbours_words)[:10]

        # one_hot_encoding expects POS tags: encode the tags of the
        # neighbours, not the words themselves (a raw word is not a known tag
        # and would be encoded as a random class).
        line_tags = [tag for _, tag in nltk.pos_tag(line)]
        sample = [one_hot_encoding(line_tags[index + offset], list_tags).tolist()
                  for offset in offsets]

        # A batch of one sample: shape (1, number of neighbours, number of tags)
        Y_pos = NN_model.predict(np.array([sample]))
        pos_tag = list_tags[int(np.argmax(Y_pos))]  # The predicted POS tag

        # Keep the first candidate whose own POS tag matches the prediction.
        # nltk.pos_tag returns a list of (word, tag) pairs, so the tag has to
        # be extracted before it is compared.
        candidates = [word for word, _ in most_similar_list]
        best_candidate = next(
            (word for word in candidates if nltk.pos_tag([word])[0][1] == pos_tag),
            None,
        )

        if best_candidate is not None:
            predicted_words.append(best_candidate)
        else:
            predicted_words.append(candidates[0])  # Otherwise take the most similar word

    return predicted_words

# Add the POS-enhanced predictions as a new column of the results table.
# NOTE(review): `df` was filled from `unk_lines` above whereas this call uses
# `unk_test` -- confirm both hold the same number of 'unk' sentences.
df['LSTM enhancement'] = predict_unk.prediction_neighbor_with_pos(unk_test, word2vec_model, NN_model)

We can then apply this method on the unknown list, to tag the unknown words and try to detect them.

Finally, we calculate the similarity (and the Spearman rank correlation) and the analogy score on the two datasets:

In [None]:
def spearman(model):
    """
    Score the word pairs of the wordsim353 file with the model and compare
    the scores with the human ratings through the Spearman rank correlation.

    :param model: The Word2Vec model which has been trained.
    :return: the word-pair dataframe with two extra columns: 'Similarity'
             (model similarity of each pair) and 'Spearman' (the correlation,
             repeated on every row).
    """
    df_test = pd.read_csv('../word_vectors/wordsim353.csv').copy()  # Test dataframe

    def pair_similarity(row):
        # Similarity of the two words of one row according to the model
        return model.wv.similarity(row['Word 1'], row['Word 2'])

    df_test['Similarity'] = df_test.apply(pair_similarity, axis=1)

    human_scores = df_test['Human (mean)']
    model_scores = df_test['Similarity']
    spearman_rank = spearmanr(human_scores, model_scores).correlation
    df_test['Spearman'] = spearman_rank
    print(f"Spearman ranking between similarities is finished. Value is {spearman_rank!s}")

    return df_test


def analogy(model):
    """
    Predict the fourth word of every analogy question with the model.

    Each question is a row "Word 1, Analogy 1, Word 2, Analogy 2". The
    prediction is the word most similar to (Word 2 + Analogy 1 - Word 1).

    :param model: The Word2Vec model which has been trained (or its word vectors).
    :return: the dataframe of the lower-cased questions with an extra
             'Prediction' column holding the predicted word.
    """

    df_analogy = pd.read_csv('../word_vectors/questions-words.txt', sep=" ", header=None, skiprows=1)
    df_analogy = df_analogy.dropna()  # Drop rows with NaN
    df_analogy.columns = ['Word 1', 'Analogy 1', 'Word 2', 'Analogy 2']

    def to_lower(value):
        # Leave the non-string cells untouched
        return value.lower() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in recent pandas in favour of
    # DataFrame.map: use the new name when this pandas version provides it.
    elementwise = df_analogy.map if hasattr(df_analogy, 'map') else df_analogy.applymap
    df_test = elementwise(to_lower)  # New dataframe, converted to lowercase

    # Query the word vectors through `.wv`: calling most_similar on the model
    # itself is deprecated. Fall back to the object itself when word vectors
    # are passed directly.
    vectors = getattr(model, 'wv', model)

    def predict(row):
        # Best match for the analogy of one row
        return vectors.most_similar(positive=[row['Word 2'], row['Analogy 1']],
                                    negative=[row['Word 1']])[0][0]

    # For each line, we find the analogy and we write it in the column prediction.
    df_test['Prediction'] = df_test.apply(predict, axis=1)
    print("Computation of the analogies is finished.")

    return df_test


# Evaluate the trained embedding on the word-similarity dataset...
df_sim = spearman(model)

# ...and on the analogy questions.
df_analogy = analogy(model)

Marin B0UTHEMY & Kossi NEROMA, ENSAE 3A