In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install required libraries (gensim provides the Word2Vec implementation imported below).
!pip install gensim

In [None]:
#imports
import pandas as pd
import numpy as np
import gensim
import nltk
import re
import string
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from itertools import chain
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

In [None]:
# Load the three dataset files (PQA-L, PQA-A, PQA-U) from parquet on the mounted Drive.
pqa_l = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-L.parquet')
pqa_a = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-A.parquet')
pqa_u = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-U.parquet')

# Learn a word embedding using Gensim Word2Vec

## Pre-processing step


In [None]:
def string_dataframe(df):
    """Convert the text columns of ``df`` to plain strings, in place.

    ``context`` cells that are dicts holding a ``'contexts'`` entry are
    flattened by joining the items of that entry with single spaces; any
    other ``context`` value is passed through ``str``. The ``question`` and
    ``long_answer`` columns are passed through ``str`` as well.

    The frame is modified in place and the same object is returned.
    """
    def _flatten_context(cell):
        # Dict-shaped cells keep their passages under the 'contexts' key.
        if isinstance(cell, dict) and 'contexts' in cell:
            return ' '.join(cell['contexts'])
        return str(cell)

    df['context'] = df['context'].apply(_flatten_context)

    for column in ('question', 'long_answer'):
        df[column] = df[column].apply(str)

    return df

# Normalise the text columns of every dataset to plain strings
# (string_dataframe edits each frame in place and returns it).
pqa_l, pqa_a, pqa_u = (string_dataframe(frame) for frame in (pqa_l, pqa_a, pqa_u))

In [None]:
# Fetch the NLTK resources this notebook downloads: stop-word list, punkt tokenizers, WordNet.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Shared pre-processing objects; stop_words is a set so membership tests are fast.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Lower-case ``text`` and drop stop words and one-character tokens.

    Tokens are obtained by splitting on whitespace, so punctuation stays
    attached to the neighbouring word. Neither stemming nor lemmatization
    is applied. Stop words are looked up in the module-level ``stop_words``.

    Returns the surviving tokens re-joined with single spaces.
    """
    lowered = text.lower()
    # Insert a space between a run of digits and the letter glued to it ("5mg" -> "5 mg").
    spaced = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', lowered)

    kept = []
    for token in spaced.split():
        if token in stop_words:
            continue  # drop stop words
        if len(token) <= 1:
            continue  # drop tokens that are too short
        kept.append(token)
    return " ".join(kept)


def preprocess_dataset(df, columns):
    """Run ``preprocess`` over every cell of the given columns of ``df``.

    The listed columns are overwritten in place and ``df`` itself is returned.
    """
    for name in columns:
        cleaned = df[name].apply(preprocess)
        df[name] = cleaned
    return df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Text columns that are run through preprocess().
columns_to_preprocess = ['question', 'context', 'long_answer']

# preprocess_dataset overwrites the listed columns in place and returns the same frame.
pqa_l = preprocess_dataset(pqa_l, columns_to_preprocess)
pqa_u = preprocess_dataset(pqa_u, columns_to_preprocess)
pqa_a = preprocess_dataset(pqa_a, columns_to_preprocess)

## save
# Write the cleaned frames to CSV so later cells can reload them;
# index=False leaves the row index out of the files.
pqa_l.to_csv('/content/drive/MyDrive/NLP/Assignment/pqa_l_clean.csv', index=False)
pqa_u.to_csv('/content/drive/MyDrive/NLP/Assignment/pqa_u_clean.csv', index=False)
pqa_a.to_csv('/content/drive/MyDrive/NLP/Assignment/pqa_a_clean.csv', index=False)


## Load clean dataset

In [None]:
# Reload the cleaned CSV files written by the save step above.
pqa_l = pd.read_csv('/content/drive/MyDrive/NLP/Assignment/pqa_l_clean.csv')
pqa_u = pd.read_csv('/content/drive/MyDrive/NLP/Assignment/pqa_u_clean.csv')
pqa_a = pd.read_csv('/content/drive/MyDrive/NLP/Assignment/pqa_a_clean.csv')
# Uncomment to also load the original (uncleaned) parquet files:
#pqa_l_original = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-L.parquet')
#pqa_u_original = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-U.parquet')
#pqa_a_original = pd.read_parquet('/content/drive/MyDrive/NLP/Assignment/PQA-A.parquet')

In [None]:
import IPython
from IPython.display import display, HTML

# Preview the first two rows of the cleaned text columns as an HTML table.
display(HTML(pqa_l[['question', 'context', 'long_answer']].head(2).to_html()))

## Further pre-process for Word2Vec

Prepare the data so that it can be fed to Word2Vec:
1. Merge the text columns of each row into one big string (also removing numbers).
2. Split the text into single sentences (to get better context information).
3. Tokenize each sentence.
4. Fit the Word2Vec model.


In [None]:
#1
def remove_numbers(text):
    """Return ``text`` with every run of digits removed."""
    return re.sub(r'\d+', '', text)

def unify_columns_text(df, columns):
    """Merge the given columns of each row into one digit-free string.

    Args:
        df: DataFrame holding the text columns; it is not modified.
        columns: list of column names to merge, in the order they should
            appear in the merged text.

    Returns:
        A list with one string per row of ``df``: the row's cells joined by
        single spaces, with all digits removed. Missing cells contribute an
        empty string. An empty frame yields an empty list.
    """
    # Missing cells become empty strings; astype(str) makes the join safe even
    # when a cell is not a string (e.g. a numeric value), which would otherwise
    # raise a TypeError inside " ".join.
    cells = df[columns].fillna("").astype(str)
    # Iterating plain tuples avoids the per-row Series that DataFrame.apply(axis=1)
    # builds.
    return [
        remove_numbers(" ".join(row))
        for row in cells.itertuples(index=False, name=None)
    ]

#2
def split_sentences(text_list):
    """Split every document into sentences and return them as one flat list.

    Args:
        text_list: iterable of document strings.

    Returns:
        A flat list of sentence strings; document boundaries are not kept.
        Newlines are replaced by spaces first, then each document is cut
        wherever '?', '!' or '.' is followed by a whitespace character. The
        punctuation mark and that whitespace character are dropped; a mark
        at the very end of a document stays attached to its sentence.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    sentence_boundary = re.compile(r'[?!.]\s')
    return [
        sentence
        for doc in text_list
        for sentence in sentence_boundary.split(doc.replace('\n', ' '))
    ]

#3
def tokenize_sentences(sentences):
    """Turn sentence strings into lists of lower-case word tokens.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list of token lists. Every non-word character is treated as a
        separator, tokens are lower-cased, and sentences that end up with
        fewer than two tokens are discarded.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    non_word = re.compile(r'\W')
    tokenized = (non_word.sub(' ', sentence).lower().split() for sentence in sentences)
    # Drop sentences that are only one word long (or empty).
    return [tokens for tokens in tokenized if len(tokens) > 1]

def preprocess_word2vec(df, columns):
    """Turn the text columns of ``df`` into tokenized sentences for Word2Vec.

    Chains the three preparation steps: merge the columns into one text per
    row (step 1), split the texts into sentences (step 2), and tokenize each
    sentence (step 3). Returns the list of token lists produced by step 3.
    """
    return tokenize_sentences(split_sentences(unify_columns_text(df, columns)))


# Build the Word2Vec training sentences of each dataset from its three text columns.
tokenized_sentences_l = preprocess_word2vec(pqa_l, ['question', 'context', 'long_answer'])
tokenized_sentences_u = preprocess_word2vec(pqa_u, ['question', 'context', 'long_answer'])
tokenized_sentences_a = preprocess_word2vec(pqa_a, ['question', 'context', 'long_answer'])

# Print the type of one sentence and the first 10 tokenized sentences to check the result.
print('tokenized_sentences type', type(tokenized_sentences_l[0]))
for sentence in tokenized_sentences_l[:10]: print(sentence)


tokenized_sentences type <class 'list'>
['mitochondria', 'play', 'role', 'remodelling', 'lace', 'plant', 'leaves', 'programmed', 'cell', 'death']
['programmed', 'cell', 'death', 'pcd', 'regulated', 'death', 'cells', 'within', 'organism']
['lace', 'plant', 'aponogeton', 'madagascariensis', 'produces', 'perforations', 'leaves', 'pcd']
['leaves', 'plant', 'consist', 'latticework', 'longitudinal', 'transverse', 'veins', 'enclosing', 'areoles']
['pcd', 'occurs', 'cells', 'center', 'areoles', 'progresses', 'outwards', 'stopping', 'approximately', 'five', 'cells', 'vasculature']
['role', 'mitochondria', 'pcd', 'recognized', 'animals', 'however', 'less', 'studied', 'pcd', 'plants']
['following', 'paper', 'elucidates', 'role', 'mitochondrial', 'dynamics', 'developmentally', 'regulated', 'pcd', 'vivo', 'a']
['single', 'areole', 'within', 'window', 'stage', 'leaf', 'pcd', 'occurring', 'divided', 'three', 'areas', 'based', 'progression', 'pcd', 'cells', 'undergo', 'pcd', 'npcd', 'cells', 'early', 

## Fitting Word2Vec model and inspecting the results

In [None]:
# Train one Word2Vec model per dataset on its tokenized sentences, all with the same settings:
# 30-dimensional vectors (vector_size) and, going by gensim's parameter names, a minimum
# word count of 5 (min_count) and a context window of 10 (window) — see the gensim docs
# for the exact semantics.
model_l = Word2Vec(tokenized_sentences_l, vector_size=30, min_count=5, window=10)
model_u = Word2Vec(tokenized_sentences_u, vector_size=30, min_count=5, window=10)
model_a = Word2Vec(tokenized_sentences_a, vector_size=30, min_count=5, window=10)

In [None]:
# Compare how many words ended up in the vocabulary of each model.
print('Vocabulary sizes l', len(model_l.wv))
print('Vocabulary sizes u', len(model_u.wv))
print('Vocabulary sizes a', len(model_a.wv))

Vocabulary sizes l 5154
Vocabulary sizes u 42577
Vocabulary sizes a 82077


We can inspect the embedding vector.

In [None]:
# Look up the vector learned by the PQA-L model for a single term.
term = 'aspirin'
model_l.wv[term]

array([ 0.03296621, -0.04954494,  0.04169749, -0.03452522,  0.03445226,
        0.0392454 , -0.00436175, -0.00828039,  0.00996918, -0.00132985,
        0.01424457,  0.02951571, -0.01751071, -0.06317917, -0.04099447,
       -0.02153618,  0.04535486, -0.00774344, -0.01258756, -0.00066269,
       -0.03251386,  0.00156081,  0.04483754,  0.0416584 ,  0.04619638,
        0.01046319,  0.03548427,  0.02508346,  0.040889  , -0.01658843],
      dtype=float32)

We can also look up the most similar words for any term in the vocabulary.

In [None]:
# Inspect the words most similar to `term` according to each of the three models.
term = 'blood'
print(model_l.wv.most_similar(term))
print(model_u.wv.most_similar(term))
print(model_a.wv.most_similar(term))

[('systolic', 0.9923658967018127), ('pressure', 0.9906083345413208), ('correlation', 0.987547755241394), ('side', 0.987537145614624), ('flow', 0.9873744249343872), ('pulse', 0.9872098565101624), ('d', 0.9871268272399902), ('arterial', 0.9870368242263794), ('fasting', 0.9865679144859314), ('level', 0.9862507581710815)]
[('phlebotomy', 0.7282387614250183), ('rbcs', 0.6951645612716675), ('gas', 0.6904792189598083), ('gases', 0.6882675886154175), ('rbc', 0.6870641112327576), ('hematocrit', 0.6866135597229004), ('hemoglobin', 0.679595410823822), ('oxygen', 0.6727662086486816), ('cvp', 0.6724663376808167), ('transfusion', 0.6673768758773804)]
[('plasma', 0.6359590888023376), ('hemolysis', 0.6176166534423828), ('erpf', 0.6175201535224915), ('qmrv', 0.6090781092643738), ('circulation', 0.5985824465751648), ('arterial', 0.593052864074707), ('hydrostatic', 0.5906599164009094), ('barometric', 0.5837988257408142), ('range', 0.5804845690727234), ('hyperemia', 0.5749706625938416)]


As we can see, the most similar words refer to blood pressure, heart disease and related concepts.



---



We can also compute cosine similarity between terms.<br>
Let's try with 'aspirin' and 'ibuprofen', which are both NSAIDs (non-steroidal anti-inflammatory drugs).

In [None]:
# Similarity between the two terms according to each of the three models.
term1= 'aspirin'
term2 = 'ibuprofen'
print(model_l.wv.similarity(term1, term2))
print(model_u.wv.similarity(term1, term2))
print(model_a.wv.similarity(term1, term2))

0.7323977
0.6478078
0.70253474


However, their similarity is not very high...<br>
This is probably because these two medications are usually not prescribed together, so they do not appear frequently in the same textual context.



---



Furthermore, we can group words semantically and find the one that does not belong to the group.

In [None]:
# Three organ names plus one medication: each model is asked which term does not fit.
term1 = 'liver'
term2 = 'heart'
term3 = 'kidney'
term4 = 'aspirin'


print('Doesn\'t match (l):', model_l.wv.doesnt_match([term1, term2, term3, term4]))
print('Doesn\'t match (u):', model_u.wv.doesnt_match([term1, term2, term3, term4]))
print('Doesn\'t match (a):', model_a.wv.doesnt_match([term1, term2, term3, term4]))

Doesn't match (l): aspirin
Doesn't match (u): aspirin
Doesn't match (a): aspirin


In this case, we provided three organs and one medication. All three models correctly identified the item that did not belong to the semantic group.



---



Another possible analysis is to show the results in a 3D plot, and highlight some correlated terms to see if they are close together.
We can do this using t-SNE for dimensionality reduction, and a library to plot the results.

In [None]:
def plot_tsne(words, model, perplexity=3, random_state=None, axis_range=(-400, 400)):
    """Project the embeddings of ``words`` to 3D with t-SNE and show a labelled scatter plot.

    Args:
        words: terms to plot. Each one is looked up with ``model.wv[words]``,
            so every term has to be in the model vocabulary.
        model: trained model exposing its word vectors as ``model.wv``.
        perplexity: t-SNE perplexity (default 3, the value that used to be
            hard-coded). scikit-learn requires it to be smaller than the
            number of plotted words.
        random_state: seed forwarded to t-SNE. Pass an int to get the same
            layout on every run; ``None`` keeps the previous unseeded behaviour.
        axis_range: ``(min, max)`` applied to all three plot axes (default
            ``(-400, 400)``, the value that used to be hard-coded).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    vectors = model.wv[words]
    # Reduce the embeddings to 3 dimensions.
    reducer = TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
    vectors_3d = reducer.fit_transform(vectors)
    # One coordinate array per axis.
    x, y, z = np.transpose(vectors_3d)
    fig = px.scatter_3d(
        x=x, y=y, z=z,
        range_x=axis_range, range_y=axis_range, range_z=axis_range,
        text=words,
    )
    fig.update_traces(marker=dict(size=3, line=dict(width=2)), textfont_size=10)
    fig.show()

# First we choose some related words, e.g. organ names.
words = ['heart', 'kidney', 'lung', 'liver', 'brain']

# One plot per model.
plot_tsne(words, model_l)
plot_tsne(words, model_u)
plot_tsne(words, model_a)

To judge whether these points are actually close to each other (what matters is their relative position, not the absolute one), I also plot 200 random words from the vocabulary.

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np

def plot_tsne_context_colored(model, focus_words, n_random=200, perplexity=30, random_state=None):
    """Plot ``focus_words`` in 3D (t-SNE) against a random sample of other vocabulary words.

    Focus words are drawn in red and the sampled context words in light gray,
    so the relative closeness of the focus words can be judged.

    Args:
        model: trained model exposing its word vectors as ``model.wv``.
        focus_words: terms to highlight; terms missing from the vocabulary
            are silently skipped.
        n_random: number of other vocabulary words to sample as background
            (capped at the number of available words).
        perplexity: t-SNE perplexity (default 30, the value that used to be
            hard-coded). scikit-learn requires it to be smaller than the
            total number of plotted words.
        random_state: seed used for both the word sample and t-SNE. Pass an
            int for a reproducible figure; ``None`` keeps the previous
            behaviour (global NumPy random state, unseeded t-SNE).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    vocab = list(model.wv.index_to_key)
    # Set lookups avoid scanning the whole vocabulary list for every membership test.
    vocab_lookup = set(vocab)

    # Keep only the focus words the model actually knows.
    focus_words = [w for w in focus_words if w in vocab_lookup]
    focus_lookup = set(focus_words)

    # Random background sample drawn from the rest of the vocabulary.
    context_words = [w for w in vocab if w not in focus_lookup]
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sampled_words = rng.choice(
        context_words, size=min(n_random, len(context_words)), replace=False
    ).tolist()

    all_words = focus_words + sampled_words
    vectors = model.wv[all_words]
    reducer = TSNE(
        n_components=3,
        perplexity=perplexity,
        init='pca',
        learning_rate='auto',
        random_state=random_state,
    )
    vectors_3d = reducer.fit_transform(vectors)
    x, y, z = np.transpose(vectors_3d)
    labels = ['focus' if word in focus_lookup else 'context' for word in all_words]

    # Long-form frame so plotly can colour the points by their type.
    df = pd.DataFrame({
        'x': x,
        'y': y,
        'z': z,
        'word': all_words,
        'type': labels
    })

    fig = px.scatter_3d(df, x='x', y='y', z='z', color='type', text='word',
                        color_discrete_map={'focus': 'red', 'context': 'lightgray'})
    fig.update_traces(marker=dict(size=3, line=dict(width=1)), textfont_size=9)
    fig.show()

# Plot the organ names against the default 200 random vocabulary words for the PQA-L model.
plot_tsne_context_colored(model_l, ['heart', 'kidney', 'lung', 'liver', 'brain'])


As we can see, the focus words (the red ones) are close to each other.



---



In conclusion, the dataset used in this analysis is quite domain-specific and not very general.  As a result, the Word2Vec embeddings are less precise compared to those trained on large, general-purpose text.