<center><h1>Base method</h1></center>
<center><h1>Text combining using MLM and Word embeddings</h1></center>

## 0) Import the libraries

In [12]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy
import transformers
from transformers import pipeline
import warnings
from transformers import logging
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

In [13]:
# Record the library versions this notebook was run with, one per line.
print(
    *(
        f"{label} version : {module.__version__}"
        for label, module in (
            ("numpy", np),
            ("pandas", pd),
            ("sklearn", sk),
            ("spacy", spacy),
            ("transformers", transformers),
        )
    ),
    sep="\n",
)

numpy version : 1.24.1
pandas version : 1.5.3
sklearn version : 1.2.1
spacy version : 3.5.0
transformers version : 4.26.1


## 1) Text Data representation and preprocessing

In [14]:
# Two variants of the same sentence; they differ in three word positions
# (love/live, video/oreo, thyme/time).
sentence1, sentence2 = (
    "I love to play video games in my free thyme",
    "I live to play oreo games in my free time",
)
(sentence1, sentence2)

('I love to play video games in my free thyme',
 'I live to play oreo games in my free time')

In [15]:
# Bag of Words
# Create a CountVectorizer object
count_vectorizer = CountVectorizer()

# Fit on the two sentences and transform them into a (2 x vocabulary) sparse count matrix
count_train = count_vectorizer.fit_transform([sentence1, sentence2])

# Visualise the sparse matrix.
# .toarray() is used instead of the `.A` shorthand, which newer SciPy releases no longer provide.
count_df = pd.DataFrame(count_train.toarray(), columns=count_vectorizer.get_feature_names_out())
count_df

Unnamed: 0,free,games,in,live,love,my,oreo,play,thyme,time,to,video
0,1,1,1,0,1,1,0,1,1,0,1,1
1,1,1,1,1,0,1,1,1,0,1,1,0


In [16]:
# getting the uncommon parts: words that occur in only one of the two sentences.
# Count the sentences containing each word (not the raw occurrences), so a word
# repeated inside a single sentence is still recognised as uncommon.
uncommon_words = count_df.loc[:, (count_df > 0).sum(axis=0) < 2].columns.tolist()
uncommon_words

['live', 'love', 'oreo', 'thyme', 'time', 'video']

In [17]:
# masking the uncommon words
# \b ... \b restricts the match to whole words (so "time" is not masked inside
# "sometimes"); IGNORECASE is needed because CountVectorizer lowercases its vocabulary.
big_regex = re.compile(
    r"\b(?:" + "|".join(map(re.escape, uncommon_words)) + r")\b",
    re.IGNORECASE,
)
masked_sentence1 = big_regex.sub("[MASK]", sentence1)
masked_sentence2 = big_regex.sub("[MASK]", sentence2)
# The last element checks that both sentences reduce to the same masked template.
masked_sentence1, masked_sentence2, masked_sentence1 == masked_sentence2

('I [MASK] to play [MASK] games in my free [MASK]',
 'I [MASK] to play [MASK] games in my free [MASK]',
 True)

## 2) Using a Masked Language Model with BERT

In [18]:
# Build the BERT fill-mask pipeline and ask it for candidates for every [MASK]
# in the masked template; the displayed result holds one list of scored
# candidates per mask.
fill_mask = pipeline(task="fill-mask", model="bert-base-uncased")
pred = fill_mask(masked_sentence1)
pred

[[{'score': 0.24123285710811615,
   'token': 2109,
   'token_str': 'used',
   'sequence': '[CLS] i used to play [MASK] games in my free [MASK] [SEP]'},
  {'score': 0.11024598032236099,
   'token': 2066,
   'token_str': 'like',
   'sequence': '[CLS] i like to play [MASK] games in my free [MASK] [SEP]'},
  {'score': 0.09037332981824875,
   'token': 2018,
   'token_str': 'had',
   'sequence': '[CLS] i had to play [MASK] games in my free [MASK] [SEP]'},
  {'score': 0.07627112418413162,
   'token': 2359,
   'token_str': 'wanted',
   'sequence': '[CLS] i wanted to play [MASK] games in my free [MASK] [SEP]'},
  {'score': 0.04367785528302193,
   'token': 4669,
   'token_str': 'liked',
   'sequence': '[CLS] i liked to play [MASK] games in my free [MASK] [SEP]'}],
 [{'score': 0.26360970735549927,
   'token': 2678,
   'token_str': 'video',
   'sequence': '[CLS] i [MASK] to play video games in my free [MASK] [SEP]'},
  {'score': 0.07980097830295563,
   'token': 2070,
   'token_str': 'some',
   'se

## 3) Combining the sentences using semantic similarity with word embeddings

In [19]:
def get_uncommon_words(sentence1, sentence2):
    """Return the words that appear in only one of the two sentences.

    Both sentences are split on whitespace and compared as exact,
    case-sensitive strings (punctuation attached to a word is part of it).

    Parameters
    ----------
    sentence1, sentence2 : str
        The sentences to compare.

    Returns
    -------
    list[list[str]]
        ``[only_in_sentence1, only_in_sentence2]``; each inner list keeps the
        order of appearance in its own sentence, repeated words included.
    """
    words1 = sentence1.split()
    words2 = sentence2.split()
    # Sets give constant-time membership tests instead of rescanning the other
    # sentence's word list for every word.
    vocabulary1 = set(words1)
    vocabulary2 = set(words2)
    return [
        [word for word in words1 if word not in vocabulary2],
        [word for word in words2 if word not in vocabulary1],
    ]

In [20]:
nlp = spacy.load("en_core_web_md")  # Load pre-trained word embeddings

# Kept under its own name so the flat `uncommon_words` list used by the masking
# cell is not overwritten with a list of lists.
uncommon_words_per_sentence = get_uncommon_words(sentence1, sentence2)
words_from_sentence1, words_from_sentence2 = uncommon_words_per_sentence

# Each mask must have exactly one candidate word from each sentence.
assert len(pred) == len(words_from_sentence1) == len(words_from_sentence2), (
    "number of masks and number of uncommon words per sentence differ"
)

selected_words = []
for mask_predictions, string1, string2 in zip(pred, words_from_sentence1, words_from_sentence2):
    # Parse every string once; the parsed docs are reused for both comparisons.
    candidate_docs = [nlp(prediction["token_str"]) for prediction in mask_predictions]
    doc1, doc2 = nlp(string1), nlp(string2)

    # Calculate the mean semantic similarity between each original word and the model's suggestions
    similarity1 = np.mean([candidate.similarity(doc1) for candidate in candidate_docs])
    similarity2 = np.mean([candidate.similarity(doc2) for candidate in candidate_docs])

    # Keep the word closer to what the language model expects; ties go to sentence2's word
    selected_words.append(string1 if similarity1 > similarity2 else string2)

# Fill the masks left to right with the selected words
combined_sentence = masked_sentence1
for word in selected_words:
    combined_sentence = combined_sentence.replace("[MASK]", word, 1)
combined_sentence

## 4) Full function

In [21]:
def _pick_closest_word(nlp, candidates, string1, string2):
    """Return whichever of string1/string2 is on average more similar to the candidates."""
    doc1, doc2 = nlp(string1), nlp(string2)
    candidate_docs = [nlp(candidate["token_str"]) for candidate in candidates]
    similarity1 = np.mean([doc.similarity(doc1) for doc in candidate_docs])
    similarity2 = np.mean([doc.similarity(doc2) for doc in candidate_docs])
    # Ties go to the second sentence's word.
    return string1 if similarity1 > similarity2 else string2


def _merge_uncommon_words(sentence1, sentence2, uncommon_words, fill_mask, nlp):
    """Rebuild sentence1 with every uncommon word replaced by the better of the two variants."""
    if not uncommon_words:
        # Nothing differs; an empty alternation would otherwise match between every character.
        return sentence1

    # Whole-word, case-insensitive matching mirrors how CountVectorizer built
    # the (lowercased) vocabulary.
    big_regex = re.compile(
        r"\b(?:" + "|".join(map(re.escape, uncommon_words)) + r")\b",
        re.IGNORECASE,
    )
    # Taking the candidates from the same regex that does the masking keeps
    # them aligned, in order, with the [MASK] positions.
    words1 = big_regex.findall(sentence1)
    words2 = big_regex.findall(sentence2)
    if not words1:
        # sentence1 has nothing to mask, so there is nothing to predict.
        return sentence1

    masked_sentence = big_regex.sub("[MASK]", sentence1)

    # Load the models only when the caller did not supply them.
    if fill_mask is None:
        fill_mask = pipeline("fill-mask", model="bert-base-uncased")
    if nlp is None:
        nlp = spacy.load("en_core_web_md")

    pred = fill_mask(masked_sentence)
    # With a single [MASK] the pipeline returns one flat list of candidates
    # instead of one list per mask; normalise to the nested form.
    if pred and isinstance(pred[0], dict):
        pred = [pred]

    selected_words = []
    for position, (candidates, string1) in enumerate(zip(pred, words1)):
        if position < len(words2):
            selected_words.append(_pick_closest_word(nlp, candidates, string1, words2[position]))
        else:
            # sentence2 has no counterpart at this position: keep sentence1's word.
            selected_words.append(string1)

    # Substitute on the original sentence, one selected word per match; any
    # match without a selection keeps its original text.
    replacements = iter(selected_words)
    return big_regex.sub(lambda match: next(replacements, match.group(0)), sentence1)


def text_combining(sentence1, sentence2, fill_mask=None, nlp=None):
    """Combine two variants of a sentence into one and print the result.

    Words occurring in only one of the sentences are masked in ``sentence1``.
    A masked language model proposes candidates for each mask, and at every
    position the variant word (from ``sentence1`` or ``sentence2``) with the
    higher mean embedding similarity to those candidates is kept. Variant
    words are paired by their order of appearance in each sentence.

    Parameters
    ----------
    sentence1, sentence2 : str
        The two sentence variants. ``sentence1`` provides the template.
    fill_mask : callable, optional
        A fill-mask pipeline using ``[MASK]`` as mask token. Defaults to a
        freshly loaded ``bert-base-uncased`` pipeline.
    nlp : callable, optional
        A spaCy pipeline with word vectors. Defaults to a freshly loaded
        ``en_core_web_md``.

    Returns
    -------
    str
        The combined sentence (also printed together with both inputs).
    """
    count_vectorizer = CountVectorizer()
    count_train = count_vectorizer.fit_transform([sentence1, sentence2])
    count_df = pd.DataFrame(count_train.toarray(), columns=count_vectorizer.get_feature_names_out())

    # Words contained in only one of the two sentences (document frequency < 2).
    uncommon_words = count_df.loc[:, (count_df > 0).sum(axis=0) < 2].columns.tolist()

    combined_sentence = _merge_uncommon_words(sentence1, sentence2, uncommon_words, fill_mask, nlp)

    print("Original Sentence 1: ", sentence1)
    print("Original Sentence 2: ", sentence2)
    print("Combined Sentence: ", combined_sentence)
    return combined_sentence

# Reuse the models loaded in the earlier cells instead of loading them again.
final_sentence = text_combining(sentence1, sentence2, fill_mask=fill_mask, nlp=nlp)

Original Sentence 1:  I love to play video games in my free thyme
Original Sentence 2:  I live to play oreo games in my free time
Combined Sentence:  I love to play video games in my free time
