# textsimilarity

Examples of using the textsimilarity package.

## Example 1: Get text similarity.

In [4]:
import pandas as pd 
import os, sys

# Locate the example corpus CSV inside the 'data' folder under sys.path[0].
# NOTE(review): sys.path[0] is presumably the notebook's directory here —
# confirm for the kernel you run this with.
data_dir = os.path.join(sys.path[0], 'data')
data_location = os.path.join(data_dir, 'comparison_corpus.csv')

# Read the CSV and keep the 'phrases_to_compare' column as a plain Python list.
input_data = pd.read_csv(data_location)
phrases_column = input_data['phrases_to_compare']
comparison_corpus = phrases_column.values.tolist()
comparison_corpus

['relaxing vacation',
 'martini and chocolate',
 'wedding party',
 'bridal flowers',
 'soccer game',
 'skate park']

In [5]:
from textsimilarity import text_models, rankers

# Instantiate the text model used for generating text embeddings.
bert_model = text_models.BertBaseModel()

# Pair that model with the corpus the ranker should compare against.
cosine_sim_ranker = rankers.CosineSimilarityRanker(bert_model, comparison_corpus)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def print_closest_result(target, ranker):
    """Print the first-ranked corpus phrase for ``target`` and its score.

    Args:
        target: Text phrase to compare against the ranker's corpus.
        ranker: Object exposing ``rank_on_similarity(target)``. Only the
            first item of its result is used, read as a ``(phrase, score)``
            pair. Presumably results are ordered most-similar first —
            confirm against the ranker implementation.

    Returns:
        None. The report is written to standard output.
    """
    # Only the first entry of the ranking is reported.
    top_match = ranker.rank_on_similarity(target)[0]
    print('------\nTarget text phrase: ', target)
    print('Most similar text phrase from corpus: ', top_match[0])
    print('Cosine similarity score: ', top_match[1], '\n------')

# Report the closest corpus phrase and its cosine similarity for each target.
for target_phrase in ('massage', 'football', 'girls night out'):
    print_closest_result(target_phrase, cosine_sim_ranker)

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

## Example 2: Using spell check before text similarity.

In [4]:
# Example corpus with some spelling mistakes, taken from the
# 'misspelled_phrases' column of the data loaded earlier.
misspelled_column = input_data['misspelled_phrases']
corpus_with_misspelling = misspelled_column.values.tolist()
corpus_with_misspelling

['relaxin vacatin',
 'girlz night outt',
 'weddding party',
 'bridal flowera',
 'sokcer game',
 'skate parkk']

In [5]:
from textsimilarity import clean_text

text_cleaner = clean_text.CleanText()

# Run spelling correction over every phrase, preserving corpus order.
spell_checked_phrases = [
    text_cleaner.spelling_correction(misspelled)
    for misspelled in corpus_with_misspelling
]
spell_checked_phrases

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


['relax vacation',
 'girl night out',
 'wedding party',
 'bridal flower',
 'soccer game',
 'skate park']

In [6]:
# Rebuild the ranker with the same model, now comparing against the
# spell-checked phrases.
cosine_sim_ranker = rankers.CosineSimilarityRanker(bert_model, spell_checked_phrases)

In [7]:
# Report the closest corpus phrase and its cosine similarity for each target.
for target_phrase in ('massage', 'football', 'martini and chocolate'):
    print_closest_result(target_phrase, cosine_sim_ranker)

------
Target text phrase:  massage
Most similar text phrase from corpus:  relax vacation
Cosine similarity score:  0.9411255717277527 
------
------
Target text phrase:  football
Most similar text phrase from corpus:  soccer game
Cosine similarity score:  0.9419365525245667 
------
------
Target text phrase:  martini and chocolate
Most similar text phrase from corpus:  girl night out
Cosine similarity score:  0.9597087502479553 
------


## Example 3: Using the profanity filter.

In [8]:
# Example corpus containing a phrase with profanity.
profane_text_phrase = 'go to hell'

# Append only when the phrase is absent, so re-running this cell does not
# keep adding duplicate copies to the corpus.
if profane_text_phrase not in comparison_corpus:
    comparison_corpus.append(profane_text_phrase)
comparison_corpus

['relaxing vacation',
 'girls night out',
 'wedding party',
 'bridal flowers',
 'soccer game',
 'skate park',
 'go to hell']

In [9]:
# Remove phrases that contain profanity.
# The filtered list is built first and then written back with slice
# assignment. Calling list.remove() while iterating over the same list shifts
# the remaining items left, so the phrase right after each removed one would
# never be checked. Slice assignment keeps the same list object, matching the
# in-place update this cell performed before.
comparison_corpus[:] = [
    phrase
    for phrase in comparison_corpus
    if not text_cleaner.determine_text_profanity(phrase)
]
comparison_corpus

['relaxing vacation',
 'girls night out',
 'wedding party',
 'bridal flowers',
 'soccer game',
 'skate park']