# NLP snippets

As part of a project at work investigating various natural language processing problems, interesting snippets and articles are documented here.

## Sentence similarity

https://datascience.stackexchange.com/questions/23969/sentence-similarity-prediction

In [None]:
# https://github.com/fedecaccia/text_clustering/blob/master/simple_text_recognition.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy

# Toy corpus to cluster: seven short documents.
texts = ["This first text talks about houses and dogs",
        "This is about airplanes and airlines",
        "This is about dogs and houses too, but also about trees",
        "Trees and dogs are main characters in this story",
        "This story is about batman and superman fighting each other",
        "Nothing better than another story talking about airplanes, airlines and birds",
        "Superman defeats batman in the last round"]

# Vectorization of the texts (TF-IDF, English stop words removed).
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)
# Used words (the axes of our multi-dimensional space).
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()
print("words", words)


n_clusters = 3
number_of_seeds_to_try = 10
max_iter = 300
# NOTE: KMeans no longer accepts n_jobs (removed in scikit-learn 1.0), so the
# former number_of_process setting is not passed any more.
model = KMeans(
    n_clusters=n_clusters,
    max_iter=max_iter,
    n_init=number_of_seeds_to_try,
).fit(X)

labels = model.labels_
# Per cluster: vocabulary indices ordered from the largest to the smallest
# centre coordinate, i.e. the preferred words of each cluster first.
ordered_words = model.cluster_centers_.argsort()[:, ::-1]

print("centers:", model.cluster_centers_)
print("labels", labels)
print("intertia:", model.inertia_)

# Number of training texts assigned to each cluster.
texts_per_cluster = numpy.bincount(labels, minlength=n_clusters)


def print_cluster_summary(i_cluster):
    """Print the text count and the ten highest-weighted words of one cluster."""
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        print("\t" + words[term])


print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print_cluster_summary(i_cluster)

print("\n")
print("Prediction")

text_to_predict = "Why batman was defeated  by superman so easy?"
# Reuse the fitted vocabulary; transform() expects an iterable of documents.
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
# Count the new text as a member of the cluster it was assigned to.
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
print_cluster_summary(predicted_cluster)

### Word Mover's Distance

http://proceedings.mlr.press/v37/kusnerb15.pdf
https://radimrehurek.com/gensim/models/keyedvectors.html

![](https://i.stack.imgur.com/DjJW1.png)

### Doc2vec solution

https://rare-technologies.com/doc2vec-tutorial/
You can train your doc2vec model by following this link. You may want to perform some pre-processing steps, like removing all stop words (words like "the", "an", etc. that don't add much meaning to the sentence). Once you have trained your model, you can find similar sentences using the following code.

import gensim  

model = gensim.models.Doc2Vec.load('saved_doc2vec_model')  

new_sentence = "I opened a new mailbox".split(" ")  
model.docvecs.most_similar(positive=[model.infer_vector(new_sentence)],topn=5)
Results:

[('TRAIN_29670', 0.6352514028549194),
 ('TRAIN_678', 0.6344441771507263),
 ('TRAIN_12792', 0.6202734708786011),
 ('TRAIN_12062', 0.6163255572319031),
 ('TRAIN_9710', 0.6056315898895264)]
The above results are a list of tuples of the form (label, cosine_similarity_score). You can map the outputs back to sentences by indexing the training data, e.g. train[29670].

## Sentence splitting

Splitting texts into sentences

In [5]:
import re
# Regex fragments used by split_into_sentences().  They stay plain pattern
# strings (not compiled objects) because the function concatenates them into
# larger patterns.  Raw strings keep "\s" from being an invalid str escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|me|edu)"
digits = r"([0-9])"


def split_into_sentences(text):
    """Split *text* into a list of sentences using punctuation heuristics.

    Periods that do not end a sentence (titles such as "Dr.", initials,
    acronyms, company suffixes, web domains, decimal numbers, "e.g.",
    "i.e.", "Ph.D." and ellipses) are temporarily replaced by the marker
    ``<prd>``.  Every remaining ".", "?" or "!" is then followed by a
    ``<stop>`` marker, on which the text is finally split.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        A list of stripped, non-empty sentence strings.  Trailing text that
        has no terminal punctuation is returned as the last sentence rather
        than being discarded.
    """
    # Padding lets the space-anchored patterns match at both ends of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are part of a token rather than a sentence end.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Same for a company suffix followed by a sentence starter.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)

    # Move terminal punctuation outside closing quotes so that the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    text = text.replace("e.g.", "e<prd>g<prd>")
    text = text.replace("i.e.", "i<prd>e<prd>")
    text = text.replace("...", "<prd><prd><prd>")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Drop only empty segments (the padding after the final terminator); an
    # unterminated trailing sentence is kept instead of being thrown away.
    segments = (segment.strip() for segment in text.split("<stop>"))
    return [segment for segment in segments if segment]

In [1]:
# Load the raw document text that the following cells work on.
fname = "TextSummary/2173_001.txt"
with open(fname, "r") as source_file:
    text = source_file.read()
# Optional post-processing, currently disabled:
# text = split_into_sentences(text)
# text = str(text).strip('[]')
# text = text.replace("\'", "")

In [46]:
# Check which type `text` currently has.
type(text)

str

In [47]:
# Display the current value of `text`.
text 
# Disabled leftover: text = ', '.join(text)


' \n \n \n \nQuestions and Answers \nApplication of the AIFMD \n \n \n \n4 October 2018 | ESMA34-32-352 \n \nDate: 4 October 2018 \nESMA34-32-352 \nContents \nSection I: Remuneration ..............................................................................................5 \nSection II: Notifications of AIFs ...................................................................................9 \nSection III: Reporting to national competent authorities under Articles 3, 24 and 42 . 11 \nSection IV: Notification of AIFMs ............................................................................... 28 \nSection V: MiFID services under Article 6(4) of the AIFMD ....................................... 30 \nSection VI: Depositaries ............................................................................................ 31 \nSection VII: Calculation of leverage ........................................................................... 34 \nSection VIII: Delegation ..................

In [48]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
 
# Print a summary of `text` produced by gensim's summarize() with ratio=0.02.
# NOTE(review): the gensim.summarization module was removed in gensim 4.0, so
# this cell presumably requires gensim < 4 -- confirm the pinned version.
print("Summary:")
print(summarize(text, ratio=0.02))

Summary:
Question 6 [last update 16 November 2016]: An AIF is marketed in a host Member State by 
Question 1 [last update 16 December 2016]: When a non-EU AIFM reports information to 
Answer 1: When a non-EU AIFM reports information to the national competent authorities of 
Question 3 [last update 25 March 2014]: Which period should AIFMs use when reporting 
Answer 8: AIFMs should aggregate the market value of all securities traded and report the 
Question  11  [last  update  25  March  2014]:  How  should  AIFMs  report  the  information  on 
Question  11  [last  update  25  March  2014]:  How  should  AIFMs  report  the  information  on 
Question 13 [last update 25 March 2014]: Should AIFMs report the information in English or 
Question  20 [last  update  27  June  2014]:  According  to  Article  24(2)  of the  AIFMD,  AIFMs 
reporting template for AIF-specific information) or value of turnover in each asset class over 
Question 25 [last update 21 July 2014]: AIFMs have to report val

In [2]:
from nltk import tokenize
# NOTE(review): `p` is a sample string that is never used below; the call
# tokenizes `text`, not `p`.
p = "Good morning Dr. Adams. The patient is waiting for you in room number 3."
# Split `text` into sentences with NLTK's sentence tokenizer.
tokenize.sent_tokenize(text)


[' \n \n \n \nQuestions and Answers \nApplication of the AIFMD \n \n \n \n4 October 2018 | ESMA34-32-352 \n \nDate: 4 October 2018 \nESMA34-32-352 \nContents \nSection I: Remuneration ..............................................................................................5 \nSection II: Notifications of AIFs ...................................................................................9 \nSection III: Reporting to national competent authorities under Articles 3, 24 and 42 .',
 '11 \nSection IV: Notification of AIFMs ............................................................................... 28 \nSection V: MiFID services under Article 6(4) of the AIFMD ....................................... 30 \nSection VI: Depositaries ............................................................................................ 31 \nSection VII: Calculation of leverage ........................................................................... 34 \nSection VIII: Delegation .............

In [7]:
import numpy
# Print the installed NumPy version.
print(numpy.__version__)

1.14.0


In [4]:


# I suspect you don't just want the most common phrases, but rather you want the most interesting collocations. Otherwise, you could end up with an overrepresentation of phrases made up of common words and fewer interesting and informative phrases.

# To do this, you'll essentially want to extract n-grams from your data and then find the ones that have the highest pointwise mutual information (PMI). That is, you want to find the words that co-occur much more often than you would expect them to by chance.

# The NLTK collocations how-to covers how to do this in about 7 lines of code, e.g.:

import nltk
nltk.download('genesis')
  
from nltk.collocations import *
# Association-measure helpers; only the bigram measures are used below.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Word stream to analyse -- swap this for your own data.
corpus_words = nltk.corpus.genesis.words('english-web.txt')
finder = BigramCollocationFinder.from_words(corpus_words)

# Keep only the bigrams that occur at least 3 times.
minimum_frequency = 3
finder.apply_freq_filter(minimum_frequency)

# The 10 bigrams scoring highest on the PMI measure.
finder.nbest(bigram_measures.pmi, 10)

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/maartenkool/nltk_data...
[nltk_data]   Package genesis is already up-to-date!


[('Beer', 'Lahai'),
 ('Lahai', 'Roi'),
 ('gray', 'hairs'),
 ('Most', 'High'),
 ('ewe', 'lambs'),
 ('many', 'colors'),
 ('burnt', 'offering'),
 ('Paddan', 'Aram'),
 ('east', 'wind'),
 ('living', 'creature')]