In [None]:
import json
import spacy
import nltk
from difflib import SequenceMatcher
from spacy import displacy

# Data Fetching & Preprocessing

In [None]:
# Location of the idiom repository JSON file.
dataset_path = '/content/idiom_repository_all.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default. Assumes the file is UTF-8 encoded (the JSON interchange
# default) — TODO confirm for this dataset.
with open(dataset_path, 'r', encoding='utf-8') as file:
  data=json.load(file)

In [None]:
# Number of top-level items loaded from the JSON file.
len(data)

9721

In [None]:
# Cleaning step: keep only the items whose "entry" value is not an empty list,
# then show how many items remain.
data = list(filter(lambda record: record["entry"] != [], data))
len(data)

9514

In [None]:
# @title
# Build test data: map each usage sentence to the idiom it is listed under.
test_data={}
for idiom in data:
  entries= idiom['entry']
  for e in entries:
    # Only the first element of e['usages'] is read.
    # NOTE(review): assumes that element is an iterable of sentence strings —
    # confirm against the dataset schema (a plain string here would be
    # iterated character by character).
    usages=e['usages'][0]
    for sent in usages:
      # A sentence that occurs under several idioms keeps the last idiom seen.
      test_data[sent]=idiom['idiom']
# NOTE(review): this displays the whole mapping; a small sample may be enough.
test_data

# Detection Algorithm

In [None]:
# Load spaCy's "en_core_web_sm" pipeline once; the functions in this notebook
# call the module-level `nlp` object to parse text.
nlp = spacy.load("en_core_web_sm")

In [None]:
sentence = "John was painting the big town red"
doc = nlp(sentence)

# Visualize the dependency parse tree inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

# Save the dependency parse tree visualization as an image.
# With jupyter=False, displacy.render returns the markup as a string instead of
# displaying it, so it must be written to disk explicitly. The "dep" visualizer
# produces SVG markup, hence the .svg extension.
svg = displacy.render(doc, style="dep", jupyter=False, options={"distance": 90})
image_path = "dependency_parse_tree.svg"
with open(image_path, "w", encoding="utf-8") as image_file:
    image_file.write(svg)

# displacy.serve() is intentionally not called here: it starts a blocking web
# server that has to be interrupted by hand, which stalls "Run All".


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
#look for phrase in its respective part of speech dataset

def tag_idioms(phrase,pos, dataset):
    """Return the idiom in ``dataset`` that best fuzzy-matches ``phrase``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` on the lower-cased
    idiom and phrase. A candidate is accepted only if its score is strictly
    greater than a threshold: 0.84 for phrases of at most two
    whitespace-separated words, 0.67 otherwise.

    Args:
        phrase: Candidate phrase text.
        pos: Part-of-speech label of the phrase. It is not used by the
            matching itself and is kept for call compatibility.
        dataset: Iterable of mappings that each have an ``"idiom"`` string.

    Returns:
        The best-scoring idiom string, or ``None`` when no candidate exceeds
        the threshold. On equal scores the earliest candidate is kept.
    """
    # Short phrases need a stricter threshold (more accuracy for 2 word phrases).
    best_score = 0.84 if len(phrase.split()) <= 2 else 0.67
    # Lower-case the phrase once instead of once per candidate.
    target = phrase.lower()
    best_idiom = None
    for idiom_data in dataset:
        candidate = idiom_data["idiom"]
        # Argument order (idiom first, phrase second) is kept as-is because
        # SequenceMatcher's ratio can depend on it.
        score = SequenceMatcher(None, candidate.lower(), target).ratio()
        if score > best_score:
            best_score = score
            best_idiom = candidate
    return best_idiom

In [None]:
# to reduce look up time we divide the data set on pos

def reduce_dataset(dataset,pos):
    """Return the idioms that have at least one entry for the given part of speech.

    Each idiom is included at most once, even when several of its entries
    match, so the result contains no duplicates caused by repeated entries.

    Args:
        dataset: Iterable of idiom mappings with an ``'entry'`` value whose
            items each have a ``'pos'`` value.
        pos: Part-of-speech label to look for.

    Returns:
        A new list of the matching idiom mappings, in their original order.
    """
    newdataset = []
    for idiom in dataset:
        # `or []` keeps the tolerance for a falsy 'entry' value (e.g. None).
        entries = idiom['entry'] or []
        # Membership test on entry['pos'].
        # NOTE(review): if 'pos' is stored as a plain string this is a
        # substring test (e.g. 'noun' would also match 'pronoun') — confirm
        # against the dataset schema.
        if any(pos in entry['pos'] for entry in entries):
            newdataset.append(idiom)
    return newdataset

# Subsets of `data` for the two parts of speech that process_text looks up.
verb_dataset= reduce_dataset(data,'verb')
noun_dataset= reduce_dataset(data,'noun')
# Subsets for other parts of speech are currently disabled:
# propernoun_dataset= reduce_dataset(data,'proper noun')
# pronoun_dataset= reduce_dataset(data,'pronoun')
# adverb_dataset= reduce_dataset(data,'adverb')

In [None]:
# We try to fetch phrases from a complete sentence given in input

def fetch_phrases(sentence):
    """Extract candidate noun and verb phrases from a sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline. Noun
    phrases are the texts of ``doc.noun_chunks``; verb phrases come from
    ``verb_chunks(doc)``.

    Args:
        sentence: Sentence text to parse.

    Returns:
        A dict with the keys ``'noun'`` and ``'verb'`` (in that order), each
        mapping to a list of phrase strings. Noun phrases are de-duplicated
        and listed in first-seen order, so the result is the same on every run.
    """
    doc = nlp(sentence)
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would make the order vary between interpreter sessions.
    noun_phrases = list(dict.fromkeys(chunk.text for chunk in doc.noun_chunks))
    return {'noun': noun_phrases, 'verb': verb_chunks(doc)}

def verb_chunks(doc):
    """Collect verb-phrase candidates from a parsed document.

    For every token whose ``pos_`` is ``'AUX'`` or ``'VERB'``, the token's
    subtree is taken from the token itself onward and up to two phrases are
    produced:

    * a short phrase that stops at (and includes) the first node whose
      ``dep_`` is ``'dobj'``, ``'attr'`` or ``'prt'`` — and additionally
      ``'aux'`` when the token's own ``dep_`` is ``'ROOT'``;
    * the full remainder of the subtree, only when the token's ``dep_`` is
      ``'ROOT'`` or ``'advcl'``.

    Args:
        doc: Iterable of tokens exposing ``text``, ``pos_``, ``dep_`` and
            ``subtree``.

    Returns:
        A list of unique phrase strings in first-seen order (an empty list
        when there are none), so the result is the same on every run.
    """
    root_stops = ('dobj', 'attr', 'aux', 'prt')
    nested_stops = ('dobj', 'attr', 'prt')
    # A dict is used as an insertion-ordered set: a real set would make the
    # output order vary between interpreter sessions.
    chunks = {}
    for token in doc:
        if token.pos_ not in ('AUX', 'VERB'):
            continue
        subtree = list(token.subtree)
        # Drop the part of the subtree that precedes the verb itself.
        tail = subtree[subtree.index(token):]
        stops = root_stops if token.dep_ == 'ROOT' else nested_stops
        arcwords = []
        for child in tail:
            arcwords.append(child.text)
            if child.dep_ in stops:
                break
        if token.dep_ in ('ROOT', 'advcl'):
            chunks[' '.join(node.text for node in tail)] = None
        chunks[' '.join(arcwords)] = None
    return list(chunks)


# Testing & Evaluating

In [None]:
def process_text(text):
    """Detect idioms in ``text``.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    For each sentence, the phrases from ``fetch_phrases`` are matched against
    the idiom subset for their part of speech via ``tag_idioms``. Only noun
    and verb phrases have a subset; any other key is matched against an empty
    collection. The extracted phrases are printed for every sentence.

    Args:
        text: Input text, possibly containing several sentences.

    Returns:
        A set of matched idiom strings, or an empty list when nothing matched.
    """
    matches = []
    for sent in nlp(text).sents:
        phrases = fetch_phrases(sent.text)
        print(f'My phrases : \n {phrases}')
        for pos, candidates in phrases.items():
            # Pick the idiom subset that belongs to this part of speech.
            lookup = noun_dataset if pos == 'noun' else verb_dataset if pos == 'verb' else {}
            for phrase in candidates:
                hit = tag_idioms(phrase, pos, lookup)
                if hit:
                    matches.append(hit)
    unique = set(matches)
    return unique if unique else []


# Quick check with a near-miss wording ("green" where the idiom has "red").
text="John was painting the town green"
process_text(text)

My phrases : 
 {'noun': ['John'], 'verb': ['painting the town green', 'was']}


{'paint the town red'}

**Calculating Accuracy**

In [None]:
# Evaluation cases that may or may not match our data set.
# Each item is a single-entry dict mapping an input sentence to the list of
# idioms expected for it (an empty list means no idiom is expected).
test_input = [
    {"when i came home, he let me have it for wrecking the car": ["let someone have it"]},
    {"She walked to the store to buy some groceries.": []},
    {"John and Mary painted the town red.": ["paint the town red"]},
    {"The quick brown fox jumps over the lazy dog.": []},
    {"He spilled the beans about the surprise party.": ["spill the beans"]},
    {"The team hit the ground running after the pep talk.": []},
    {"She decided to bite the bullet and face her fears.": ["bite the bullet"]},
    {"The professor hit the nail on the head with his analysis.": ["hit the nail on the head"]},
    {"The new employee quickly learned the ropes of the job":[]},
    {"I was caught in the rain without an umbrella.": []},
    {"The children played in the park until the sun went down.": []},
    {"He found himself in hot water when he forgot about the deadline.": ["in hot water"]},
    {"The singer hit the high notes effortlessly during the performance.": []},
    {"She decided to take the bull by the horns and confront the issue directly.": ["take the bull by the horns"]},
    {"The detective followed the clues to solve the mystery.": []},
    {"The students put their heads together to come up with a creative solution.": []},
    {"The chef added the final touch to the dish before serving it.": ["add the final touch"]},
    {"The marathon runners crossed the finish line with smiles on their faces.": []},
    {"He faced an uphill battle when trying to change the company's policies.": ["uphill battle"]},
    {"The cat and the dog had a stare-down in the backyard.": []},
    {"The construction workers broke ground on the new building project.": ["break ground"]},
    {"After the long journey, they finally reached the light at the end of the tunnel.": []},
    {"She tried to read between the lines of the mysterious letter.": ["read between the lines"]},
    {"The comedian had the audience in stitches with his hilarious jokes.": ["in stitches"]},
    {"The hikers reached the summit and enjoyed the breathtaking view.": []},
    {"He had to play it by ear when the original plan fell through.": ["play it by ear"]},
    {"The magician pulled a rabbit out of the hat to the amazement of the audience.": ["pull a rabbit out of the hat"]},
    {"The project required attention to detail to ensure its success.": ["attention to detail"]},
    {"She was on cloud nine after receiving the promotion at work.": ["on cloud nine"]},
    {"The sun set, casting a golden hue across the tranquil lake.": []},
    {"The mechanic fixed the car, and now it runs like a well-oiled machine.": ["well-oiled machine"]},
    {"The artist threw in the towel after struggling with the difficult painting.": ["throw in the towel"]},
    {"The friends had a heart-to-heart conversation about their feelings.": ["heart-to-heart"]},
    {"The photographer captured the moment with a click of the camera.": []},
    {"The project manager kept everyone on the same page with regular updates.": ["on the same page"]},
    {"The boxer was knocked out, and the referee counted to ten.": ["count to ten"]},
    {"She hit the jackpot when she found the rare collectible at the flea market.": ["hit the jackpot"]},
    {"The entrepreneur decided to think outside the box for innovative solutions.": ["think outside the box"]},
    {"The gardener planted the seeds and hoped for a fruitful harvest.": ["fruitful harvest"]},
    {"The musician was in tune with the rhythm of the song.": ["in tune"]},
    {"The students were in the dark about the surprise quiz until it started.": ["in the dark"]},
    {"The chef added a pinch of salt to enhance the flavor of the dish.": ["pinch of salt"]},
    {"The athlete was on thin ice with the coach after missing practice.": ["on thin ice"]},
    {"The friends built bridges instead of walls to strengthen their relationships.": ["build bridges"]}
]

In [None]:
# Print the expected idioms next to the detector's output for every case.
for case in test_input:
  # Each case holds exactly one sentence -> expected-idioms pair.
  first_pair = list(case.items())[0]
  sentence, idiom = first_pair
  print(f"Expected Output : {idiom}")
  # Run the detector after the first print so its own output stays in between.
  detected = process_text(sentence)
  print(f"Our Output : {detected}\n\n")

Expected Output : ['let someone have it']
My phrases : 
 {'noun': ['i', 'it', 'the car', 'me', 'he'], 'verb': ['let me have it', 'wrecking the car', 'let me have it for wrecking the car', 'have it', 'came home']}
Our Output : {'let them have it', 'have at'}


Expected Output : []
My phrases : 
 {'noun': ['the store', 'She', 'some groceries'], 'verb': ['walked to the store to buy some groceries .', 'buy some groceries', 'walked to the store to']}
Our Output : []


Expected Output : ['paint the town red']
My phrases : 
 {'noun': ['John', 'Mary', 'the town red'], 'verb': ['painted the town red', 'painted the town red .']}
Our Output : {'paint the town red'}


Expected Output : []
My phrases : 
 {'noun': ['the lazy dog', 'The quick brown fox'], 'verb': ['jumps over the lazy dog .']}
Our Output : []


Expected Output : ['spill the beans']
My phrases : 
 {'noun': ['He', 'the surprise party', 'the beans'], 'verb': ['spilled the beans', 'spilled the beans about the surprise party .']}
Our Outp

In [None]:
# @title
def extract_phrases(text):
    """Chunk ``text`` with an NLTK regular-expression grammar and return its phrases.

    The text is tokenized, POS-tagged and parsed with the grammar below. Every
    subtree labelled NP, ADJP, ADVP or VP contributes one phrase (its leaf
    words joined by spaces) and is also printed as ``"<label>: <phrase>"``.

    Args:
        text: Input text to chunk.

    Returns:
        List of phrase strings in the order the subtrees are visited.
    """
    # Chunk grammar: one rule per phrase type, expressed as POS-tag patterns.
    grammar = r"""
        NP: {<DT>?<JJ>*<NN>}
        ADJP: {<JJ>}
        ADVP: {<RB>}
        VP: {<VB.*>+<NP|PP|CLAUSE>*}
    """
    wanted_labels = ('NP', 'ADJP', 'ADVP', 'VP')

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    tree = nltk.RegexpParser(grammar).parse(tagged_tokens)

    phrases = []
    for node in tree.subtrees():
        label = node.label()
        if label not in wanted_labels:
            continue
        phrase = " ".join(word for word, _tag in node.leaves())
        phrases.append(phrase)
        print(f"{label}: {phrase}")
    return phrases

# Example usage: chunk a short sentence and print the extracted phrases.
text_to_process = 'My mom was feeling a bit under the weather'
# Alternative, longer multi-idiom input (currently disabled):
#"Navigating through the labyrinth of life, Jake decided to bite the bullet and spill the beans about his secret project, letting the cat out of the bag. As he took the bull by the horns, he realized that he was walking on eggshells, hoping not to stir the hornet's nest. Despite facing an uphill battle, he knew that every cloud has a silver lining and that revealing the ace up his sleeve was a piece of cake. As he awaited the reaction, he kept his fingers crossed, knowing that the ball was in their court, and it was time to play his cards right."
result = extract_phrases(text_to_process)

print("Extracted Phrases:")
print(result)


In [None]:
# Negative check: run the detector on a sentence written to contain no idiom.
input_text="There is no idiom in this text"
process_text(input_text)

My phrases : 
 {'noun': ['no idiom', 'this text'], 'verb': ['is no idiom in this text', 'is no idiom']}


[]