# Idioms and Metaphors

---
## Import Libraries and Download NLTK Data


<div class="alert alert-block alert-info">
<b>Note:</b>
The spaCy package works only on Python versions below 3.12 (the current releases are 3.13/3.14).




</div>






In [43]:
import os
import glob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

#nltk.download('punkt')


TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'


## Defining Paths


In [35]:
# Folder that holds the project's data; the idioms list lives inside it.
project_folder = "Datum"
# Built from project_folder so the folder name is written only once;
# the resulting value is "Datum/idioms.txt".
idioms_file_path = f"{project_folder}/idioms.txt"
# Folder that is searched for the *.txt files to analyse.
files_text_folder = "Text_Files"


## Reading Files



In [23]:
def read_file(file_path):
    """Return the contents of ``file_path`` with surrounding whitespace removed.

    The file is opened as UTF-8 text. If opening or reading raises any
    exception, the error is printed and an empty string is returned, so
    callers can treat "" as "nothing usable was read".
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        return ""
    return raw_text.strip()


## Vectorizing Idioms


In [24]:
# Load the idioms file as one string; read_file returns "" when the file is
# unreadable or blank, in which case there is nothing to vectorize.
idioms_text = read_file(idioms_file_path)
if not idioms_text:
    raise ValueError(f"No valid content in {idioms_file_path}.")

# The whole idioms file is passed as a single document, so the result is a
# one-row sparse matrix over the vocabulary found in that file.
vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=word_tokenize,
)
idioms_vector = vectorizer.fit_transform([idioms_text])

N = 20  # not referenced anywhere in this cell

print("Short List of Idioms Vectorized:")
# Only the first 127 characters of the matrix's text form are shown.
vector_preview = str(idioms_vector)[:127]
print(vector_preview)


Short List of Idioms Vectorized:
  (0, 406)	0.007876702166955815
  (0, 137)	0.005251134777970542
  (0, 678)	0.002625567388985271
  (0, 654)	0.007876702166955815


## Tokenizing Text Files in `Text_Files` Folder


In [25]:
def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# glob returns matches in arbitrary order; sorting makes the processing and
# printing order reproducible from run to run.
files_text_files = sorted(glob.glob(os.path.join(files_text_folder, '*.txt')))

# Maps each readable, non-empty file path to its list of tokens.
tokenized_texts = {}
for file_path in files_text_files:
    file_content = read_file(file_path)
    if file_content:
        tokenized_texts[file_path] = tokenize_text(file_content)
        print(f"Tokenized {file_path} with {len(tokenized_texts[file_path])} tokens.")
    else:
        # read_file returns "" for both unreadable and blank files.
        print(f"Skipped empty file: {file_path}")

# Output tokenized results
for file_path, tokens in tokenized_texts.items():
    print(f"\nFile: {file_path}")
    print(f"Tokens: {tokens[:10]}...")  # Printing only the first 10 tokens for brevity


Tokenized Text_Files\fomcminutes20140129.txt with 13848 tokens.
Tokenized Text_Files\fomcminutes20140319.txt with 14681 tokens.
Tokenized Text_Files\fomcminutes20140430.txt with 7777 tokens.
Tokenized Text_Files\fomcminutes20140618.txt with 15747 tokens.
Tokenized Text_Files\fomcminutes20140730.txt with 9149 tokens.
Tokenized Text_Files\fomcminutes20140917.txt with 16608 tokens.
Tokenized Text_Files\fomcminutes20141029.txt with 9509 tokens.
Tokenized Text_Files\fomcminutes20141217.txt with 14761 tokens.
Tokenized Text_Files\fomcminutes20150128.txt with 16785 tokens.
Tokenized Text_Files\fomcminutes20150318.txt with 15569 tokens.
Tokenized Text_Files\fomcminutes20150429.txt with 8768 tokens.
Tokenized Text_Files\fomcminutes20150617.txt with 15189 tokens.
Tokenized Text_Files\fomcminutes20150729.txt with 9645 tokens.
Tokenized Text_Files\fomcminutes20150917.txt with 14715 tokens.
Tokenized Text_Files\fomcminutes20151028.txt with 9312 tokens.
Tokenized Text_Files\fomcminutes20151216.txt w

## Reading Idioms and Extracting from Text Files in `Text_Files` Folder


In [36]:
# Define new folder path
text_files_folder = 'Text_Files'

# One idiom per non-blank line of the idioms file.
idioms_text = read_file(idioms_file_path)
idioms = [line.strip() for line in idioms_text.splitlines() if line.strip()]
if not idioms:
    raise ValueError(f"No valid content in {idioms_file_path}.")

print(f"Extracted {len(idioms)} idioms.")

# Locate the text files (sorted, because glob's order is arbitrary)
text_files_paths = sorted(glob.glob(os.path.join(text_files_folder, '*.txt')))
print(f"Found {len(text_files_paths)} files in {text_files_folder}.")

# Read every file once and filter out empty texts. Paths and contents are
# filtered together as pairs: filtering the contents first and then zipping
# them against the unfiltered paths would pair texts with the wrong file
# names as soon as one file is empty or unreadable.
non_empty_files = [
    (path, text)
    for path, text in ((path, read_file(path)) for path in text_files_paths)
    if text
]
text_files_paths = [path for path, _ in non_empty_files]
file_contents = [text for _, text in non_empty_files]

if not text_files_paths:
    raise ValueError("No valid content in Text_Files.")


Extracted 717 idioms.
Found 83 files in Text_Files.


## Check for Exact Matches and Calculate Similarities


In [37]:
%%capture
# Maps each file path to the list of idioms found in it, or to the string
# "No exact idiom matches." when none were found.
results = {}
for file_path, content in zip(text_files_paths, file_contents):
    # Lower-case the document once rather than once per idiom.
    content_lower = content.lower()
    # Case-insensitive substring test; note that an idiom also matches when
    # it appears as part of longer words.
    matching_idioms = [idiom for idiom in idioms if idiom.lower() in content_lower]
    if matching_idioms:
        results[file_path] = matching_idioms
    else:
        results[file_path] = "No exact idiom matches."

# Disabled reporting code, kept for reference (uncomment to print the matches):
#
# print("\nExact Matches:")
# for file_path, matches in results.items():
#     #print(f"\nFile: {file_path}")
#     if isinstance(matches, list):
#         print(f"  Matching Idioms: {matches} in {file_path}")
#     else:
#         print(f"  {matches}")


## Vectorization and Similarity Check


In [39]:

# One shared TF-IDF vocabulary (unigrams and bigrams) is fitted over the
# idioms and the file texts together so both live in the same vector space.
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 2),
)
all_texts = idioms + file_contents
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Rows follow the order of all_texts: idioms first, then the text files.
idiom_count = len(idioms)
idioms_vectors = tfidf_matrix[:idiom_count]
text_files_vectors = tfidf_matrix[idiom_count:]

# similarities[i][j] is the cosine similarity between file i and idiom j.
similarities = cosine_similarity(text_files_vectors, idioms_vectors)

similarity_threshold = 0.1    # Lower values start associating files with completely different words
print("\nSimilarity Results:")
for idx, file_path in enumerate(text_files_paths):
    file_scores = similarities[idx]
    similar_idioms = [
        idiom
        for idiom, score in zip(idioms, file_scores)
        if score >= similarity_threshold
    ]
    # Files with no idiom at or above the threshold are not reported.
    if similar_idioms:
        print(f"\nFile: {file_path}")
        print(f"  Similar Idioms (threshold {similarity_threshold}): {similar_idioms}")



Similarity Results:

File: Text_Files\fomcminutes20220921.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20221214.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230201.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230322.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230503.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230614.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230726.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20230920.txt
  Similar Idioms (threshold 0.1): ['across the board', 'get on board']

File: Text_Files\fomcminutes20231101.txt
  Similar Idioms (threshold 0.1):


# From Mixed Nuts to Metaphors: Decoding the Nuts and Bolts of Figurative Speech with SpaCy


In [40]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to analyze metaphors in a given text
def analyze_metaphors(text):
    """Count the tokens of ``text`` whose dependency label is "metaphor".

    The text is run through the module-level ``nlp`` pipeline and every
    token whose ``dep_`` attribute equals the string "metaphor" is counted.

    NOTE(review): "metaphor" does not look like a label that spaCy's
    dependency parser emits -- confirm against the model's label scheme.
    If it is not, this function returns 0 for every input.
    """
    doc = nlp(text)
    return sum(1 for token in doc if token.dep_ == "metaphor")

def read_text_file(filepath):
    """Read the UTF-8 text file at ``filepath`` and return its content as a string.

    The content is returned unmodified; errors from opening or reading the
    file propagate to the caller.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return handle.read()

# Get the path to the Text_Files folder
text_folder_path = "Text_Files"

# Get the .txt filenames within the folder. os.listdir returns entries in
# arbitrary order and may include non-text entries, so filter and sort.
filenames = sorted(
    name for name in os.listdir(text_folder_path)
    if name.lower().endswith(".txt")
)

# One metaphor count per file, in the same order as `filenames`.
list_as_empty_as_my_wallet = []

# Each file's own text must be loaded before it is analyzed; without this
# read, `text` would be undefined (or left over from an earlier cell).
for filename in filenames:
    text = read_text_file(os.path.join(text_folder_path, filename))
    metaphor_n = analyze_metaphors(text)
    list_as_empty_as_my_wallet.append(metaphor_n)
print(list_as_empty_as_my_wallet)


ModuleNotFoundError: No module named 'spacy'