# Idioms and Metaphors

---
## Import Libraries and Download NLTK Data


In [18]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

nltk.download('punkt')


True


## Define Paths


In [13]:
# Root of the project. Modify this if your project folder is located elsewhere.
project_folder = './'

# Both inputs live directly under the project root: the idioms file and a
# folder that is later globbed for '*.txt' files.
# NOTE(review): a later cell reads from 'Text_Files' rather than 'Files_Text' —
# confirm that both folder names are intended.
idioms_file_path, files_text_folder = (
    os.path.join(project_folder, entry) for entry in ('idioms.txt', 'Files_Text')
)


## Function to Read File


In [14]:
def read_file(file_path):
    """Read a UTF-8 text file and return its content with outer whitespace removed.

    This is a best-effort reader: any exception raised while opening or reading
    the file is reported on stdout and an empty string is returned instead, so
    callers can treat "" as "nothing usable here".

    Args:
        file_path: Path of the file to read.

    Returns:
        The stripped file content, or "" if the file could not be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        return ""
    return raw_text.strip()


## Read and Vectorize Idioms


In [41]:
# Load the whole idioms file as a single string; read_file returns "" when the
# file is missing, unreadable, or blank, which is treated as a hard error here.
idioms_text = read_file(idioms_file_path)
if len(idioms_text) == 0:
    raise ValueError(f"No valid content in {idioms_file_path}.")

# Fit TF-IDF on the idioms file treated as ONE document, tokenized with NLTK.
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words='english',
)
idioms_vector = vectorizer.fit_transform([idioms_text])

# NOTE(review): N is not referenced by any visible cell — confirm it is still needed.
N = 20

# Preview only the first 127 characters of the sparse matrix's text form.
print("Short List of Idioms Vectorized:", str(idioms_vector)[:127], sep="\n")


Short List of Idioms Vectorized:
  (0, 406)	0.007876702166955815
  (0, 137)	0.005251134777970542
  (0, 678)	0.002625567388985271
  (0, 654)	0.007876702166955815


## Tokenize Text Files in `Files_Text` Folder


In [42]:
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# Collect every .txt file in the folder and tokenize the ones that have content.
files_text_files = glob.glob(os.path.join(files_text_folder, '*.txt'))
tokenized_texts = {}
for file_path in files_text_files:
    file_content = read_file(file_path)
    if not file_content:
        # read_file yields "" for blank or unreadable files; nothing to tokenize.
        print(f"Skipped empty file: {file_path}")
        continue
    tokenized_texts[file_path] = tokenize_text(file_content)
    print(f"Tokenized {file_path} with {len(tokenized_texts[file_path])} tokens.")

# Output tokenized results (first 10 tokens per file, for brevity)
for file_path, tokens in tokenized_texts.items():
    print(f"\nFile: {file_path}\nTokens: {tokens[:10]}...")


## Read Idioms and Extract from Text Files in `Text_Files` Folder


In [43]:
# Folder holding the documents that are scanned for idioms.
text_files_folder = os.path.join(project_folder, 'Text_Files')

# One idiom per non-blank line of the idioms file.
idioms_text = read_file(idioms_file_path)
idioms = [line.strip() for line in idioms_text.splitlines() if line.strip()]
if not idioms:
    raise ValueError(f"No valid content in {idioms_file_path}.")

print(f"Extracted {len(idioms)} idioms.")

# Locate the text files.
text_files_paths = glob.glob(os.path.join(text_files_folder, '*.txt'))
print(f"Found {len(text_files_paths)} files in {text_files_folder}.")

# Pair every path with its content BEFORE dropping empty files. Filtering the
# contents first and zipping afterwards would shift the pairing as soon as one
# file is empty, attaching texts to the wrong paths.
non_empty_files = [
    (file_path, text)
    for file_path, text in ((path, read_file(path)) for path in text_files_paths)
    if text
]
text_files_paths = [file_path for file_path, _ in non_empty_files]
file_contents = [text for _, text in non_empty_files]

if not text_files_paths:
    raise ValueError("No valid content in Text_Files.")


Extracted 717 idioms.
Found 83 files in ./Text_Files.


## Check for Exact Matches and Calculate Similarities


In [48]:
%%capture
# Lower-case every idiom once up front instead of once per (file, idiom) pair.
idioms_lowered = [(idiom, idiom.lower()) for idiom in idioms]

# Maps each file path to the list of idioms found in it, or to the placeholder
# string "No exact idiom matches." when none were found.
# Matching is a case-insensitive substring test on the raw file content, so an
# idiom also matches when it occurs inside a longer word or phrase.
results = {}
for file_path, content in zip(text_files_paths, file_contents):
    content_lowered = content.lower()  # loop-invariant for all idioms of this file
    matching_idioms = [
        idiom for idiom, lowered in idioms_lowered if lowered in content_lowered
    ]
    results[file_path] = matching_idioms if matching_idioms else "No exact idiom matches."

# Optional report, kept disabled. It is written as comments rather than a bare
# string literal so it is not evaluated as the cell's last expression.
# # Output exact matches
# print("\nExact Matches:")
# for file_path, matches in results.items():
#     #print(f"\nFile: {file_path}")
#     if isinstance(matches, list):
#         print(f"  Matching Idioms: {matches} in {file_path}")
#     else:
#         print(f"  {matches}")


## Vectorization and Similarity Check


In [52]:
%%capture
# Fit a single TF-IDF vocabulary (unigrams and bigrams) over the idioms followed
# by the file contents, so both groups share the same feature space.
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 2),
)
all_texts = idioms + file_contents
tfidf_matrix = vectorizer.fit_transform(all_texts)

# The first len(idioms) rows belong to the idioms, the remaining rows to the files.
idioms_vectors = tfidf_matrix[:len(idioms)]
text_files_vectors = tfidf_matrix[len(idioms):]

# similarities[i][j] is the cosine similarity between file i and idiom j.
similarities = cosine_similarity(text_files_vectors, idioms_vectors)

# Output similarity results
similarity_threshold = 0.1  # deliberately low cut-off
print("\nSimilarity Results:")
for file_path, scores in zip(text_files_paths, similarities):
    similar_idioms = [
        idiom for idiom, score in zip(idioms, scores) if score >= similarity_threshold
    ]
    if not similar_idioms:
        continue
    print(f"\nFile: {file_path}")
    print(f"  Similar Idioms (threshold {similarity_threshold}): {similar_idioms}")



# From Mixed Nuts to Metaphors: Decoding the Nuts and Bolts of Figurative Speech with SpaCy


In [61]:
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")


def analyze_metaphors(text):
    """Count the tokens of ``text`` whose dependency label equals "metaphor".

    The text is parsed with the module-level ``nlp`` pipeline and every token's
    ``dep_`` attribute is compared against the literal string "metaphor".

    NOTE(review): "metaphor" does not look like a label the en_core_web_sm
    parser produces, in which case this always returns 0 — verify against the
    model's label set (e.g. ``nlp.get_pipe("parser").labels``).

    Args:
        text: The text to analyze.

    Returns:
        The number of tokens labelled "metaphor".
    """
    return sum(1 for token in nlp(text) if token.dep_ == "metaphor")

# Function to read a text file and return its content as a string
def read_text_file(filepath):
    """Return the content of ``filepath``, read in text mode as UTF-8.

    Unlike a best-effort reader, this does not catch anything: errors from
    opening or decoding the file propagate to the caller.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return handle.read()

# Folder containing the documents to analyze
text_folder_path = "Text_Files"

# Only .txt entries are analyzed (the same pattern the glob-based cells use).
# os.listdir returns names in arbitrary order, so sort them to make the
# position of each count in the result list reproducible.
filenames = sorted(
    name for name in os.listdir(text_folder_path) if name.endswith(".txt")
)

# One metaphor count per file, in the order of `filenames`.
list_as_empty_as_my_wallet = []

for filename in filenames:
    # Load this file's own content so each count belongs to the file being
    # iterated, instead of relying on a `text` variable set somewhere else.
    text = read_text_file(os.path.join(text_folder_path, filename))
    metaphor_n = analyze_metaphors(text)
    list_as_empty_as_my_wallet.append(metaphor_n)
print(list_as_empty_as_my_wallet)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
