In [1]:
import pandas as pd
import numpy as np
import random
import urllib.request
import nltk
# Fetch the NLTK 'words' and 'names' corpora; both are read later in this
# notebook via nltk.corpus.words / nltk.corpus.names.
nltk.download('words', quiet=True)
nltk.download('names', quiet=True)

# Download the CMU Pronouncing Dictionary and keep the whole file in memory as
# one string (`cmudict`). The bytes are decoded as ISO-8859-1.
# NOTE(review): this request has no timeout and needs network access, so a
# fresh "Restart & Run All" depends on the remote host being reachable.
url = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"
with urllib.request.urlopen(url) as f:
    cmudict = f.read().decode("ISO-8859-1")

In [2]:
import pyphen

def split_into_morphemes(word):
    """Split ``word`` into pieces using Pyphen's English dictionary.

    Pyphen inserts ``-`` at each break point it finds; the result is then
    split on ``-``. A hyphen already present in ``word`` therefore also acts
    as a break point.

    Args:
        word: The string to split.

    Returns:
        A list of string pieces, in order. A word with no break point comes
        back as a one-element list.
    """
    hyphenator = pyphen.Pyphen(lang='en')
    hyphenated = hyphenator.inserted(word)
    return hyphenated.split('-')

# Example usage: split a single word and show the resulting pieces.
# The recorded output for this input is ['tele', 'pathy'].
word = "telepathy"
morphemes = split_into_morphemes(word)
print(morphemes)

['tele', 'pathy']


In [3]:
import pyphonetics
import pronouncing


# Define a function that generates homophones for a given word
def _build_pron_code_index(encoder):
    """Group CMU dictionary headwords by the phonetic code of their pronunciation.

    Reads the module-level ``cmudict`` string. Comment lines (``;;;``) and
    lines without both a headword and a pronunciation are skipped.

    Returns:
        dict mapping a phonetic code to the list of headwords whose joined
        phoneme string encodes to that code.
    """
    index = {}
    for line in cmudict.splitlines():
        if line.startswith(";;;"):
            continue
        fields = line.strip().split()
        if len(fields) < 2:
            # Malformed entry: nothing to encode, so ignore it.
            continue
        try:
            code = encoder.phonetics("".join(fields[1:]))
        except IndexError:
            # One unencodable pronunciation must not discard every other match.
            continue
        index.setdefault(code, []).append(fields[0])
    return index


def generate_homophones(word):
    """Collect sound-alike candidates for ``word``.

    Candidates are the union of:
      * CMU dictionary headwords whose pronunciation string has the same
        RefinedSoundex code as ``word`` itself, and
      * ``pronouncing.rhymes(word)``.

    The code-to-headwords index is built on the first call and reused
    afterwards. Rebuilding it for every word means re-encoding the entire
    dictionary each time, which dominates the run time. The index is rebuilt
    if the ``cmudict`` global is rebound to a different string object.

    Args:
        word: The word to look up.

    Returns:
        A list of unique candidate strings in no particular order. If ``word``
        cannot be encoded (the encoder raises ``IndexError`` -- presumably for
        input with no letters, TODO confirm), only the rhymes are returned.
    """
    # RefinedSoundex was picked by the notebook author over the other encoders.
    encoder = pyphonetics.RefinedSoundex()

    # Only the encoding of the input word is allowed to trigger the fallback.
    try:
        word_code = encoder.phonetics(word)
    except IndexError:
        return pronouncing.rhymes(word)

    cached = getattr(generate_homophones, "_index_cache", None)
    if cached is None or cached[0] is not cmudict:
        cached = (cmudict, _build_pron_code_index(encoder))
        generate_homophones._index_cache = cached

    matches = set(cached[1].get(word_code, ()))
    matches.update(pronouncing.rhymes(word))
    return list(matches)

In [4]:
import re

def clean_words(words):
    """Strip non-letter characters from each word and keep the valid ones.

    Some candidate words carry special characters (digits, apostrophes,
    parentheses). Each word is reduced to its ASCII letters, and the *cleaned*
    form is then checked with ``is_english_word_or_name`` so that the token
    being validated is the same token that is returned.

    Args:
        words: Iterable of strings.

    Returns:
        A list of the accepted cleaned words, upper-cased, in input order.
        Words that contain no letters at all are dropped without being checked.
    """
    cleaned = []
    for word in words:
        letters_only = re.sub(r'[^a-zA-Z]', '', word)
        if letters_only and is_english_word_or_name(letters_only):
            cleaned.append(letters_only.upper())
    return cleaned

In [5]:
def put_word(sentence, word, potential_replacement):
    """Replace every whole-token occurrence of ``word`` in ``sentence``.

    A token is a run of characters delimited by whitespace or by the start or
    end of the sentence, so the first and last tokens are replaced as well and
    substrings of longer tokens are left alone.

    Args:
        sentence: The whitespace-separated text to edit.
        word: The token to replace. ``None`` or an empty string means there is
            nothing to replace.
        potential_replacement: The text to insert, used literally. ``None``
            means there is nothing to replace.

    Returns:
        The edited sentence, or ``sentence`` unchanged when there is nothing
        to replace.
    """
    if potential_replacement is None or not word:
        return sentence
    # Lookarounds accept whitespace or a string edge as the token boundary.
    # re.escape keeps regex metacharacters in ``word`` literal.
    token_pattern = r'(?<!\S)' + re.escape(word) + r'(?!\S)'
    # A callable replacement stops backslashes being read as group references.
    return re.sub(token_pattern, lambda _match: potential_replacement, sentence)

In [6]:
import random

def replace_word(word, word_list):
    """Pick a random replacement for ``word`` from ``word_list``.

    Args:
        word: The original word, returned as-is when there are no candidates.
        word_list: Candidate replacements (for example homophones). May be
            empty or ``None``.

    Returns:
        ``word`` if ``word_list`` is empty, otherwise one element chosen at
        random from the sorted candidates.
    """
    if word_list:
        candidates = list(word_list)
        candidates.sort()
        return random.choice(candidates)
    return word

In [7]:
def generate_morpheme_from_sentence(sentence):
    """Split every whitespace-separated word of ``sentence`` into its pieces.

    Args:
        sentence: The text to process.

    Returns:
        A list with one entry per word, each entry being the list returned by
        ``split_into_morphemes`` for that word. An empty or all-whitespace
        sentence gives an empty list.
    """
    return [split_into_morphemes(token) for token in sentence.split()]

In [8]:

def _english_vocabulary():
    """Return ``(words, names)`` sets from the NLTK corpora, built once and reused.

    Building the sets reads both corpora in full, so the result is stored on
    the function object after the first call.
    """
    cached = getattr(_english_vocabulary, "_cached", None)
    if cached is None:
        # Make sure both corpora are present before reading them.
        nltk.download('words', quiet=True)
        nltk.download('names', quiet=True)
        cached = (
            frozenset(nltk.corpus.words.words()),
            frozenset(nltk.corpus.names.words()),
        )
        _english_vocabulary._cached = cached
    return cached


def is_english_word_or_name(word):
    """Tell whether ``word`` is a known English word or a known name.

    Upstream sources sometimes return names and sometimes return strings that
    are not words at all; this check filters the latter out.

    Args:
        word: The string to check.

    Returns:
        ``True`` if ``word.lower()`` is in the NLTK ``words`` corpus or
        ``word.title()`` is in the NLTK ``names`` corpus, otherwise ``False``.
    """
    english_words, english_names = _english_vocabulary()
    return word.lower() in english_words or word.title() in english_names


In [9]:
def flatten_list_of_lists(list_of_lists):
    """Concatenate the inner iterables of ``list_of_lists`` into one flat list.

    Only one level of nesting is removed; the order of the elements is kept.

    Args:
        list_of_lists: An iterable of iterables.

    Returns:
        A new list holding every element of every inner iterable.
    """
    return [item for inner_list in list_of_lists for item in inner_list]

In [10]:
def generate_ngram_dataset(path, start_index=805, output_path='final_moph_dataset.xlsx'):
    """Add a 'sentence 3' column holding a sound-alike variant of a source sentence.

    For each processed row, one of 'sentence 1' / 'sentence 2' is picked at
    random and split into pieces. If the split produced no extra pieces (the
    piece count equals the word count), the other column is tried. Otherwise
    every all-lowercase piece is replaced by a randomly chosen candidate from
    ``generate_homophones`` (filtered by ``clean_words``), and the result is
    stored in 'sentence 3'. When neither column yields extra pieces,
    'sentence 3' keeps the last sentence that was tried, unchanged.

    Args:
        path: Excel file with at least the columns 'Target', 'sentence 1' and
            'sentence 2'.
        start_index: First row to process; earlier rows keep an empty
            'sentence 3'. The default of 805 is the resume point that was
            previously hard-coded in the loop.
        output_path: Excel file the frame is written to after every processed
            row, so an interrupted run still leaves its progress on disk.

    Returns:
        The DataFrame with columns 'Target', 'sentence 1', 'sentence 2' and
        'sentence 3'.
    """
    data = pd.read_excel(path)
    # Copy so that adding a column does not write into a view of the original.
    data = data[['Target', 'sentence 1', 'sentence 2']].copy()
    # Use object dtype: the column starts empty but will hold strings.
    data['sentence 3'] = pd.Series(np.nan, index=data.index, dtype=object)

    # Stop at the last row; range(len(data) + 1) ran one index past it.
    for index in range(max(start_index, 0), len(data)):
        columns = ['sentence 1', 'sentence 2']
        while columns:
            column = random.choice(columns)
            senten = data.loc[index, column]
            data.loc[index, 'sentence 3'] = senten

            if not isinstance(senten, str):
                # Empty or non-text cell: nothing to split, try the other column.
                columns.remove(column)
                continue

            pieces = flatten_list_of_lists(generate_morpheme_from_sentence(senten))
            if len(senten.split()) == len(pieces):
                # No word was split, so this sentence offers nothing to morph.
                columns.remove(column)
                continue

            morphed = ' '.join(pieces)
            for w in pieces:
                # Only all-lowercase pieces are considered for replacement.
                if w.islower():
                    candidates = list(set(clean_words(generate_homophones(w))))
                    morphed = put_word(morphed, w, replace_word(w, candidates))

            data.loc[index, 'sentence 3'] = morphed
            break

        print('Finished with iteration :', index)
        # Checkpoint after every row; the run is slow and may be interrupted.
        data.to_excel(output_path)

    return data

In [11]:
# Build the morphed dataset from the source workbook and preview the first rows.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt, so this
# cell did not run to completion when the notebook was last saved.
dff = generate_ngram_dataset('final-dataset-3005.xlsx')
dff.head()

Finished with iteration : 805
Finished with iteration : 806
Finished with iteration : 807
Finished with iteration : 808
Finished with iteration : 809
Finished with iteration : 810
Finished with iteration : 811
Finished with iteration : 812
Finished with iteration : 813
Finished with iteration : 814
Finished with iteration : 815
Finished with iteration : 816
Finished with iteration : 817
Finished with iteration : 818
Finished with iteration : 819
Finished with iteration : 820
Finished with iteration : 821
Finished with iteration : 822
Finished with iteration : 823
Finished with iteration : 824
Finished with iteration : 825
Finished with iteration : 826
Finished with iteration : 827
Finished with iteration : 828
Finished with iteration : 829
Finished with iteration : 830
Finished with iteration : 831
Finished with iteration : 832
Finished with iteration : 833
Finished with iteration : 834
Finished with iteration : 835
Finished with iteration : 836
Finished with iteration : 837
Finished w

KeyboardInterrupt: 