In [2]:
import nltk
from nltk.corpus import brown
from collections import Counter

def find_common_parts(words, part_length):
    parts = Counter()
    for word in words:
        for i in range(len(word) - part_length + 1):
            part = word[i:i+part_length]
            parts[part] += 1
    return parts.most_common(1)[0]

# Ensure the Brown Corpus is downloaded
nltk.download('brown')

# Get the words from the Brown Corpus
words = brown.words()

# Find hapax words
fdist = nltk.FreqDist(words)
hapax_words = fdist.hapaxes()

part_length = 3  # replace with your desired part length
common_prefix = find_common_parts(hapax_words, part_length)
common_suffix = find_common_parts([word[::-1] for word in hapax_words], part_length)
common_infix = find_common_parts(hapax_words, part_length)

print(f'Most common prefix: {common_prefix}')
print(f'Most common suffix: {common_suffix}')
print(f'Most common infix: {common_infix}')



[nltk_data] Downloading package brown to C:\Users\Kinjalk
[nltk_data]     Parth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


Most common prefix: ('ing', 2113)
Most common suffix: ('gni', 2113)
Most common infix: ('ing', 2113)


In [17]:
# Define a function to calculate the probability
def calculate_probability(word_tag_pairs):
    total_count = len(word_tag_pairs)
    ing_count = 0
    
    for word, tag in word_tag_pairs:
        if word.endswith('ed') and tag == 'VERB':
            ing_count += 1
    
    probability = ing_count / total_count
    return probability

# Read the file
file_path = 'your_file.txt'  # Replace with the actual file path
word_tag_pairs = []

with open('hapax_word_tag_pairs.txt', 'r') as file:
    for line in file:
        try:
            word,tag = line.strip().split(',')
            word_tag_pairs.append((word, tag))
        except:
            pass

# Calculate the probability
probability = calculate_probability(word_tag_pairs)

print(f"Probability that a word ending with 'ing' is tagged as NOUN: {probability:.2%}")


Probability that a word ending with 'ing' is tagged as NOUN: 5.63%


In [46]:
import re

# Define a function to calculate the probability based on a given pattern
def calculate_probability(word_tag_pairs, pattern, tag):
    total_count = len(word_tag_pairs)
    matching_count = 0
    
    for word, word_tag in word_tag_pairs:
        if re.search(pattern, word) and word_tag == tag:
            matching_count += 1
    
    probability = matching_count / total_count
    return probability

# Read the file
file_path = 'your_file.txt'  # Replace with the actual file path
word_tag_pairs = []

with open('hapax_word_tag_pairs.txt', 'r') as file:
    for line in file:
        try:
            word,tag = line.strip().split(',')
            word_tag_pairs.append((word, tag))
        except:
            pass

# Define your pattern and desired tag
your_pattern = r'ation$'  # This pattern matches words ending with 'ing'
your_tag = 'NOUN'

# Calculate the probability based on your pattern and tag
probability = calculate_probability(word_tag_pairs, your_pattern, your_tag)

print(f"Probability that words matching the pattern '{your_pattern}' are tagged as {your_tag}: {probability:.2%}")


Probability that words matching the pattern 'ation$' are tagged as NOUN: 1.00%


In [47]:
from collections import defaultdict

def find_common_affixes(data, affix_length):
    prefixes = defaultdict(list)
    suffixes = defaultdict(list)
    infixes = defaultdict(list)

    for word, tag in data:
        if len(word) > 2 * affix_length:
            prefix = word[:affix_length]
            suffix = word[-affix_length:]
            infix = word[affix_length:-affix_length]

            prefixes[tag].append(prefix)
            suffixes[tag].append(suffix)
            infixes[tag].append(infix)

    common_prefixes = {tag: max(set(prefixes), key=prefixes.count) for tag, prefixes in prefixes.items()}
    common_suffixes = {tag: max(set(suffixes), key=suffixes.count) for tag, suffixes in suffixes.items()}
    common_infixes = {tag: max(set(infixes), key=infixes.count) for tag, infixes in infixes.items()}

    return common_prefixes, common_suffixes, common_infixes

# Your data goes here
data = [('this\'ll', 'DET'), ('das', 'DET'), ('tannenbaum', 'NOUN'), ('$.09', 'NOUN')]
with open('hapax_word_tag_pairs.txt', 'r') as file:
    for line in file:
        try:
            word,tag = line.strip().split(',')
            data.append((word, tag))
        except:
            pass

common_prefixes, common_suffixes, common_infixes = find_common_affixes(data, affix_length=2)

print("Common Prefixes:", common_prefixes)
print("Common Suffixes:", common_suffixes)
print("Common Infixes:", common_infixes)


Common Prefixes: {'DET': 'th', 'NOUN': 'co', 'ADJ': 'un', 'VERB': 're', 'IN': 'co', 'CONJ': 'su', 'ADV': 'un', 'PRON': 'th', 'NUM': '19', 'MODAL': 'sh', 'PART': 'af', 'UH': 'go', 'X': 'wh'}
Common Suffixes: {'DET': 'er', 'NOUN': "'s", 'ADJ': 'ed', 'VERB': 'ed', 'IN': 'er', 'CONJ': 'ng', 'ADV': 'ly', 'PRON': "'s", 'NUM': '/2', 'MODAL': 'da', 'PART': 'er', 'UH': 'it', 'X': 'ed'}
Common Infixes: {'DET': "is'", 'NOUN': 'a', 'ADJ': '-year-o', 'VERB': 'i', 'IN': 'th-but-aft', 'CONJ': 'pposi', 'ADV': 'ere', 'PRON': 'body', 'NUM': '-', 'MODAL': 'ul', 'PART': 't', 'UH': '-da-da-d', 'X': 'i'}
