In [1]:
import random
from collections import Counter




In [36]:
import random
import string

def generate_corpus(max_frequency=100, corpus_size=1000):
    """
    Generate a corpus of words with varying frequencies.

    Parameters:
    - max_frequency: The maximum frequency for any word (default is 100).
    - corpus_size: The size of the generated corpus (default is 1000).

    Returns:
    A list representing the generated corpus.
    """
    # Generate a vocabulary of unique words (up to 100)
    vocabulary_size = min(100, corpus_size)
    vocabulary = [''.join(random.choice(string.ascii_lowercase) for _ in range(5)) for _ in range(vocabulary_size)]

    # Generate frequencies for each word
    frequencies = {word: random.randint(1, max_frequency) for word in vocabulary}

    # Create the corpus
    corpus = []
    for word, frequency in frequencies.items():
        corpus.extend([word] * frequency)

    # Shuffle the corpus to randomize word order
    random.shuffle(corpus)

    # Trim the corpus to the specified size
    corpus = corpus[:corpus_size]

    return corpus

# Example usage
generated_corpus = generate_corpus(max_frequency=1000, corpus_size=1000)
Counter(generated_corpus)  # Count the occurrences of each word in the generated corpus

Counter({'xcftb': 28,
         'osoym': 25,
         'ubscz': 21,
         'lfrel': 20,
         'egfvo': 19,
         'vkczm': 19,
         'fhaul': 19,
         'umzgi': 18,
         'yjzis': 18,
         'imlvx': 18,
         'gqyot': 18,
         'gjmbm': 18,
         'akovc': 18,
         'kuabe': 18,
         'bspox': 18,
         'fynxy': 17,
         'ynepk': 17,
         'jbvqy': 16,
         'ntiks': 16,
         'uaume': 16,
         'ntaou': 16,
         'tkadm': 15,
         'yciee': 15,
         'jqlbn': 15,
         'exhii': 15,
         'ddpcb': 15,
         'eompb': 15,
         'ozyhd': 14,
         'jqfnq': 14,
         'qaooj': 14,
         'nliut': 14,
         'ruksa': 14,
         'aubdh': 13,
         'cpplt': 13,
         'sctfw': 13,
         'cpytr': 12,
         'pctsf': 12,
         'rolmd': 12,
         'csbcw': 12,
         'uirdc': 11,
         'yytls': 11,
         'srgzk': 11,
         'ppejm': 11,
         'hnzvl': 11,
         'tstku': 11,
         '

In [37]:
from collections import Counter

def equalise_word_count(words, augmentation_factor=20):
    # Count the occurrences of each word
    word_counts = Counter(words)
    new_list = words.copy()
    max_word = max(word_counts.values())

    # Augment the corpus by adding additional copies of each word
    for word, count in word_counts.items():
        new_list.extend([word] * (augmentation_factor * (max_word - count)))

    # Shuffle the corpus to randomize word order
    random.shuffle(new_list)

    return new_list




In [38]:

output_words = equalise_word_count(generated_corpus)

Counter(output_words)

Counter({'yzoep': 541,
         'zfhqd': 541,
         'jwlwc': 541,
         'oefqx': 541,
         'gcxwz': 541,
         'xpvue': 541,
         'ytjck': 541,
         'ktjph': 541,
         'fvzwr': 541,
         'zggab': 541,
         'jvtiy': 541,
         'dcxpc': 522,
         'cpnbm': 522,
         'mprpe': 522,
         'aattx': 522,
         'rmwzs': 503,
         'ugpxh': 503,
         'vbkbp': 484,
         'kixss': 484,
         'okyuo': 484,
         'ravbz': 465,
         'czqha': 465,
         'qrnsi': 465,
         'zuxiv': 465,
         'zvlxl': 465,
         'pvdli': 446,
         'uomhl': 446,
         'jplef': 446,
         'gdcsh': 446,
         'btvan': 446,
         'wloyj': 446,
         'yuzuk': 446,
         'afewg': 427,
         'ofehc': 427,
         'elnnt': 427,
         'vuqwg': 427,
         'atibm': 427,
         'jigwg': 427,
         'khrso': 408,
         'uitai': 408,
         'nqgff': 408,
         'nsbsd': 408,
         'ahksm': 408,
         'j

In [91]:
from math import floor


def weighted_corpus_augmentation(corpus, augmentation_factor):
    # Count the occurrences of each word in the original corpus
    word_counts = Counter(corpus)
    augmented_corpus = []

    # Calculate weights for each word based on its frequency
    total_words = len(corpus)
    weights = {word: count / total_words for word, count in word_counts.items()}
    augmentation_size = int(augmentation_factor * total_words)
    print(augmentation_size)
    max_weight = max(weights.values())
    max_scaled = max_weight * augmentation_size
    max_inverse = int(floor(1/max_scaled))

    for word, weight in weights.items():
        if len(augmented_corpus) < augmentation_size:
            scaled = weight * augmentation_size
            inverse = int(floor(1/scaled))
            print(inverse)
            if inverse > 1 and len(augmented_corpus) + inverse <= augmentation_size:
                augmented_corpus.extend([word] * inverse)
        else:
            break

    augmented_corpus = corpus + augmented_corpus
    return augmented_corpus

In [95]:
augmented_corpus = weighted_corpus_augmentation(generated_corpus, augmentation_factor=0.1)
print(len(augmented_corpus))

In [95]:
Counter(generated_corpus)

In [94]:
Counter(augmented_corpus)

Counter({'xcftb': 28,
         'osoym': 25,
         'ubscz': 21,
         'lfrel': 20,
         'egfvo': 19,
         'vkczm': 19,
         'fhaul': 19,
         'umzgi': 18,
         'yjzis': 18,
         'imlvx': 18,
         'gqyot': 18,
         'gjmbm': 18,
         'akovc': 18,
         'kuabe': 18,
         'bspox': 18,
         'fynxy': 17,
         'ynepk': 17,
         'jbvqy': 16,
         'ntiks': 16,
         'uaume': 16,
         'ntaou': 16,
         'tkadm': 15,
         'yciee': 15,
         'jqlbn': 15,
         'exhii': 15,
         'ddpcb': 15,
         'eompb': 15,
         'ozyhd': 14,
         'jqfnq': 14,
         'qaooj': 14,
         'nliut': 14,
         'ruksa': 14,
         'aubdh': 13,
         'cpplt': 13,
         'sctfw': 13,
         'cpytr': 12,
         'pctsf': 12,
         'rolmd': 12,
         'csbcw': 12,
         'uirdc': 11,
         'yytls': 11,
         'srgzk': 11,
         'ppejm': 11,
         'hnzvl': 11,
         'tstku': 11,
         '

In [35]:
Counter(augmented_corpus)

Counter({'ihqqe': 12776351,
         'enjdu': 10373332,
         'pknef': 8609176,
         'gndzl': 8609114,
         'ftnph': 7419216,
         'tiyle': 7418967,
         'uvxab': 6526057,
         'olznx': 4821156,
         'rjvsn': 4442220,
         'sxyaq': 4121867,
         'uiqhr': 4121620,
         'vagzl': 3054421,
         'poozk': 3053822,
         'uomnu': 3053770,
         'rgifa': 2359940,
         'oeuwr': 2359236,
         'pjosw': 2276208,
         'fzuof': 2199281,
         'pkuxr': 2198556,
         'rqstp': 2129017,
         'ywpdg': 2000108,
         'lfypo': 1942482,
         'ulota': 1665608,
         'ulxyp': 1592441,
         'sxwsb': 1557522,
         'fyoik': 1412807,
         'nfbbv': 1362739,
         'mbthe': 1317288,
         'vibyb': 1237895,
         'fzgjd': 1184924,
         'zhnrv': 1138223,
         'jahyy': 1083133,
         'czinc': 970394,
         'mgdnk': 942627,
         'aegpb': 925258,
         'hrbfm': 916840,
         'eefxd': 909159,
    

In [111]:
import random

def augment_list(original_list, augmentation_percentage):
    # Count the frequency of each word in the original list
    word_counts = {}
    for word in original_list:
        word_counts[word] = word_counts.get(word, 0) + 1

    # Identify less frequent words
    unique_words = list(set(original_list))
    less_frequent_words = sorted(unique_words, key=lambda x: word_counts[x])

    # Determine the number of words to add for augmentation
    num_words_to_add = int(len(original_list) * augmentation_percentage)

    # Augment the list with less frequent words
    augmented_list = original_list.copy()
    for _ in range(num_words_to_add):
        # Choose a word with higher probability for less frequent words
        chosen_word = random.choices(less_frequent_words)[0]
        augmented_list.append(chosen_word)

    return augmented_list

# Example usage
original_list = ["apple", "banana", "orange", "apple", "grape", "banana", "apple"]
augmented_list = augment_list(generated_corpus, 0.5)




In [112]:
len(augmented_list)

1500

In [113]:
Counter(generated_corpus)

Counter({'xcftb': 28,
         'osoym': 25,
         'ubscz': 21,
         'lfrel': 20,
         'egfvo': 19,
         'vkczm': 19,
         'fhaul': 19,
         'umzgi': 18,
         'yjzis': 18,
         'imlvx': 18,
         'gqyot': 18,
         'gjmbm': 18,
         'akovc': 18,
         'kuabe': 18,
         'bspox': 18,
         'fynxy': 17,
         'ynepk': 17,
         'jbvqy': 16,
         'ntiks': 16,
         'uaume': 16,
         'ntaou': 16,
         'tkadm': 15,
         'yciee': 15,
         'jqlbn': 15,
         'exhii': 15,
         'ddpcb': 15,
         'eompb': 15,
         'ozyhd': 14,
         'jqfnq': 14,
         'qaooj': 14,
         'nliut': 14,
         'ruksa': 14,
         'aubdh': 13,
         'cpplt': 13,
         'sctfw': 13,
         'cpytr': 12,
         'pctsf': 12,
         'rolmd': 12,
         'csbcw': 12,
         'uirdc': 11,
         'yytls': 11,
         'srgzk': 11,
         'ppejm': 11,
         'hnzvl': 11,
         'tstku': 11,
         '

In [114]:
Counter(augmented_list)

Counter({'xcftb': 35,
         'osoym': 31,
         'egfvo': 28,
         'vkczm': 28,
         'imlvx': 27,
         'lfrel': 27,
         'bspox': 27,
         'ubscz': 26,
         'eompb': 25,
         'fynxy': 24,
         'fhaul': 23,
         'umzgi': 22,
         'yjzis': 22,
         'gqyot': 22,
         'ozyhd': 22,
         'irskg': 22,
         'gjmbm': 22,
         'jbvqy': 21,
         'akovc': 21,
         'yytls': 20,
         'tkadm': 20,
         'ntaou': 20,
         'yciee': 20,
         'kuabe': 20,
         'exhii': 20,
         'puvbr': 20,
         'jqfnq': 19,
         'pctsf': 19,
         'jqlbn': 19,
         'ruksa': 19,
         'ynepk': 19,
         'ddpcb': 19,
         'csbcw': 19,
         'aubdh': 18,
         'uaume': 18,
         'nliut': 18,
         'tstku': 18,
         'kbgfo': 18,
         'ntiks': 17,
         'qaooj': 17,
         'cpytr': 17,
         'hnzvl': 17,
         'lcaja': 16,
         'dzybm': 16,
         'cpplt': 16,
         '