In [1]:
from collections import Counter
import pandas as pd
from typing import Union


def _series_to_text(entries: pd.Series, binary_per_entry: bool) -> str:
    """Lowercase every entry of a Series and join all entries into one space-separated string."""
    entries = entries.fillna("").astype(str).str.lower()
    if binary_per_entry:
        # Collapse repeated words so each word counts at most once per entry.
        entries = entries.apply(lambda text: " ".join(set(text.split())))
    return entries.str.cat(sep=" ")


def calculate_frequency_ratios(
        population: Union[str, pd.Series],
        sample: Union[str, pd.Series],
        q: Union[float, None] = None,
        binary_per_entry: bool = False) -> pd.Series:
    """
    Returns how much over/underrepresented a word is in a sample, relative to the population.

    Despite the name, the score is a *difference* of relative frequencies
    (share of the word in the sample minus share of the word in the population),
    not a ratio.

    important: only considers words in the population that also occur in the sample!
    (population shares are computed after discarding all other population words)

    outcome per word:
    inf -> word occurs in sample but not in population
    0   -> word occurs as often in the sample as in the population
    0+  -> word occurs more often in the sample than in the population
    0-  -> word occurs less often in the sample than in the population

    Parameters
    ----------
    population, sample:
        Either a whitespace-separated string of words (used as given) or a
        Series of texts. Series entries are lowercased, missing values are
        treated as empty text, and all entries are joined into one string.
    q:
        If set, only the tails are returned: words whose score is at or below
        the q-quantile or at or above the (1 - q)-quantile. Words missing from
        the population rank above every finite score when the quantiles are
        computed, so they fall into the upper tail.
    binary_per_entry:
        Only affects Series input: each word is counted at most once per entry.

    Returns
    -------
    pd.Series
        Score per sample word (index = word), sorted in descending order.
    """
    if isinstance(sample, pd.Series):
        sample = _series_to_text(sample, binary_per_entry)
    if isinstance(population, pd.Series):
        population = _series_to_text(population, binary_per_entry)

    sample_counts = Counter(sample.split())
    total_sample = sum(sample_counts.values())

    # Restrict the population to the sample vocabulary (dict lookup, O(1) per word).
    population_counts = Counter(w for w in population.split() if w in sample_counts)
    total_population = sum(population_counts.values())

    # difference of relative frequencies
    differences = {}
    for word, sample_freq in sample_counts.items():
        population_freq = population_counts.get(word, 0)
        if population_freq == 0:
            # Word is unknown to the population; checking the raw count first also
            # avoids dividing by zero when total_population is 0.
            differences[word] = float("nan")
        else:
            population_relative = population_freq / total_population
            sample_relative = sample_freq / total_sample
            differences[word] = sample_relative - population_relative
    # Explicit dtype keeps the Series numeric even when it is empty or all-NaN.
    frequency_ratios = pd.Series(differences, dtype=float)

    # only keep over or underrepresented words if q is set
    if q is not None:
        # Rank words that are missing from the population above every finite score.
        finite_max = frequency_ratios.max()
        placeholder = finite_max + 1 if pd.notna(finite_max) else 0.0
        ranked = frequency_ratios.fillna(placeholder)
        lower_bound = ranked.quantile(q)
        upper_bound = ranked.quantile(1 - q)
        in_tails = (ranked <= lower_bound) | (ranked >= upper_bound)
        # Filter the original scores so the missing words are still NaN here.
        frequency_ratios = frequency_ratios[in_tails]
    return frequency_ratios.fillna(float("inf")).sort_values(ascending=False)

In [2]:
import pandas as pd
import random


# Demo vocabulary: single-token food words (no spaces), so each entry counts as
# exactly one word when texts are split on whitespace.
words_list = [
    "apple", "banana", "orange", "grape", "peach", "mango", "berry", "kiwi", "melon", "plum", "cherry", "pear", 
    "lemon", "lime", "fig", "apricot", "nectarine", "pomegranate", "papaya", "passionfruit", "date", "guava", 
    "coconut", "pineapple", "persimmon", "starfruit", "rambutan", "lychee", "durian", "jackfruit", "tomato", 
    "cucumber", "bellpepper", "carrot", "lettuce", "spinach", "kale", "broccoli", "cauliflower", "zucchini", 
    "pumpkin", "squash", "eggplant", "radish", "beet", "turnip", "sweetpotato", "yam", "ginger", "garlic", 
    "onion", "shallot", "leek", "chive", "cilantro", "parsley", "basil", "mint", "rosemary", "thyme", 
    "oregano", "dill", "sage", "tarragon", "bayleaf", "lavender", "saffron", "vanilla", "cinnamon", "clove", 
    "nutmeg", "allspice", "peppercorn", "mustardseed", "fennel", "coriander", "cumin", "turmeric", "paprika", 
    "chili", "anise", "cardamom", "mace", "juniper", "sesame", "flaxseed", "poppyseed", "pumpkinseed", 
    "sunflowerseed", "almond", "peanut", "walnut", "pecan", "hazelnut", "cashew", "macadamia", "pistachio", 
    "brazilnut", "chestnut", "acorn", "quinoa", "rice", "barley", "oat", "wheat", "rye", "corn", "millet", 
    "buckwheat", "spelt", "amaranth", "sorghum", "teff", "polenta", "couscous", "bulgur", "farro", "kamut", 
    "triticale", "semolina", "feta", "mozzarella", "cheddar", "parmesan", "gouda", "brie", "camembert", 
    "ricotta", "mascarpone", "creamcheese", "bluecheese", "roquefort", "stilton", "goatcheese", "havarti", 
    "munster", "provolone", "swiss", "gruyere", "comte", "pecorino", "romano", "asiago", "manchego", "emmental", 
    "halloumi", "paneer", "tofu", "tempeh", "seitan", "edamame", "lentil", "chickpea", "kidneybean", 
    "blackbean", "pinto", "navybean", "mungbean", "soybean", "fava", "lima", "butterbean", "splitpea", 
    "greenbean", "snappea", "snowpea", "runnerbean", "adzuki", "peanutbutter", "almondbutter", "tahini", 
    "honey", "maplesyrup", "molasses", "jelly", "jam", "preserves", "marmalade", "butter", "margarine"
]

# Seeded local generator: the demo draws the same words on every
# Restart & Run All without touching the global `random` state.
rng = random.Random(42)

# Population: 200 draws from the full vocabulary.
# Sample: 10 draws restricted to the first 10 words of the vocabulary.
population = pd.Series(rng.choices(words_list, k=200))
sample = pd.Series(rng.choices(words_list[:10], k=10))
result = calculate_frequency_ratios(population=population, sample=sample)
result

grape          inf
kiwi      0.033333
berry     0.033333
peach     0.033333
apple    -0.066667
banana   -0.066667
melon    -0.066667
dtype: float64