<a href="https://colab.research.google.com/github/MK316/Myapps/blob/main/data/OF5K_stress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stress positions of Oxford Learners' Vocabulary 5K

+ OF5K adds 2K words to the previous OF3K.
+ CEFR levels are B2 and C1 (2000 words).
+ Use the CMU Pronouncing Dictionary to find the number of syllables and the stress position of each word.
+ [OF5K data set](https://raw.githubusercontent.com/MK316/Spring2024/main/data/OF5K.csv)


In [None]:
!pip install nltk pandas

In [None]:
# pandas must be imported here: this cell uses pd before any other cell imports it,
# so it would otherwise fail with NameError on a fresh kernel.
import pandas as pd

# Load the OF5K word list directly from GitHub and preview the first rows.
url = "https://raw.githubusercontent.com/MK316/Spring2024/main/data/OF5K.csv"
df = pd.read_csv(url, encoding="utf-8")
df.head()

# Using CMU dictionary, get the stress information (position) and add a column to the data

In [None]:
import pandas as pd
import nltk

# Ensure the CMU dictionary is available
nltk.download('cmudict')
from nltk.corpus import cmudict

# Load the OF5K word list from GitHub (the same CSV that is previewed in the earlier cell)
url = "https://raw.githubusercontent.com/MK316/Spring2024/main/data/OF5K.csv"
df = pd.read_csv(url, encoding="utf-8")

def stress_position(word, cmu_dict=None):
    """Classify the position of the primary stress of ``word``.

    The position is measured in syllables (one syllable per vowel phoneme,
    i.e. per phoneme carrying a stress digit), using the first pronunciation
    listed for the word.

    Parameters
    ----------
    word : str
        The word to look up; matching is case-insensitive.
    cmu_dict : dict, optional
        Mapping of lowercase word -> list of pronunciations, each a list of
        ARPAbet phonemes (the shape returned by ``cmudict.dict()``). When
        omitted, the NLTK CMU dictionary is loaded once and reused.

    Returns
    -------
    str
        'Final', 'Penultimate' or 'Antepenultimate' when the primary stress
        is on the last, second-to-last or third-to-last syllable;
        'Initial' when it is on the first syllable of a longer word;
        'Other' for any remaining position;
        'No stress information' when no phoneme carries primary stress;
        'Word not found in CMU dictionary' when the word is missing or is
        not a string (e.g. an empty CSV cell read as NaN).
    """
    if cmu_dict is None:
        # cmudict.dict() rebuilds the whole lexicon on every call, so build it
        # once and keep it on the function object for subsequent lookups.
        cmu_dict = getattr(stress_position, '_cmu_cache', None)
        if cmu_dict is None:
            cmu_dict = stress_position._cmu_cache = cmudict.dict()

    if not isinstance(word, str):
        return 'Word not found in CMU dictionary'

    try:
        phonemes = cmu_dict[word.lower()][0]  # Take the first pronunciation variant if multiple
    except KeyError:
        return 'Word not found in CMU dictionary'

    # Only vowel phonemes end in a stress digit (0, 1 or 2); each one is a syllable nucleus.
    syllable_stress = [ph[-1] for ph in phonemes if ph[-1:].isdigit()]
    if '1' not in syllable_stress:
        return 'No stress information'

    # Index of the primary-stressed syllable, counted from the start and from the end.
    stressed_syllable = syllable_stress.index('1')
    from_end = stressed_syllable - len(syllable_stress)  # -1 == last syllable

    labels_from_end = {-1: 'Final', -2: 'Penultimate', -3: 'Antepenultimate'}
    if from_end in labels_from_end:
        return labels_from_end[from_end]
    if stressed_syllable == 0:
        # Four or more syllables with the stress on the very first one.
        return 'Initial'
    return 'Other'

# Label every entry of the WORD column with its stress position
df['STRESS'] = df['WORD'].apply(stress_position)

# Preview the first rows; as the last expression of the cell it is rendered as a table
df.head()


In [None]:
# Save the table with the STRESS column to disk, without the row index
df.to_csv(
    'OF5K_stressinfo.csv',
    index=False,
    encoding='utf-8',
)

# Add number of syllables to the data

In [None]:
def count_syllables(word, cmu_dict=None):
    """Return the number of syllables of ``word`` according to the CMU dictionary.

    Syllables are counted as the phonemes that end in a stress digit
    (0, 1 or 2) in the first pronunciation listed for the word.

    Parameters
    ----------
    word : str
        The word to look up; matching is case-insensitive.
    cmu_dict : dict, optional
        Mapping of lowercase word -> list of pronunciations, each a list of
        ARPAbet phonemes (the shape returned by ``cmudict.dict()``). When
        omitted, the NLTK CMU dictionary is loaded once and reused.

    Returns
    -------
    int or float
        The syllable count, or NaN when the word is not in the dictionary
        or is not a string (e.g. an empty CSV cell read as NaN).
    """
    if cmu_dict is None:
        # cmudict.dict() rebuilds the whole lexicon on every call, so build it
        # once and keep it on the function object for subsequent lookups.
        cmu_dict = getattr(count_syllables, '_cmu_cache', None)
        if cmu_dict is None:
            cmu_dict = count_syllables._cmu_cache = cmudict.dict()

    if not isinstance(word, str):
        return float('nan')

    try:
        # Taking the first pronunciation variant if there are multiple
        phonemes = cmu_dict[word.lower()][0]
    except KeyError:
        # Returning NaN if word is not found
        return float('nan')

    # Counting syllables as phonemes that carry a stress digit
    return sum(ph.endswith(('0', '1', '2')) for ph in phonemes)


In [None]:
# Spot-check the syllable counter on a single word
count_syllables("certainty")

Upload the file `OF5K_stressinfo.csv` (saved by the cell above) before running the next cell.

In [None]:
# Install required packages (uncomment if needed)
# !pip install nltk pandas

import pandas as pd
import nltk

# Ensure the CMU dictionary is available
nltk.download('cmudict')
from nltk.corpus import cmudict

# Load the stress-annotated word list saved earlier in this notebook.
# A relative path keeps the cell runnable outside Colab; in Colab the default
# working directory is /content, which is also where uploaded files are placed.
df = pd.read_csv("OF5K_stressinfo.csv", encoding="utf-8")

def count_syllables(word, cmu_dict=None):
    """Return the number of syllables of ``word`` according to the CMU dictionary.

    Syllables are counted as the phonemes that end in a stress digit
    (0, 1 or 2) in the first pronunciation listed for the word.

    Parameters
    ----------
    word : str
        The word to look up; matching is case-insensitive.
    cmu_dict : dict, optional
        Mapping of lowercase word -> list of pronunciations, each a list of
        ARPAbet phonemes (the shape returned by ``cmudict.dict()``). When
        omitted, the NLTK CMU dictionary is loaded once and reused.

    Returns
    -------
    int or float
        The syllable count, or NaN when the word is not in the dictionary
        or is not a string (e.g. an empty CSV cell read as NaN).
    """
    if cmu_dict is None:
        # cmudict.dict() rebuilds the whole lexicon on every call, so build it
        # once and keep it on the function object for subsequent lookups.
        cmu_dict = getattr(count_syllables, '_cmu_cache', None)
        if cmu_dict is None:
            cmu_dict = count_syllables._cmu_cache = cmudict.dict()

    if not isinstance(word, str):
        return float('nan')

    try:
        # Taking the first pronunciation variant if there are multiple
        phonemes = cmu_dict[word.lower()][0]
    except KeyError:
        # Returning NaN if word is not found
        return float('nan')

    # Counting syllables as phonemes that carry a stress digit
    return sum(ph.endswith(('0', '1', '2')) for ph in phonemes)

# Add an 'Nsyll' column holding the syllable count of each entry in 'WORD'
# (count_syllables yields NaN for words missing from the CMU dictionary)
df['Nsyll'] = df['WORD'].apply(count_syllables)

# Show the first rows of the updated dataframe
df.head()


In [None]:
# Save the table with the Nsyll column to disk, without the row index
df.to_csv(
    "OF5K_nsyll.csv",
    index=False,
    encoding="utf-8",
)