<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/Corpus/OF_CMU_240205.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stressed vowels in the Oxford data (5K)

In [None]:
import pandas as pd

## [1] Read csv file

In [None]:
import pandas as pd

# Load the Oxford 5K data file (UTF-8 text) into a DataFrame
df = pd.read_csv(filepath_or_buffer='OF5K_data_trim.csv', encoding='utf-8')

## [2] Add a Pronunciation column for English words

### Step 1: Install the g2p_en Library

In [None]:
!pip install g2p_en

### Step 2: Create a Function to Get Phonetic Transcription

After installing the library, you can use it to create a function that returns the phonetic transcription for each English word.

+ CMU transcription
+ [Symbols](https://github.com/MK316/Workingpapers/blob/main/Foreignwords/cmusymbols.md)

In [None]:
from g2p_en import G2p

# Build the g2p converter once so every word reuses the same object
g2p = G2p()

def get_phonetic_transcription(word):
    """Transcribe ``word`` with g2p and return the result as a single string.

    The items returned by ``g2p(word)`` are joined with single spaces.
    """
    return ' '.join(g2p(word))


### Step 3: Apply the Function to Your DataFrame
Now, apply this function to the 'WORD' column of your DataFrame and save the results in a new column called 'Pronunciation'.

In [None]:
import pandas as pd

# df must already hold the words in its 'WORD' column; each word's
# transcription is stored in a new 'Pronunciation' column
df['Pronunciation'] = df['WORD'].map(get_phonetic_transcription)


In [None]:
# Preview the first five rows to check the new 'Pronunciation' column
df.head(n=5)

### Step 4: Save Your DataFrame
Finally, save your updated DataFrame with the new 'Pronunciation' column to a CSV file.


In [None]:
# Write the frame to disk without the index column, as UTF-8 with a BOM
df.to_csv(path_or_buf='OF5K_CMU.csv', encoding='utf-8-sig', index=False)


In [None]:
# Show the first five rows of the frame that was just written out
df.head(n=5)

# Primary stress and the vowel

In [None]:
import pandas as pd


# Function to find the primary stressed vowel
def find_primary_stress(pronunciation):
    """Return the first primary-stressed symbol of ``pronunciation``.

    ``pronunciation`` is a whitespace-separated string of symbols. The first
    symbol containing the digit '1' is returned with its last character (the
    stress digit) removed, e.g. 'S OW1 F AH0' -> 'OW'. An empty string is
    returned when no symbol contains '1'.
    """
    # Lazily yield every '1'-marked symbol, minus its trailing character
    stressed = (symbol[:-1] for symbol in pronunciation.split() if '1' in symbol)
    # Only the first hit matters; fall back to '' when there is none
    return next(stressed, '')

# Derive the 'Primary' column: one stressed vowel per 'Pronunciation' entry
df['Primary'] = df['Pronunciation'].map(find_primary_stress)

# Print the frame with the new column
print(df)


In [None]:
# CMU vowel symbol -> IPA. The values are the *stressed* realisations, because
# this table is applied to the 'Primary' column, which holds primary-stressed
# vowels with the stress digit already removed.
cmu_to_ipa = {
    'AA': 'ɑ',  # cot
    'AE': 'æ',  # cat
    'AH': 'ʌ',  # strut, cut; the unstressed form AH0 is the schwa 'ə' (see below)
    'AO': 'ɔ',  # caught, talk
    'AW': 'aʊ',  # cow
    'AY': 'aɪ',  # hide
    'EH': 'ɛ',  # dress
    'ER': 'ɝ',  # nurse (rhotic, stressed); the unstressed form ER0 is 'ɚ' (see below)
    'EY': 'eɪ',  # face
    'IH': 'ɪ',  # kit
    'IY': 'i',  # fleece
    'OW': 'oʊ',  # goat
    'OY': 'ɔɪ',  # choice
    'UH': 'ʊ',  # foot
    'UW': 'u',  # goose
}

# Vowels whose IPA symbol differs when they carry no stress (stress digit '0'):
# - AH0 is the schwa, as in sofa ('S OW1 F AH0') and about ('AH0 B AW1 T').
# - ER0 is the rhotic schwa.
# Stress digits '1' (primary) and '2' (secondary) both count as stressed.
cmu_to_ipa_unstressed = {
    'AH': 'ə',
    'ER': 'ɚ',
}


# Function to map CMU phoneme to IPA symbol
def cmu_to_ipa_mapper(cmu_phoneme):
    """Map a CMU vowel symbol to its IPA symbol.

    Parameters
    ----------
    cmu_phoneme : str
        A CMU vowel, either bare ('AH') or with a trailing stress digit
        ('AH0', 'AH1', 'AH2'). A bare symbol is treated as stressed, which is
        what the 'Primary' column contains.

    Returns
    -------
    str
        The IPA symbol, or an empty string when the input is not a string or
        is not a vowel listed in ``cmu_to_ipa``.
    """
    if not isinstance(cmu_phoneme, str):
        return ''  # e.g. a missing value in the column

    symbol, stress = cmu_phoneme, ''
    if symbol and symbol[-1] in '012':
        # Separate the vowel from its stress digit
        symbol, stress = symbol[:-1], symbol[-1]

    # Only an explicit '0' selects the unstressed variant
    if stress == '0' and symbol in cmu_to_ipa_unstressed:
        return cmu_to_ipa_unstressed[symbol]
    return cmu_to_ipa.get(symbol, '')  # Return an empty string if not found

# Translate each 'Primary' vowel to IPA and keep it in a new 'IPA' column
df['IPA'] = df['Primary'].map(cmu_to_ipa_mapper)

# Print the frame with the new column
print(df)



In [None]:
# Write the frame to disk without the index column, as UTF-8 with a BOM
df.to_csv(path_or_buf='OF5K_primarystress.csv', encoding='utf-8-sig', index=False)


# Syllable boundaries

In [None]:
# Function to extract syllable-marked transcription
def extract_syllables(cmu_pronunciation):
    """Return ``cmu_pronunciation`` with every '. ' rewritten as ' . '.

    A period followed by a space is treated as a syllable delimiter and is
    padded with a space on its left; all other text is returned unchanged.

    NOTE(review): a string without '. ' comes back as-is. Confirm that the
    'Pronunciation' values really contain period delimiters; if they do not,
    this function adds no syllable information.
    """
    return cmu_pronunciation.replace('. ', ' . ')

# Store the syllable-marked form of every pronunciation in a new column
df['Syllable_Marked_Transcription'] = df['Pronunciation'].map(extract_syllables)

# Preview the first five rows of the updated DataFrame
df.head(n=5)

# Data selection

Words containing a specific vowel

In [None]:
def _rows_with_vowel(frame, vowel):
    """Return the rows of ``frame`` whose 'Pronunciation' contains ``vowel``.

    ``\\b`` anchors the symbol at the start of a token, so a pronunciation that
    *begins* with the vowel is matched as well. The earlier ' EY'-style
    patterns required a preceding space and therefore skipped those words.
    Missing pronunciations count as "no match" (``na=False``).
    """
    pattern = r'\b' + vowel
    return frame[frame['Pronunciation'].str.contains(pattern, regex=True, na=False)]


# One subset per diphthong of interest
EY_df = _rows_with_vowel(df, 'EY')
OW_df = _rows_with_vowel(df, 'OW')
AY_df = _rows_with_vowel(df, 'AY')
OY_df = _rows_with_vowel(df, 'OY')
AW_df = _rows_with_vowel(df, 'AW')

# Count rows directly instead of indexing a word column: the notebook reads the
# words from 'WORD' when building 'Pronunciation', so a differently spelled
# column name here would raise KeyError.
for label, subset in [('EY', EY_df), ('OW', OW_df), ('AY', AY_df),
                      ('OY', OY_df), ('AW', AW_df)]:
    print(f'{label}:', len(subset))