<a href="https://colab.research.google.com/github/MK316/Engpro-Class/blob/main/data/tapping_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import files

# 1. Ask the user for a CSV file; the first key of the returned mapping
#    is used as the name of the file to read
uploaded = files.upload()
filename = list(uploaded)[0]

# 2. Parse the uploaded file as UTF-8 text
df = pd.read_csv(filename, encoding='utf-8')

# 3. Keep only the rows whose WORD contains 't' or 'd' (either case);
#    rows with a missing WORD count as non-matching (na=False)
contains_t_or_d = df['WORD'].str.contains(r'[td]', case=False, na=False)
filtered_df = df.loc[contains_t_or_d].copy()

# 4. Add Target and Position columns
def detect_target(word):
    """Return the target label for *word*: 'T', 'D', or '' if neither occurs.

    The check is case-insensitive. 't' is tested before 'd', so a word that
    contains both letters is labelled 'T'.
    """
    lowered = word.lower()
    for letter, label in (('t', 'T'), ('d', 'D')):
        if letter in lowered:
            return label
    return ''

def detect_position(word):
    """Classify where 't'/'d' sits in *word*: 'Initial', 'Final' or 'Medial'.

    The check is case-insensitive. The first letter is examined before the
    last one, so a word that both starts and ends with 't'/'d' is 'Initial'.
    Anything else (including a word with no 't'/'d' at either edge) is
    reported as 'Medial'.
    """
    lowered = word.lower()
    targets = ('t', 'd')
    # Slicing (rather than indexing) keeps the empty string safe.
    if lowered[:1] in targets:
        return 'Initial'
    if lowered[-1:] in targets:
        return 'Final'
    return 'Medial'

# Derive both annotation columns from WORD; Target is added before Position
# so the output column order is unchanged.
words = filtered_df['WORD']
for column, tagger in (('Target', detect_target), ('Position', detect_position)):
    filtered_df[column] = words.apply(tagger)

# 5. Write the result as utf-8-sig (UTF-8 with a BOM) so that IPA symbols
#    are preserved when the file is opened in Excel
output_filename = "tapping.csv"
filtered_df.to_csv(output_filename, encoding='utf-8-sig', index=False)

# 6. Hand the generated file back to the user
files.download(output_filename)


# t/d between two vowels

In [None]:
import pandas as pd
from google.colab import files

# 1. Upload CSV file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# 2. Load using UTF-8
df = pd.read_csv(filename, encoding='utf-8')

# 3. Filter words: (t or d) between vowels, but exclude 'tion'
# The whole pattern is matched case-insensitively (case=False). Previously
# only the vowels accepted upper case, so an all-caps entry such as "WATER"
# was silently dropped even though the code below lowercases every word.
vowel_td_vowel = df['WORD'].str.contains(r'[aeiou][td][aeiou]', case=False, na=False)
# Any word containing 'tion' (in any case) is excluded entirely.
not_tion = ~df['WORD'].str.contains(r'tion', case=False, na=False)
filtered_df = df[vowel_td_vowel & not_tion].copy()

# 4. Create expanded rows based on whether word contains t, d, or both
expanded_rows = []

def detect_position(word, letter):
    """Report where *letter* sits in *word*: 'Initial', 'Final' or 'Medial'.

    *word* is lowercased before the comparison; *letter* is used as given.
    The start of the word is checked before the end, and every other case
    (including *letter* not being at either edge) is reported as 'Medial'.
    """
    lowered = word.lower()
    if lowered.startswith(letter):
        return 'Initial'
    if lowered.endswith(letter):
        return 'Final'
    return 'Medial'

# Emit one output row per target letter present in the word. 't' is handled
# before 'd', so a word containing both contributes its T row first and its
# D row second.
for _, row in filtered_df.iterrows():
    word = row['WORD'].lower()
    for letter, label in (('t', 'T'), ('d', 'D')):
        if letter not in word:
            continue
        # Work on a copy so each appended row carries its own annotations.
        tagged = row.copy()
        tagged['Target'] = label
        tagged['Position'] = detect_position(word, letter)
        expanded_rows.append(tagged)

# 5. Assemble the collected rows into a single table
final_df = pd.DataFrame(expanded_rows)

# 6. Write as utf-8-sig (UTF-8 with a BOM) to preserve IPA
output_filename = 'tapping_expanded.csv'
final_df.to_csv(output_filename, encoding='utf-8-sig', index=False)

# 7. Hand the generated file back to the user
files.download(output_filename)


In [None]:
import pandas as pd
from google.colab import files
import re

# 1. Upload CSV file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# 2. Load using UTF-8
df = pd.read_csv(filename, encoding='utf-8')

# 3. Filter words: (t or d) between vowels, but exclude 'tion'
# The whole pattern is matched case-insensitively (case=False). Previously
# only the vowels accepted upper case, so an all-caps entry such as "WATER"
# was silently dropped even though the code below lowercases every word.
vowel_td_vowel = df['WORD'].str.contains(r'[aeiou][td][aeiou]', case=False, na=False)
# Any word containing 'tion' (in any case) is excluded entirely.
not_tion = ~df['WORD'].str.contains(r'tion', case=False, na=False)
filtered_df = df[vowel_td_vowel & not_tion].copy()

# 4. Expand rows if word contains both 't' and 'd'
expanded_rows = []

def detect_position(word, letter):
    """Return 'Initial', 'Final' or 'Medial' for *letter* within *word*.

    Only *word* is lowercased; *letter* is compared as passed in. A match at
    the start wins over a match at the end, and 'Medial' is the fallback for
    everything else.
    """
    normalized = word.lower()
    if normalized.startswith(letter):
        return 'Initial'
    return 'Final' if normalized.endswith(letter) else 'Medial'

def is_tapping_context(word, letter):
    """Return True if *letter* occurs between two vowels anywhere in *word*.

    The match is case-insensitive on both arguments: *word* and *letter* are
    lowercased before matching. Vowels are the letters a, e, i, o, u.

    Args:
        word: The spelling to inspect.
        letter: The target letter, e.g. 't' or 'd'.

    Returns:
        bool: True when a vowel + *letter* + vowel sequence is found.
    """
    # Lowercase the letter too (the word is lowercased, so 'T' would never
    # match otherwise) and escape it so a regex metacharacter cannot alter
    # the character class.
    target = re.escape(letter.lower())
    pattern = rf'[aeiou][{target}][aeiou]'
    return bool(re.search(pattern, word.lower()))

# Emit one output row per target letter found in the word. 't' is always
# visited before 'd', so a word with both yields its T row, then its D row.
for _, row in filtered_df.iterrows():
    word = row['WORD'].lower()
    letters_present = [candidate for candidate in ('t', 'd') if candidate in word]
    for letter in letters_present:
        # Work on a copy so each appended row carries its own annotations.
        tagged = row.copy()
        tagged['Target'] = letter.upper()
        tagged['Position'] = detect_position(word, letter)
        # Flag whether this particular letter sits between two vowels.
        tagged['Tapping'] = "YES" if is_tapping_context(word, letter) else "NO"
        expanded_rows.append(tagged)

# 5. Assemble the collected rows and write them as utf-8-sig (UTF-8 with a BOM)
final_df = pd.DataFrame(expanded_rows)
output_filename = 'tapping_with_flag.csv'
final_df.to_csv(output_filename, encoding='utf-8-sig', index=False)

# 6. Hand the generated file back to the user
files.download(output_filename)
