## Importing Libraries

In [3]:
import pandas as pd
import unicodedata

In [13]:
# Lift every pandas display limit in a single call (option/value pairs):
# all rows, all columns, no line-width cap, and untruncated cell contents.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

## Loading Data

In [3]:
# Read the Cree dataset CSV into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('../data/cleaned/cree_dataset.csv')

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,word,translation,audio
0,acahkosak,stars,acahkosak.mp3
1,achimēwak,They are telling a story about him.,achimēwak.mp3
2,achimoh,Tell a story!,achimoh.mp3
3,achimostamawâw,A story is told to him/her,achimostamawâw.mp3
4,achimostaw,Tell him a story.,achimostaw.mp3


In [5]:
# Drop the audio-filename column; only the word/translation text is used below.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: running it a second time no longer raises KeyError
# because 'audio' has already been removed.
df = df.drop(columns='audio', errors='ignore')
df.head()

Unnamed: 0,word,translation
0,acahkosak,stars
1,achimēwak,They are telling a story about him.
2,achimoh,Tell a story!
3,achimostamawâw,A story is told to him/her
4,achimostaw,Tell him a story.


In [6]:
# Summarize the remaining columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   word         992 non-null    object
 1   translation  992 non-null    object
dtypes: object(2)
memory usage: 15.6+ KB


In [None]:
# Relabel both columns by language in one pass:
#   'word'        -> 'Cree'     (the Cree entry)
#   'translation' -> 'English'  (its English meaning)
df.rename(
    columns={
        'word': 'Cree',
        'translation': 'English',
    },
    inplace=True,
)

In [8]:
# Confirm the columns are now named 'Cree' and 'English'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Cree     992 non-null    object
 1   English  992 non-null    object
dtypes: object(2)
memory usage: 15.6+ KB


In [9]:
# Preview the first rows under the new column names.
df.head()

Unnamed: 0,Cree,English
0,acahkosak,stars
1,achimēwak,They are telling a story about him.
2,achimoh,Tell a story!
3,achimostamawâw,A story is told to him/her
4,achimostaw,Tell him a story.


In [10]:
# A value has leading/trailing whitespace exactly when it differs from its
# stripped form, so count the rows where the two are not equal.

# 'Cree' column
cree_whitespace_count = df['Cree'].ne(df['Cree'].str.strip()).sum()

# 'English' column
english_whitespace_count = df['English'].ne(df['English'].str.strip()).sum()

print(f"Rows with whitespace in 'Cree': {cree_whitespace_count}")
print(f"Rows with whitespace in 'English': {english_whitespace_count}")

Rows with whitespace in 'Cree': 0
Rows with whitespace in 'English': 0


In [11]:
# Check whether any English translation contains an ASCII uppercase letter.
# The mask is built once and reused for both the summary flag and the row
# listing; na=False treats a missing translation as "no match" so the boolean
# indexing below cannot fail on NaN.
has_upper_mask = df['English'].str.contains(r'[A-Z]', na=False)
has_upper_english = has_upper_mask.any()
print(f"Any uppercase letters in English? {has_upper_english}")
print(df[has_upper_mask])


Any uppercase letters in English? True
                        Cree  \
1                  achimēwak   
2                    achimoh   
3             achimostamawâw   
4                 achimostaw   
5               achimostawāw   
6               achimostawēw   
7             achimostawēwak   
8              achimostawihk   
9               achimostawik   
10              achimostawin   
11            achimostawinan   
12                   achimâw   
13                 achiwinam   
14                achiwpayin   
15               achiwīpayiw   
16                      acim   
17                acimosisak   
18                    acosis   
20               acoskēwinis   
22                   ahcanis   
23                 ahcānisak   
24                     ahchi   
26                 ahchipiko   
27                      ahēw   
28                    ahēwak   
29                      ahih   
30                     ahihk   
31                      ahik   
32                      ahin   
3

In [None]:
# Convert all English translations to lowercase for consistency
df['English'] = df['English'].str.lower()
# Re-check for ASCII uppercase letters (should be none after lowercasing).
# The mask is computed once and reused; na=False keeps a missing translation
# from breaking the boolean indexing below.
# NOTE(review): the output recorded under this cell reports True alongside an
# empty DataFrame, which is contradictory -- it looks stale; re-run from a
# fresh kernel to confirm the flag prints False.
still_upper_mask = df['English'].str.contains(r'[A-Z]', na=False)
has_upper_english = still_upper_mask.any()
print(f"Any uppercase letters in English? {has_upper_english}")
# Display rows where English translations still contain uppercase letters (if any)
print(df[still_upper_mask])


Any uppercase letters in English? True
Empty DataFrame
Columns: [Cree, English]
Index: []


In [13]:
# Re-inspect the frame's columns, non-null counts and dtypes before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Cree     992 non-null    object
 1   English  992 non-null    object
dtypes: object(2)
memory usage: 15.6+ KB


In [None]:
# Save the text-only (Cree, English) dataframe to CSV.
# index=False leaves the row index out of the file, so only the two data
# columns are written.
df.to_csv('../data/cleaned/cree_english_text_only.csv', index=False)

## Text Only (Translation)

In [None]:
# Reload the text-only Cree-English CSV, reading it as UTF-8 so the accented
# Cree characters (e.g. ē, â) are decoded correctly.
# Note: this rebinds `df`, replacing the frame built in the previous section.
df = pd.read_csv("../data/cleaned/cree_english_text_only.csv", encoding="utf-8")

In [None]:
def print_rows_with_parentheses(df):
    """
    Print every row whose English translation contains a parenthesis.

    Such entries usually carry an extra explanation or an alternative
    meaning, e.g. "fish (plural)", and may need special handling later.

    Args:
        df (pandas.DataFrame): Frame with an 'English' column of strings.
            Rows whose 'English' value is missing are treated as not
            containing parentheses instead of raising on the boolean mask.

    Returns:
        None. The matching rows and their total count are written to stdout.
    """
    # '[()]' matches either an opening or a closing parenthesis.
    # na=False maps missing values to False so the mask stays purely boolean.
    paren_mask = df['English'].str.contains(r'[()]', regex=True, na=False)
    rows_with_parens = df[paren_mask]

    # Print those rows
    print(rows_with_parens)

    # Print total count of such rows
    print(f"\nTotal rows with parentheses: {len(rows_with_parens)}")

# List the English translations that contain parentheses, plus their total
# count, for the frame loaded above.
print_rows_with_parentheses(df)

                 Cree                    English
219        ayēkipēsim          april (frog moon)
397    iyikopiwipēsim      november (frost moon)
464  kaskatinōwipēsim    october (freezing moon)
602           kinosēw            fish (one fish)
603         kinosēwak              fish (plural)
751     mikisowipēsim      february (eagle moon)
876        niskipēsim         march (goose moon)
901      ohphōwipēsim       august (flying moon)
909  onōcihitowipēsim    september (mating moon)
910   opaskawēhopēsim  june (egg hatching month)
975             pēhok              wait (plural)

Total rows with parentheses: 11


In [None]:
def clean_text(text):
    """
    Normalize and clean a given text string.

    - Applies Unicode Normalization Form KC (NFKC) to standardize characters,
      e.g., converting full-width characters to their standard ASCII
      equivalents and composing base letter + combining mark sequences.
    - Strips leading and trailing whitespace (after normalization, so that
      compatibility spaces mapped to a plain space are trimmed as well).

    Args:
        text (str): The input text string to clean. Non-string values
            (e.g. the NaN pandas uses for empty CSV cells, or None) are
            returned unchanged instead of raising TypeError.

    Returns:
        str: The cleaned and normalized text, or the original value when
        ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; leave them for the caller to handle.
    if not isinstance(text, str):
        return text
    return unicodedata.normalize("NFKC", text).strip()


In [None]:
# Run every value of both text columns through clean_text
# (NFKC normalization followed by trimming of surrounding whitespace).
df['Cree'] = df['Cree'].map(clean_text)
df['English'] = df['English'].map(clean_text)


In [8]:
# Preview the first rows after normalization.
# NOTE(review): the output recorded under this cell shows translations without
# their trailing punctuation ("." / "!"), yet no visible cell removes
# punctuation -- the output may come from an earlier or out-of-order run;
# re-run from a fresh kernel to confirm.
df.head()

Unnamed: 0,Cree,English
0,acahkosak,stars
1,achimēwak,they are telling a story about him
2,achimoh,tell a story
3,achimostamawâw,a story is told to him/her
4,achimostaw,tell him a story


In [None]:
# Save the cleaned and normalized DataFrame to CSV format
# File is saved with UTF-8 encoding to preserve special characters;
# index=False leaves the row index out of the file.
# NOTE(review): the filename spells "NKFC", while the normalization form
# applied by clean_text is "NFKC". Renaming the file would require updating
# every consumer of this path, so it is left unchanged here.
df.to_csv("../data/cleaned/NKFC_normalized_cree_english.csv", index=False, encoding="utf-8")