In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [2]:
class CreeLearningModel:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=(1, 3),
            max_features=1000
        )
        self.mlb = MultiLabelBinarizer()
        self.similarity_model = None
        self.cree_to_english = defaultdict(list)
        self.english_to_cree = defaultdict(list)
        self.cree_embeddings = None
        self.english_embeddings = None


* TF-IDF vectorizer 
    - to convert Cree and English words into numeric vectors
    - Uses character-level n-grams (1 to 3 characters) to capture sub-word patterns, which is helpful for handling spelling variations and morphological structures.
    - Limit to top 1000 character n-grams for efficiency
* MultiLabelBinarizer
    - used to encode multiple possible English translations for a single Cree word into a binary format suitable for multi-label classification.
* similarity_model 
    - similarity model used to find the closest English match to a given Cree word.
* cree_to_english and english_to_cree
    - used to convert between Cree and English words, using the similarity model to find the closest match.
* cree_embeddings and english_embeddings
    - store the numeric TF-IDF vector representations (embeddings) for all Cree and English words, to be used in similarity comparisons.

**Preprocessing Dataset**

In [None]:
def preprocess_data(self, csv_file_path):
        """
        Preprocess the Cree-English dataset
        """
        print("Loading and preprocessing data...")
        
        # Load the CSV file
        df = pd.read_csv(csv_file_path)
        print(f"Original dataset shape: {df.shape}")
        
        # Clean the data
        df['Cree'] = df['Cree'].str.strip()
        df['English'] = df['English'].str.strip()
        
        # Remove any empty rows
        df = df.dropna()
        
        # Create mappings
        for _, row in df.iterrows():
            cree_word = row['Cree'].lower()
            english_meaning = row['English'].lower()
            
            if english_meaning not in self.cree_to_english[cree_word]:
                # a Cree word maps to a list of English meanings
                self.cree_to_english[cree_word].append(english_meaning)
            
            if cree_word not in self.english_to_cree[english_meaning]:
                # an English word maps to a list of Cree words
                self.english_to_cree[english_meaning].append(cree_word)
        
        print(f"Unique Cree words: {len(self.cree_to_english)}")
        print(f"Unique English meanings: {len(self.english_to_cree)}")
        
        # Analyze one-to-many mappings
        multi_meaning_cree = {k: v for k, v in self.cree_to_english.items() if len(v) > 1}
        print(f"Cree words with multiple meanings: {len(multi_meaning_cree)}")
        
        # Show some examples
        print("\nExamples of Cree words with multiple meanings:")
        for i, (cree, meanings) in enumerate(list(multi_meaning_cree.items())[:5]):
            print(f"  {cree}: {', '.join(meanings)}")
        
        return df

# Attach it to the class
CreeLearningModel.preprocess_data = preprocess_data

In [17]:
model = CreeLearningModel()

In [18]:
csv_file = 'D:\.SeaThru\Work\AI-for-Indigenous-Language--Revitalization-in-Canada\AI-for-Indigenous-Language-Revitalization-in-Canada\data\cleaned\cree_english_text_only.csv'  # Update this path
df = model.preprocess_data(csv_file)

Loading and preprocessing data...
Original dataset shape: (1188, 2)
Unique Cree words: 999
Unique English meanings: 1099
Cree words with multiple meanings: 147

Examples of Cree words with multiple meanings:
  achiwpayin: it shrunk, it shrinks
  ahin: put, place me
  ahpō: or, either
  akihtāsowin: a number, a figure
  akihtāsōna: numbers, figures


In [22]:
df.head()

Unnamed: 0,Cree,English
0,acahkosak,stars
1,achimēwak,they are telling a story about him
2,achimoh,tell a story
3,achimostamawâw,a story is told to him/her
4,achimostaw,tell him a story


**Creating Embeddings**

In [5]:
def create_embeddings(self, df):
        """
        Create TF-IDF embeddings for Cree words and English meanings
        """
        print("\nCreating embeddings...")
        
        # Get unique words and meanings
        unique_cree = list(self.cree_to_english.keys())
        unique_english = list(self.english_to_cree.keys())
        
        # Create embeddings for Cree words
        self.cree_embeddings = self.vectorizer.fit_transform(unique_cree)
        
        # Create embeddings for English meanings (using same vectorizer)
        self.english_embeddings = self.vectorizer.transform(unique_english)
        
        print(f"Cree embeddings shape: {self.cree_embeddings.shape}")
        print(f"English embeddings shape: {self.english_embeddings.shape}")
        
        return unique_cree, unique_english

# Attach it to the class
CreeLearningModel.create_embeddings = create_embeddings

In [23]:
unique_cree, unique_english = model.create_embeddings(df)


Creating embeddings...
Cree embeddings shape: (999, 1000)
English embeddings shape: (1099, 1000)


**Building Similarity Model**

In [6]:
def build_similarity_model(self):
        """
        Build a similarity-based model for translation
        """
        print("\nBuilding similarity model...")
        
        # Calculate similarity matrix between Cree and English
        self.similarity_matrix = cosine_similarity(self.cree_embeddings, self.english_embeddings)
        print(f"Similarity matrix shape: {self.similarity_matrix.shape}")

# Attach it to the class
CreeLearningModel.build_similarity_model = build_similarity_model

In [26]:
model.build_similarity_model()


Building similarity model...
Similarity matrix shape: (999, 1099)


**Finding Translations**

In [7]:
def find_translations(self, cree_word, top_k=5):
        """
        Find top-k English translations for a Cree word
        """
        cree_word = cree_word.lower().strip()
        
        # Direct lookup first
        if cree_word in self.cree_to_english:
            return self.cree_to_english[cree_word]
        
        # If not found, use similarity
        unique_cree = list(self.cree_to_english.keys())
        unique_english = list(self.english_to_cree.keys())
        
        if cree_word not in unique_cree:
            # Find most similar Cree word
            query_embedding = self.vectorizer.transform([cree_word])
            similarities = cosine_similarity(query_embedding, self.cree_embeddings)[0]
            
            # Get top similar Cree words
            top_indices = similarities.argsort()[-top_k:][::-1]
            similar_cree_words = [unique_cree[i] for i in top_indices if similarities[i] > 0.1]
            
            # Get their translations
            translations = []
            for similar_word in similar_cree_words:
                translations.extend(self.cree_to_english[similar_word])
            
            return list(set(translations))[:top_k]
        
        return []

# Attach it to the class
CreeLearningModel.find_translations = find_translations

In [27]:
print("\n=== Testing the Model ===")
        
# Test translation lookup
test_words = ['ahēw', 'ahin', 'ahpō']
for word in test_words:
    translations = model.find_translations(word)
    print(f"'{word}' -> {translations}")


=== Testing the Model ===
'ahēw' -> ['he placed him']
'ahin' -> ['put', 'place me']
'ahpō' -> ['or', 'either']


**Finding Cree Words**

In [8]:
def find_cree_words(self, english_meaning, top_k=5):
        """
        Find Cree words for an English meaning
        """
        english_meaning = english_meaning.lower().strip()
        
        # Direct lookup first
        if english_meaning in self.english_to_cree:
            return self.english_to_cree[english_meaning]
        
        # If not found, use similarity
        unique_english = list(self.english_to_cree.keys())
        
        if english_meaning not in unique_english:
            # Find most similar English meaning
            query_embedding = self.vectorizer.transform([english_meaning])
            similarities = cosine_similarity(query_embedding, self.english_embeddings)[0]
            
            # Get top similar English meanings
            top_indices = similarities.argsort()[-top_k:][::-1]
            similar_meanings = [unique_english[i] for i in top_indices if similarities[i] > 0.1]
            
            # Get their Cree translations
            cree_words = []
            for similar_meaning in similar_meanings:
                cree_words.extend(self.english_to_cree[similar_meaning])
            
            return list(set(cree_words))[:top_k]
        
        return []

# Attach it to the class
CreeLearningModel.find_cree_words = find_cree_words

In [30]:
print("\nReverse lookup:")
test_meanings = ['place him', 'or', 'this']
for meaning in test_meanings:
    cree_words = model.find_cree_words(meaning)
    print(f"'{meaning}' -> {cree_words}")


Reverse lookup:
'place him' -> ['ahih']
'or' -> ['ahpō']
'this' -> ['ōma']


**Creating Learning Exercises**

In [9]:
def create_learning_exercises(self, difficulty='mixed'):
        """
        Create learning exercises based on the dataset
        """
        exercises = []
        
        if difficulty == 'easy':
            # Single meaning words only
            words = {k: v for k, v in self.cree_to_english.items() if len(v) == 1}
        elif difficulty == 'hard':
            # Multiple meaning words only
            words = {k: v for k, v in self.cree_to_english.items() if len(v) > 1}
        else:
            # Mixed difficulty
            words = self.cree_to_english
        
        # Multiple choice exercises
        word_list = list(words.keys())
        np.random.shuffle(word_list)
        
        for cree_word in word_list[:20]:  # Create 20 exercises
            correct_answers = words[cree_word]
            
            # Get wrong answers
            all_meanings = list(self.english_to_cree.keys())
            wrong_answers = [m for m in all_meanings if m not in correct_answers]
            wrong_choices = np.random.choice(wrong_answers, min(3, len(wrong_answers)), replace=False)
            
            # Create multiple choice
            choices = list(correct_answers) + list(wrong_choices)
            np.random.shuffle(choices)
            
            exercises.append({
                'cree_word': cree_word,
                'choices': choices,
                'correct_answers': correct_answers,
                'type': 'multiple_choice'
            })
        
        return exercises

# Attach it to the class
CreeLearningModel.create_learning_exercises = create_learning_exercises

In [34]:
print("\n=== Sample Learning Exercises ===")
exercises = model.create_learning_exercises(difficulty='mixed')
        
for i, exercise in enumerate(exercises[:3]):  # Show first 3 exercises
    print(f"\nExercise {i+1}:")
    print(f"What does '{exercise['cree_word']}' mean?")
    for j, choice in enumerate(exercise['choices'], 1):
        print(f"  {j}. {choice}")
    print(f"Correct answer(s): {exercise['correct_answers']}")


=== Sample Learning Exercises ===

Exercise 1:
What does 'pōsik' mean?
  1. hurry up
  2. a number
  3. laugh
  4. all of you get on board
Correct answer(s): ['all of you get on board']

Exercise 2:
What does 'kakwēcihkēmōwin' mean?
  1. inquiring
  2. treat him with respect
  3. asking
  4. rocks
  5. redirectional response
Correct answer(s): ['asking', 'inquiring']

Exercise 3:
What does 'ponihtā' mean?
  1. a forest
  2. leave it alone
  3. this is a particle towards
  4. put
Correct answer(s): ['leave it alone']


**Saving Model**

In [10]:
def save_model(self, filepath='cree_model.pkl'):
        """
        Save the trained model
        """
        model_data = {
            'vectorizer': self.vectorizer,
            'cree_to_english': dict(self.cree_to_english),
            'english_to_cree': dict(self.english_to_cree),
            'similarity_matrix': self.similarity_matrix if hasattr(self, 'similarity_matrix') else None
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {filepath}")

# Attach it to the class
CreeLearningModel.save_model = save_model

In [35]:
model.save_model('../models/cree_learning_model.pkl')

Model saved to ../models/cree_learning_model.pkl


**Load Model**

In [11]:
def load_model(self, filepath='cree_model.pkl'):
        """
        Load a saved model
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        self.vectorizer = model_data['vectorizer']
        self.cree_to_english = defaultdict(list, model_data['cree_to_english'])
        self.english_to_cree = defaultdict(list, model_data['english_to_cree'])
        self.similarity_matrix = model_data.get('similarity_matrix')
        
        print(f"Model loaded from {filepath}")

# Attach it to the class
CreeLearningModel.load_model = load_model

**Evaluate Model**

In [12]:
def evaluate_model(self, test_ratio=0.2, random_state=42):
        """
        Comprehensive model evaluation with multiple metrics
        """
        print("\n=== Model Evaluation ===")
        
        # Prepare data for evaluation
        all_cree_words = list(self.cree_to_english.keys())
        
        # Split data for evaluation
        train_words, test_words = train_test_split(
            all_cree_words, 
            test_size=test_ratio, 
            random_state=random_state
        )
        
        print(f"Training words: {len(train_words)}")
        print(f"Testing words: {len(test_words)}")
        
        # Evaluation metrics
        evaluation_results = {
            'exact_match_accuracy': 0,
            'partial_match_accuracy': 0,
            'top_3_accuracy': 0,
            'top_5_accuracy': 0,
            'average_similarity_score': 0,
            'coverage_score': 0,
            'multilingual_precision': 0,
            'multilingual_recall': 0,
            'multilingual_f1': 0
        }
        
        exact_matches = 0
        partial_matches = 0
        top_3_matches = 0
        top_5_matches = 0
        similarity_scores = []
        covered_words = 0
        
        # Detailed results for analysis
        detailed_results = []
        
        for cree_word in test_words:
            true_meanings = set(self.cree_to_english[cree_word])
            predicted_meanings = self.find_translations(cree_word, top_k=5)
            predicted_set = set(predicted_meanings)
            
            # Exact match (all meanings predicted correctly)
            if true_meanings == predicted_set:
                exact_matches += 1
            
            # Partial match (at least one meaning predicted correctly)
            if len(true_meanings.intersection(predicted_set)) > 0:
                partial_matches += 1
            
            # Top-k accuracy
            if len(true_meanings.intersection(set(predicted_meanings[:3]))) > 0:
                top_3_matches += 1
            if len(true_meanings.intersection(set(predicted_meanings[:5]))) > 0:
                top_5_matches += 1
            
            # Coverage (model found the word)
            if len(predicted_meanings) > 0:
                covered_words += 1
            
            # Similarity score
            if len(predicted_meanings) > 0:
                # Calculate Jaccard similarity
                jaccard_sim = len(true_meanings.intersection(predicted_set)) / len(true_meanings.union(predicted_set))
                similarity_scores.append(jaccard_sim)
            else:
                similarity_scores.append(0)
            
            # Store detailed results
            detailed_results.append({
                'cree_word': cree_word,
                'true_meanings': list(true_meanings),
                'predicted_meanings': predicted_meanings,
                'exact_match': true_meanings == predicted_set,
                'partial_match': len(true_meanings.intersection(predicted_set)) > 0,
                'jaccard_similarity': similarity_scores[-1]
            })
        
        # Calculate final scores
        n_test = len(test_words)
        evaluation_results['exact_match_accuracy'] = exact_matches / n_test
        evaluation_results['partial_match_accuracy'] = partial_matches / n_test
        evaluation_results['top_3_accuracy'] = top_3_matches / n_test
        evaluation_results['top_5_accuracy'] = top_5_matches / n_test
        evaluation_results['average_similarity_score'] = np.mean(similarity_scores)
        evaluation_results['coverage_score'] = covered_words / n_test
        
        # Multi-label classification metrics
        y_true_multilabel = []
        y_pred_multilabel = []
        
        # Get all possible English meanings for encoding
        all_english_meanings = list(self.english_to_cree.keys())
        
        for result in detailed_results:
            # True labels (binary vector)
            true_vector = [1 if meaning in result['true_meanings'] else 0 for meaning in all_english_meanings]
            y_true_multilabel.append(true_vector)
            
            # Predicted labels (binary vector)
            pred_vector = [1 if meaning in result['predicted_meanings'] else 0 for meaning in all_english_meanings]
            y_pred_multilabel.append(pred_vector)
        
        # Calculate precision, recall, F1 for multi-label scenario
        if len(y_true_multilabel) > 0:
            evaluation_results['multilingual_precision'] = precision_score(
                y_true_multilabel, y_pred_multilabel, average='micro', zero_division=0
            )
            evaluation_results['multilingual_recall'] = recall_score(
                y_true_multilabel, y_pred_multilabel, average='micro', zero_division=0
            )
            evaluation_results['multilingual_f1'] = f1_score(
                y_true_multilabel, y_pred_multilabel, average='micro', zero_division=0
            )
        
        # Print evaluation results
        print("\n--- Evaluation Results ---")
        print(f"Exact Match Accuracy: {evaluation_results['exact_match_accuracy']:.3f}")
        print(f"Partial Match Accuracy: {evaluation_results['partial_match_accuracy']:.3f}")
        print(f"Top-3 Accuracy: {evaluation_results['top_3_accuracy']:.3f}")
        print(f"Top-5 Accuracy: {evaluation_results['top_5_accuracy']:.3f}")
        print(f"Average Similarity Score: {evaluation_results['average_similarity_score']:.3f}")
        print(f"Coverage Score: {evaluation_results['coverage_score']:.3f}")
        print(f"Multi-label Precision: {evaluation_results['multilingual_precision']:.3f}")
        print(f"Multi-label Recall: {evaluation_results['multilingual_recall']:.3f}")
        print(f"Multi-label F1-Score: {evaluation_results['multilingual_f1']:.3f}")
        
        # Show some examples
        print("\n--- Sample Predictions ---")
        for i in range(min(5, len(detailed_results))):
            result = detailed_results[i]
            print(f"\nCree: '{result['cree_word']}'")
            print(f"  True: {result['true_meanings']}")
            print(f"  Predicted: {result['predicted_meanings']}")
            print(f"  Exact Match: {result['exact_match']}")
            print(f"  Similarity: {result['jaccard_similarity']:.3f}")
        
        # Error analysis
        print("\n--- Error Analysis ---")
        failed_predictions = [r for r in detailed_results if not r['partial_match']]
        print(f"Failed predictions: {len(failed_predictions)}")
        
        if failed_predictions:
            print("Examples of failed predictions:")
            for i in range(min(3, len(failed_predictions))):
                result = failed_predictions[i]
                print(f"  '{result['cree_word']}': {result['true_meanings']} -> {result['predicted_meanings']}")
        
        return evaluation_results, detailed_results

# Attach it to the class
CreeLearningModel.evaluate_model = evaluate_model

In [36]:
eval_results, detailed_results = model.evaluate_model(test_ratio=0.2)


=== Model Evaluation ===
Training words: 799
Testing words: 200

--- Evaluation Results ---
Exact Match Accuracy: 1.000
Partial Match Accuracy: 1.000
Top-3 Accuracy: 1.000
Top-5 Accuracy: 1.000
Average Similarity Score: 1.000
Coverage Score: 1.000
Multi-label Precision: 1.000
Multi-label Recall: 1.000
Multi-label F1-Score: 1.000

--- Sample Predictions ---

Cree: 'kāyān'
  True: ['you can have it']
  Predicted: ['you can have it']
  Exact Match: True
  Similarity: 1.000

Cree: 'ōskēsēhōwin'
  True: ['wearing new clothes']
  Predicted: ['wearing new clothes']
  Exact Match: True
  Similarity: 1.000

Cree: 'ākosimowin'
  True: ['nestling']
  Predicted: ['nestling']
  Exact Match: True
  Similarity: 1.000

Cree: 'ihtakon'
  True: ['it exists']
  Predicted: ['it exists']
  Exact Match: True
  Similarity: 1.000

Cree: 'nisitohta'
  True: ['understand it']
  Predicted: ['understand it']
  Exact Match: True
  Similarity: 1.000

--- Error Analysis ---
Failed predictions: 0


In [42]:
print("\n=== Final Model Performance Summary ===")
print(f"📊 Overall Accuracy: {eval_results['partial_match_accuracy']:.1%}")
print(f"🎯 Exact Match Rate: {eval_results['exact_match_accuracy']:.1%}")
print(f"📈 Top-3 Accuracy: {eval_results['top_3_accuracy']:.1%}")
print(f"🔍 Coverage: {eval_results['coverage_score']:.1%}")
print(f"⚡ F1-Score: {eval_results['multilingual_f1']:.3f}")
print(f"📏 Avg Similarity: {eval_results['average_similarity_score']:.3f}")
        
# Provide interpretation
print("\n=== Model Quality Interpretation ===")
if eval_results['partial_match_accuracy'] >= 0.8:
    print("✅ EXCELLENT: Model performs very well!")
elif eval_results['partial_match_accuracy'] >= 0.6:
    print("✅ GOOD: Model performs reasonably well")
elif eval_results['partial_match_accuracy'] >= 0.4:
    print("⚠️ FAIR: Model needs improvement")
else:
    print("❌ POOR: Model needs significant work")

# Recommendations
print("\n=== Recommendations ===")
if eval_results['coverage_score'] < 0.9:
    print("• Consider expanding vocabulary coverage")
if eval_results['exact_match_accuracy'] < 0.3:
    print("• Multiple meanings handling could be improved")
if eval_results['multilingual_f1'] < 0.5:
    print("• Consider more sophisticated NLP techniques")
else:
    print("Model is performing well. No recommendations provided.")


=== Final Model Performance Summary ===
📊 Overall Accuracy: 100.0%
🎯 Exact Match Rate: 100.0%
📈 Top-3 Accuracy: 100.0%
🔍 Coverage: 100.0%
⚡ F1-Score: 1.000
📏 Avg Similarity: 1.000

=== Model Quality Interpretation ===
✅ EXCELLENT: Model performs very well!

=== Recommendations ===
Model is performing well. No recommendations provided.


**Cross Validating Model**

In [13]:
def cross_validate_model(self, k_folds=5, random_state=42):
        """
        Perform k-fold cross validation
        """
        print(f"\n=== {k_folds}-Fold Cross Validation ===")
        
        all_cree_words = list(self.cree_to_english.keys())
        np.random.seed(random_state)
        np.random.shuffle(all_cree_words)
        
        # Split into k folds
        fold_size = len(all_cree_words) // k_folds
        cv_results = []
        
        for fold in range(k_folds):
            print(f"\nFold {fold + 1}/{k_folds}")
            
            # Define test set for this fold
            start_idx = fold * fold_size
            end_idx = start_idx + fold_size if fold < k_folds - 1 else len(all_cree_words)
            test_words = all_cree_words[start_idx:end_idx]
            
            # Evaluate on this fold
            exact_matches = 0
            partial_matches = 0
            similarity_scores = []
            
            for cree_word in test_words:
                true_meanings = set(self.cree_to_english[cree_word])
                predicted_meanings = self.find_translations(cree_word, top_k=5)
                predicted_set = set(predicted_meanings)
                
                if true_meanings == predicted_set:
                    exact_matches += 1
                
                if len(true_meanings.intersection(predicted_set)) > 0:
                    partial_matches += 1
                
                # Jaccard similarity
                if len(predicted_meanings) > 0:
                    jaccard_sim = len(true_meanings.intersection(predicted_set)) / len(true_meanings.union(predicted_set))
                    similarity_scores.append(jaccard_sim)
                else:
                    similarity_scores.append(0)
            
            fold_results = {
                'fold': fold + 1,
                'exact_accuracy': exact_matches / len(test_words),
                'partial_accuracy': partial_matches / len(test_words),
                'avg_similarity': np.mean(similarity_scores)
            }
            
            cv_results.append(fold_results)
            print(f"  Exact Accuracy: {fold_results['exact_accuracy']:.3f}")
            print(f"  Partial Accuracy: {fold_results['partial_accuracy']:.3f}")
            print(f"  Avg Similarity: {fold_results['avg_similarity']:.3f}")
        
        # Calculate overall CV results
        cv_summary = {
            'mean_exact_accuracy': np.mean([r['exact_accuracy'] for r in cv_results]),
            'std_exact_accuracy': np.std([r['exact_accuracy'] for r in cv_results]),
            'mean_partial_accuracy': np.mean([r['partial_accuracy'] for r in cv_results]),
            'std_partial_accuracy': np.std([r['partial_accuracy'] for r in cv_results]),
            'mean_similarity': np.mean([r['avg_similarity'] for r in cv_results]),
            'std_similarity': np.std([r['avg_similarity'] for r in cv_results])
        }
        
        print(f"\n--- Cross Validation Summary ---")
        print(f"Exact Accuracy: {cv_summary['mean_exact_accuracy']:.3f} ± {cv_summary['std_exact_accuracy']:.3f}")
        print(f"Partial Accuracy: {cv_summary['mean_partial_accuracy']:.3f} ± {cv_summary['std_partial_accuracy']:.3f}")
        print(f"Similarity Score: {cv_summary['mean_similarity']:.3f} ± {cv_summary['std_similarity']:.3f}")
        
        return cv_results, cv_summary

# Attach it to the class
CreeLearningModel.cross_validate_model = cross_validate_model

In [43]:
cv_results, cv_summary = model.cross_validate_model(k_folds=5)


=== 5-Fold Cross Validation ===

Fold 1/5
  Exact Accuracy: 1.000
  Partial Accuracy: 1.000
  Avg Similarity: 1.000

Fold 2/5
  Exact Accuracy: 1.000
  Partial Accuracy: 1.000
  Avg Similarity: 1.000

Fold 3/5
  Exact Accuracy: 1.000
  Partial Accuracy: 1.000
  Avg Similarity: 1.000

Fold 4/5
  Exact Accuracy: 1.000
  Partial Accuracy: 1.000
  Avg Similarity: 1.000

Fold 5/5
  Exact Accuracy: 1.000
  Partial Accuracy: 1.000
  Avg Similarity: 1.000

--- Cross Validation Summary ---
Exact Accuracy: 1.000 ± 0.000
Partial Accuracy: 1.000 ± 0.000
Similarity Score: 1.000 ± 0.000


**Learning Curve Analysis**

In [14]:
def learning_curve_analysis(self):
        """
        Analyze how model performance changes with dataset size
        """
        print("\n=== Learning Curve Analysis ===")
        
        all_words = list(self.cree_to_english.keys())
        dataset_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        
        learning_curve_results = []
        
        for size in dataset_sizes:
            # Sample subset of data
            n_words = int(len(all_words) * size)
            sampled_words = np.random.choice(all_words, n_words, replace=False)
            
            # Create temporary model with subset
            temp_cree_to_english = {word: self.cree_to_english[word] for word in sampled_words}
            
            # Test on remaining data
            test_words = [word for word in all_words if word not in sampled_words]
            if len(test_words) == 0:
                continue
            
            # Evaluate
            partial_matches = 0
            for test_word in test_words[:min(50, len(test_words))]:  # Limit for efficiency
                true_meanings = set(self.cree_to_english[test_word])
                predicted_meanings = self.find_translations(test_word, top_k=3)
                predicted_set = set(predicted_meanings)
                
                if len(true_meanings.intersection(predicted_set)) > 0:
                    partial_matches += 1
            
            accuracy = partial_matches / min(50, len(test_words))
            
            learning_curve_results.append({
                'dataset_size': size,
                'n_training_words': n_words,
                'accuracy': accuracy
            })
            
            print(f"Dataset size: {size:.1f} ({n_words} words) -> Accuracy: {accuracy:.3f}")
        
        return learning_curve_results

# Attach it to the class
CreeLearningModel.learning_curve_analysis = learning_curve_analysis

In [44]:
learning_curve_data = model.learning_curve_analysis()


=== Learning Curve Analysis ===
Dataset size: 0.1 (99 words) -> Accuracy: 1.000
Dataset size: 0.2 (199 words) -> Accuracy: 1.000
Dataset size: 0.3 (299 words) -> Accuracy: 1.000
Dataset size: 0.4 (399 words) -> Accuracy: 1.000
Dataset size: 0.5 (499 words) -> Accuracy: 1.000
Dataset size: 0.6 (599 words) -> Accuracy: 1.000
Dataset size: 0.7 (699 words) -> Accuracy: 1.000
Dataset size: 0.8 (799 words) -> Accuracy: 1.000
Dataset size: 0.9 (899 words) -> Accuracy: 1.000


**Model Confidence Score Calculation**

In [15]:
def model_confidence_score(self, cree_word, threshold=0.3):
        """
        Calculate confidence score for a prediction
        """
        predictions = self.find_translations(cree_word, top_k=5)
        
        if not predictions:
            return 0.0
        
        # Calculate confidence based on similarity scores
        unique_cree = list(self.cree_to_english.keys())
        
        if cree_word.lower() in unique_cree:
            # Direct match - high confidence
            return 1.0
        else:
            # Similarity-based match
            query_embedding = self.vectorizer.transform([cree_word.lower()])
            similarities = cosine_similarity(query_embedding, self.cree_embeddings)[0]
            max_similarity = np.max(similarities)
            
            # Normalize confidence score
            confidence = min(max_similarity / threshold, 1.0) if max_similarity > 0 else 0.0
            return confidence
        

# Attach it to the class
CreeLearningModel.model_confidence_score = model_confidence_score

In [48]:
print("\n=== Confidence Scoring Examples ===\n")
confidence_test_words = ['ahēw', 'wāsaskotēnikan', 'ahin']
for word in confidence_test_words:
    confidence = model.model_confidence_score(word)
    translations = model.find_translations(word)
    print(f"'{word}' -> Confidence: {confidence:.3f}, Translations: {translations}")
        


=== Confidence Scoring Examples ===

'ahēw' -> Confidence: 1.000, Translations: ['he placed him']
'wāsaskotēnikan' -> Confidence: 1.000, Translations: ['a lamp', 'lightbulb']
'ahin' -> Confidence: 1.000, Translations: ['put', 'place me']


**Model Statistics**

In [49]:
print("\n=== Model Statistics ===")
print(f"Total Cree words: {len(model.cree_to_english)}")
print(f"Total English meanings: {len(model.english_to_cree)}")
        
multi_meaning = sum(1 for v in model.cree_to_english.values() if len(v) > 1)
print(f"Cree words with multiple meanings: {multi_meaning}")
        
avg_meanings = np.mean([len(v) for v in model.cree_to_english.values()])
print(f"Average meanings per Cree word: {avg_meanings:.2f}")
        


=== Model Statistics ===
Total Cree words: 999
Total English meanings: 1099
Cree words with multiple meanings: 147
Average meanings per Cree word: 1.19
