<a href="https://colab.research.google.com/github/Gargi2305/Bigram-Word-Classification/blob/main/word%20classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

class WordBigramModel:
    """Guess vocabulary words from a list of character bigrams.

    ``fit`` stores the vocabulary and trains a decision tree on
    character-bigram counts. ``predict`` does not consult the tree or the
    vectorizer; it ranks the stored words with ``calculate_match_score``.
    """

    # A candidate must reach this fraction of the best score to be returned.
    SCORE_THRESHOLD_RATIO = 0.7
    # Upper bound on the number of guesses returned by ``predict``.
    MAX_GUESSES = 5

    def __init__(self):
        self.vectorizer = None  # CountVectorizer, set by fit()
        self.classifier = None  # DecisionTreeClassifier, set by fit()
        self.words = None       # vocabulary passed to fit()

    def fit(self, words):
        """Store the vocabulary and train the bigram-count classifier.

        Args:
            words: Sequence of words; each word serves as both the sample
                and its own class label.
        """
        self.words = words
        self.vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
        features = self.vectorizer.fit_transform(words)

        # Stratified cross-validation needs at least ``cv`` samples in every
        # class, so the fold count is capped by the rarest word's frequency.
        _, counts = np.unique(words, return_counts=True)
        min_class_size = int(counts.min())

        if min_class_size >= 2:
            param_grid = {
                'max_depth': [10, 20, 50, 100, None],
                'min_samples_leaf': [1, 2, 5, 10],
            }
            grid_search = GridSearchCV(
                DecisionTreeClassifier(),
                param_grid,
                cv=min(5, min_class_size),
            )
            grid_search.fit(features, words)
            self.classifier = grid_search.best_estimator_
        else:
            # Some word occurs only once, so cross-validation is impossible;
            # fall back to a tree with fixed hyper-parameters.
            self.classifier = DecisionTreeClassifier(max_depth=400, min_samples_leaf=5)
            self.classifier.fit(features, words)

    def predict(self, bigram_list, test_word):
        """Return up to ``MAX_GUESSES`` words that best match the bigrams.

        Args:
            bigram_list: Sequence of two-character strings to match.
            test_word: Unused; kept so existing callers keep working.

        Returns:
            Words whose score is positive and at least
            ``SCORE_THRESHOLD_RATIO`` times the best score, ordered by
            descending score, then word length, then alphabetically.
            An empty list when no stored word scores above zero.
        """
        scored = []
        for word in self.words:
            score = self.calculate_match_score(bigram_list, word)
            if score > 0:  # words with no overlap are never candidates
                scored.append((word, score))

        # Without this guard the best score would be read from an empty
        # list of candidates.
        if not scored:
            return []

        scored.sort(key=lambda pair: (-pair[1], len(pair[0]), pair[0]))
        threshold = scored[0][1] * self.SCORE_THRESHOLD_RATIO
        return [word for word, score in scored if score >= threshold][:self.MAX_GUESSES]

    def calculate_match_score(self, bigram_list, word):
        """Score how well ``word`` matches ``bigram_list``.

        If every bigram in ``bigram_list`` equals the word's bigram at the
        same position, the score is twice the number of bigrams. Otherwise
        the score is the total number of occurrences of the listed bigrams
        anywhere in the word.

        Args:
            bigram_list: Sequence of two-character strings.
            word: Candidate word.

        Returns:
            Non-negative integer score; 0 for an empty ``bigram_list``.
        """
        word_bigrams = [word[i:i + 2] for i in range(len(word) - 1)]
        # zip stops at the shorter sequence, so bigrams beyond the end of
        # the word simply do not count as positional matches.
        aligned = sum(
            1 for wanted, actual in zip(bigram_list, word_bigrams) if wanted == actual
        )
        if aligned == len(bigram_list):
            return aligned * 2  # double weight for a complete in-order match
        return sum(word.count(bigram) for bigram in bigram_list)

def my_fit(words):
    """Build a ``WordBigramModel``, fit it on ``words``, and return it."""
    fitted = WordBigramModel()
    fitted.fit(words)
    return fitted

def my_predict(model, bigram_list):
    """Return ``model.predict`` guesses for ``bigram_list``.

    The model's ``predict`` takes a second ``test_word`` argument; an empty
    string is passed as a placeholder.
    """
    placeholder_word = ""
    guesses = model.predict(bigram_list, placeholder_word)
    return guesses

