In [1]:
import re
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import tkinter as tk
from tkinter import messagebox
from sklearn.model_selection import train_test_split

In [2]:
# 1. Dataset Loading
def load_dataset():
    # Load WordNet dataset from NLTK
    dataset = []
    for synset in list(wn.all_synsets()):
        word = synset.name().split('.')[0]
        sense = synset.name().split('.')[1]
        meaning = synset.definition()
        dataset.append({'Word': word, 'Sense': sense, 'Meaning': meaning})
    dataset = pd.DataFrame(dataset)
    return dataset

In [3]:
# 2. Preprocessing Functions
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [4]:
# 3. Feature Extraction Functions
def extract_features(word):
    # Example feature extraction: Part-of-speech tagging
    pos_tags = pos_tag([word])
    features = {'POS': pos_tags[0][1]}
    return features

In [5]:
# 4. Regular Expression Design
def create_regex_pattern(word):
    # Create a regular expression pattern for the word
    pattern = rf"\b{word}\b"
    return pattern

In [6]:
# 5. Rule-Based Disambiguation
def rule_based_disambiguate_word(word, dataset, features):
    # Iterate through the dataset and check for word matches
    for index, row in dataset.iterrows():
        if re.search(row['Word'], word):
            # Check if all the features match
            if all(feature in row for feature in features):
                return row['Sense']

    # If no match is found, return None
    return None

In [7]:
# 6. Model Training
def train_model(dataset):
    # Preprocess the dataset
    dataset['Preprocessed'] = dataset['Meaning'].apply(preprocess_text)

    # Extract features using TF-IDF vectorization
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(dataset['Preprocessed'])
    y = dataset['Sense']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the LinearSVC model
    model = LinearSVC()
    model.fit(X_train, y_train)

    # Calculate accuracy on the testing set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy}")

    return model, vectorizer

In [8]:
# 7. Model Prediction
def predict_sense(text, dataset, model, vectorizer):
    preprocessed_text = preprocess_text(text)
    words = preprocessed_text.split()
    word_senses = {}
    ambiguous_words = {}

    for word in words:
        features = extract_features(word)
        sense = rule_based_disambiguate_word(word, dataset, features)
        if sense is None:
            X = vectorizer.transform([preprocessed_text])
            predicted_sense = model.predict(X)
            sense = predicted_sense[0]

        if sense is not None:
            if word in ambiguous_words:
                ambiguous_words[word]['count'] += 1
            else:
                ambiguous_words[word] = {'count': 1, 'meanings': []}
            word_senses[word] = sense
            word_meaning = dataset.loc[(dataset['Word'] == word) & (dataset['Sense'] == sense), 'Meaning'].values
            if len(word_meaning) > 0:
                ambiguous_words[word]['meanings'].append(word_meaning[0])
            else:
                ambiguous_words[word]['meanings'].append('Meaning not found')

    return word_senses, ambiguous_words

In [9]:
# 8. User Interface Functions
def disambiguate():
    global input_text, dataset, model, vectorizer

    text = input_text.get("1.0", "end").strip()
    if text:
        word_senses, ambiguous_words = predict_sense(text, dataset, model, vectorizer)
        ambiguous_words_count = len(ambiguous_words)
        meanings = []

        # Display ambiguous word senses count
        messagebox.showinfo("Ambiguous Words Count", f"Ambiguous words count: {ambiguous_words_count}")

        # Display ambiguous word senses and their meanings
        if ambiguous_words_count > 0:
            for word in ambiguous_words:
                meanings.append(f"{word} : {', '.join(ambiguous_words[word]['meanings'])}")

        if meanings:
            messagebox.showinfo("Word Senses", "\n".join(meanings))
        else:
            messagebox.showinfo("Word Senses", "No ambiguous words found")


In [10]:
def measure_performance(dataset, model, vectorizer):
    # Prepare the test data
    test_data = dataset.sample(frac=0.2, random_state=42)  # Use 20% of the dataset for testing
    test_X = vectorizer.transform(test_data['Preprocessed'])
    test_y = test_data['Sense']

    # Predict the senses
    predicted_y = model.predict(test_X)

    # Calculate and display the classification report
    report = classification_report(test_y, predicted_y)
    print("Classification Report:")
    print(report)

    # Calculate and display the accuracy
    accuracy = accuracy_score(test_y, predicted_y)
    print(f"Accuracy: {accuracy}")


def main():
    global input_text, dataset, model, vectorizer

    # Load the dataset
    dataset = load_dataset()

    # Train the machine learning model
    model, vectorizer = train_model(dataset)

    # Measure the performance
    measure_performance(dataset, model, vectorizer)

    # Create the UI
    root = tk.Tk()
    root.title("Word Sense Disambiguation")
    root.geometry("400x300")

    label = tk.Label(root, text="Enter a sentence:")
    label.pack()

    input_text = tk.Text(root, height=5)
    input_text.pack()

    button = tk.Button(root, text="Disambiguate", command=disambiguate)
    button.pack()

    root.mainloop()

if __name__ == "__main__":
    main()


Model accuracy: 0.838517763046065


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

                    0.00      0.00      0.00         2
-bruno's-lily       0.00      0.00      0.00         1
   22_caliber       0.00      0.00      0.00         1
   45_caliber       0.00      0.00      0.00         0
 _elias_range       0.00      0.00      0.00         1
  _petersburg       0.00      0.00      0.00         2
            a       0.73      0.48      0.58      1500
            k       0.00      0.00      0.00         1
            n       0.88      0.96      0.92     16376
            o       0.00      0.00      0.00         1
            r       0.77      0.58      0.66       744
            s       0.56      0.43      0.49      2183
            v       0.80      0.72      0.76      2720

     accuracy                           0.84     23532
    macro avg       0.29      0.24      0.26     23532
 weighted avg       0.83      0.84      0.83     23532

Accuracy: 0.838517763046065
