In [None]:
import re
import requests
from collections import Counter
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML
import json

### Dataset Collection

In [None]:
# Source text: Project Gutenberg plain-text edition of
# "The Adventures of Sherlock Holmes".
url = "https://www.gutenberg.org/files/1661/1661-0.txt"
try:
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the notebook. Timeout errors are RequestException
    # subclasses, so the handler below covers them too.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    text_data = response.text
    print("Successfully downloaded 'The Adventures of Sherlock Holmes'.")
except requests.exceptions.RequestException as e:
    print(f"Error downloading dataset: {e}")
    text_data = "" # Ensure text_data is defined even if download fails

Successfully downloaded 'The Adventures of Sherlock Holmes'.


### NLP Preprocessing

In [None]:
# We clean the text and build a word frequency dictionary.

def preprocess(text):
    """
    Tokenize raw text into lowercase words.

    The input is lowercased first, then every maximal run of word
    characters (``\\w``: letters, digits and underscore) is extracted.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens in their order of appearance.
    """
    return re.findall(r'\w+', text.lower())

if not text_data:
    # Raise instead of calling exit(): in a Jupyter/Colab kernel exit() asks
    # the kernel to shut down, discarding all state. An exception still stops
    # "Run All" at this cell but leaves the kernel alive so the download cell
    # can simply be re-run.
    raise RuntimeError("Skipping further steps as dataset download failed.")

# Tokenize the corpus and count how often each word occurs.
words = preprocess(text_data)
word_counts = Counter(words)
total_words = len(words)

# Our vocabulary is the set of unique words (exactly the Counter's keys).
vocab = set(word_counts)

print(f"Total words in corpus: {total_words}")
print(f"Unique words (vocabulary size): {len(vocab)}")


Total words in corpus: 109119
Unique words (vocabulary size): 8183


### Initial Visualization

In [None]:
# Visualize the frequency of the most common words.

# Number of words shown in the chart; used for both the slice and the title
# so the two can never drift apart.
TOP_N_WORDS = 20

top_words = word_counts.most_common(TOP_N_WORDS)
df_top_words = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

fig = px.bar(df_top_words, x='Word', y='Frequency',
             title=f'Top {TOP_N_WORDS} Most Common Words',
             color='Word')
fig.show()

### Algorithm Comparison (Autocomplete)

In [None]:
# ALGORITHM 1: Frequency-Based Autocomplete (More Advanced)
def get_autocomplete_suggestions_v1(prefix, word_counts, max_suggestions=5):
    """Suggest completions for ``prefix``, most frequent words first.

    Args:
        prefix: Leading characters a suggestion must start with.
        word_counts: Mapping of word -> occurrence count.
        max_suggestions: Upper bound on the number of suggestions returned.

    Returns:
        Up to ``max_suggestions`` matching words, ordered by descending
        count. Words with equal counts keep the iteration order of
        ``word_counts`` because the sort is stable.
    """
    ranked = []
    for candidate in word_counts:
        if candidate.startswith(prefix):
            ranked.append(candidate)
    ranked.sort(key=lambda candidate: word_counts[candidate], reverse=True)
    return ranked[:max_suggestions]

# ALGORITHM 2: Simple Alphabetical Autocomplete (Basic)
def get_autocomplete_suggestions_v2(prefix, vocab, max_suggestions=5):
    """Suggest completions for ``prefix`` in plain alphabetical order.

    Args:
        prefix: Leading characters a suggestion must start with.
        vocab: Iterable of known words.
        max_suggestions: Upper bound on the number of suggestions returned.

    Returns:
        The alphabetically first ``max_suggestions`` words from ``vocab``
        that start with ``prefix``.
    """
    matches = sorted(filter(lambda entry: entry.startswith(prefix), vocab))
    return matches[:max_suggestions]

# Run both autocomplete strategies on the same sample prefix and show the
# results side by side.
test_prefix = "adven"
suggestions_v1 = get_autocomplete_suggestions_v1(prefix=test_prefix, word_counts=word_counts)
suggestions_v2 = get_autocomplete_suggestions_v2(prefix=test_prefix, vocab=vocab)
print(
    f"Suggestions for '{test_prefix}' (Advanced - Frequency Based): {suggestions_v1}",
    f"Suggestions for '{test_prefix}' (Basic - Alphabetical): {suggestions_v2}",
    sep="\n",
)



Suggestions for 'adven' (Advanced - Frequency Based): ['adventure', 'adventures', 'adventuress']
Suggestions for 'adven' (Basic - Alphabetical): ['adventure', 'adventures', 'adventuress']


### Algorithm Comparison (Autocorrect)

In [None]:
# ALGORITHM 1: Edit Distance + Frequency (More Advanced)
def get_autocorrect_suggestion_v1(word, word_counts, vocab):
    """Correct ``word`` using edit distance, ranked by corpus frequency.

    Known words are returned unchanged. Otherwise candidates are the
    vocabulary words one edit away; if there are none, those two edits
    away; if there are still none, the input itself.

    Args:
        word: The (possibly misspelled) word to correct.
        word_counts: Mapping of word -> occurrence count.
        vocab: Container of known words (membership is tested with ``in``).

    Returns:
        The candidate with the highest count. Ties are broken
        alphabetically so the result is reproducible across runs, since
        set iteration order for strings is not stable between interpreter
        sessions. Words absent from ``word_counts`` count as 0.
    """
    if word in vocab:
        return word

    def edits1(word):
        """Return every string one delete/transpose/replace/insert away."""
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(word):
        """Lazily yield every string two edits away."""
        return (e2 for e1 in edits1(word) for e2 in edits1(e1))

    def known(words):
        """Keep only the strings that are in the vocabulary."""
        return {w for w in words if w in vocab}

    # `or` short-circuits, so the expensive two-edit search only runs when
    # no one-edit candidate exists.
    candidates = known(edits1(word)) or known(edits2(word)) or [word]

    # min over (-count, word) == highest count first, then alphabetical.
    # The default of 0 avoids comparing None when a vocab word has no count.
    return min(candidates, key=lambda w: (-word_counts.get(w, 0), w))

# ALGORITHM 2: Simple 1-Edit Distance (Basic)
def get_autocorrect_suggestion_v2(word, vocab):
    """Correct ``word`` using only vocabulary words one edit away.

    No frequency information is used. Known words are returned unchanged.

    Args:
        word: The (possibly misspelled) word to correct.
        vocab: Container of known words (membership is tested with ``in``).

    Returns:
        The alphabetically first vocabulary word that is one edit away, or
        ``word`` itself if there is none. Picking the alphabetical minimum
        (rather than an arbitrary set element) keeps the result identical
        across kernel restarts.
    """
    if word in vocab:
        return word

    def edits1(word):
        """Return every string one delete/transpose/replace/insert away."""
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    candidates = {w for w in edits1(word) if w in vocab}
    if candidates:
        return min(candidates)  # Deterministic "first" candidate
    return word  # Return original if no 1-edit correction is found

# Run both autocorrect strategies on the same misspelling and show the
# results side by side.
test_word = "wason" # Misspelling of 'watson'
correction_v1 = get_autocorrect_suggestion_v1(word=test_word, word_counts=word_counts, vocab=vocab)
correction_v2 = get_autocorrect_suggestion_v2(word=test_word, vocab=vocab)
print(
    f"Correction for '{test_word}' (Advanced): {correction_v1}",
    f"Correction for '{test_word}' (Basic): {correction_v2}",
    sep="\n",
)


Correction for 'wason' (Advanced): watson
Correction for 'wason' (Basic): watson


### Metrics - Measuring Performance

In [None]:
# Create a sample test set of (misspelled, correct) pairs
# Hand-written evaluation pairs, written as misspelling -> expected word and
# materialized as a list of (misspelled, correct) tuples in the order given.
autocorrect_test_set = list({
    'holms': 'holmes',
    'wason': 'watson',
    'advnture': 'adventure',
    'ther': 'there',
    'hapy': 'happy',
    'mistery': 'mystery',
    'wher': 'where',
}.items())

def calculate_accuracy(test_data, correction_function, **kwargs):
    """Return the percentage of test pairs a correction function gets right.

    Args:
        test_data: Iterable of ``(misspelled, correct)`` pairs.
        correction_function: Callable invoked as
            ``correction_function(misspelled, **kwargs)``; its return value
            is compared to ``correct`` with ``==``.
        **kwargs: Extra keyword arguments forwarded to
            ``correction_function`` on every call.

    Returns:
        Accuracy as a float in the range 0-100. An empty ``test_data``
        yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    # Materialize once so generators and other one-shot iterables work and
    # the length used below matches the pairs actually evaluated.
    pairs = list(test_data)
    if not pairs:
        return 0.0

    correct_predictions = sum(
        1
        for misspelled, correct in pairs
        if correction_function(misspelled, **kwargs) == correct
    )
    return (correct_predictions / len(pairs)) * 100

# Calculate accuracy for both algorithms
# Score each corrector against the same test set; every keyword argument is
# forwarded untouched to the correction function.
accuracy_v1 = calculate_accuracy(
    autocorrect_test_set,
    get_autocorrect_suggestion_v1,
    vocab=vocab,
    word_counts=word_counts,
)
accuracy_v2 = calculate_accuracy(
    autocorrect_test_set,
    get_autocorrect_suggestion_v2,
    vocab=vocab,
)

print(
    f"Accuracy of Advanced Autocorrect (v1): {accuracy_v1:.2f}%",
    f"Accuracy of Basic Autocorrect (v2): {accuracy_v2:.2f}%",
    sep="\n",
)


Accuracy of Advanced Autocorrect (v1): 71.43%
Accuracy of Basic Autocorrect (v2): 57.14%


### Visualization of Metrics

In [None]:
# One row per algorithm so Plotly Express can colour and label each bar.
metrics_df = pd.DataFrame({
    'Algorithm': ['Advanced (Edit Dist + Freq)', 'Basic (1-Edit Dist)'],
    'Accuracy': [accuracy_v1, accuracy_v2]
})

fig_metrics = px.bar(metrics_df, x='Algorithm', y='Accuracy',
                     title='Autocorrect Algorithm Accuracy Comparison',
                     color='Algorithm',
                     color_discrete_sequence=['#1f77b4', '#ff7f0e'])
# Label each bar as e.g. "71.43%". `text_auto` only accepts a d3-format
# specifier, and '.2f%' is not a valid one; the d3 '%' type would also
# multiply by 100, but the values here are already percentages. The literal
# percent sign therefore goes outside the format placeholder.
fig_metrics.update_traces(texttemplate='%{y:.2f}%')
fig_metrics.update_layout(yaxis_title="Accuracy (%)", yaxis_range=[0,100])
fig_metrics.show()


### Interactive User Experience Simulation

In [None]:
# Expose the Python algorithms to the notebook front end so the HTML widget
# below can call them from JavaScript. This cell only works on Google Colab.
from google.colab import output
from IPython.display import JSON


def _normalize_query(raw):
    """Trim and lowercase widget input; the corpus vocabulary is all lowercase."""
    return raw.strip().lower()


def _suggestions_payload(prefix):
    """Build the {'v1': [...], 'v2': [...]} autocomplete result for a prefix."""
    prefix = _normalize_query(prefix)
    return {
        'v1': get_autocomplete_suggestions_v1(prefix, word_counts),
        'v2': get_autocomplete_suggestions_v2(prefix, vocab),
    }


def _correction_payload(word):
    """Build the {'v1': str, 'v2': str} autocorrect result for a word."""
    word = _normalize_query(word)
    return {
        'v1': get_autocorrect_suggestion_v1(word, word_counts, vocab),
        'v2': get_autocorrect_suggestion_v2(word, vocab),
    }


def get_suggestions_js(prefix):
    """Return both algorithms' autocomplete suggestions as a JSON string."""
    return json.dumps(_suggestions_payload(prefix))


def get_correction_js(word):
    """Return both algorithms' autocorrect results as a JSON string."""
    return json.dumps(_correction_payload(word))


# The callbacks return an IPython JSON display object so the browser receives
# a structured 'application/json' payload. Returning a plain string would
# force the JavaScript side to parse the Python repr of that string from
# 'text/plain', which breaks as soon as the data contains quotes, backslashes
# or non-ASCII escapes.
output.register_callback('get_suggestions_js', lambda prefix: JSON(_suggestions_payload(prefix)))
output.register_callback('get_correction_js', lambda word: JSON(_correction_payload(word)))

html_code = """
<div style="font-family: sans-serif; padding: 20px; border: 1px solid #ccc; border-radius: 10px; background-color: #f9f9f9;">
    <h2 style="text-align: center; color: #333;">Test the Algorithms</h2>

    <!-- Autocomplete Section -->
    <div style="margin-bottom: 25px;">
        <h3 style="color: #444;">Autocomplete Test</h3>
        <p style="font-size: 14px; color: #666;">Type a prefix (e.g., 'adven') and see the suggestions from both algorithms.</p>
        <input type="text" id="autocomplete-input" placeholder="Type a prefix..." style="width: 70%; padding: 8px; border-radius: 5px; border: 1px solid #ccc;">
        <button onclick="handleAutocomplete()" style="padding: 8px 12px; border-radius: 5px; border: none; background-color: #1f77b4; color: white; cursor: pointer;">Get Suggestions</button>
        <div style="margin-top: 10px;">
            <p><strong>Advanced (Frequency):</strong> <span id="autocomplete-v1-results" style="font-family: monospace; color: #1f77b4;"></span></p>
            <p><strong>Basic (Alphabetical):</strong> <span id="autocomplete-v2-results" style="font-family: monospace; color: #ff7f0e;"></span></p>
        </div>
    </div>

    <!-- Autocorrect Section -->
    <div>
        <h3 style="color: #444;">Autocorrect Test</h3>
        <p style="font-size: 14px; color: #666;">Type a misspelled word (e.g., 'wason') and see the corrections.</p>
        <input type="text" id="autocorrect-input" placeholder="Type a misspelled word..." style="width: 70%; padding: 8px; border-radius: 5px; border: 1px solid #ccc;">
        <button onclick="handleAutocorrect()" style="padding: 8px 12px; border-radius: 5px; border: none; background-color: #2ca02c; color: white; cursor: pointer;">Get Correction</button>
        <div style="margin-top: 10px;">
            <p><strong>Advanced (Edit Dist + Freq):</strong> <span id="autocorrect-v1-results" style="font-family: monospace; color: #2ca02c;"></span></p>
            <p><strong>Basic (1-Edit Dist):</strong> <span id="autocorrect-v2-results" style="font-family: monospace; color: #d62728;"></span></p>
        </div>
    </div>
</div>


<script>
  function handleAutocomplete() {
    const prefix = document.getElementById('autocomplete-input').value;
    google.colab.kernel.invokeFunction('get_suggestions_js', [prefix], {})
      .then(result => {
        // The Python callback returns a JSON display object, so the payload
        // arrives already parsed under the 'application/json' MIME type.
        const data = result.data['application/json'];
        document.getElementById('autocomplete-v1-results').innerText = 'Suggestions: ' + JSON.stringify(data.v1);
        document.getElementById('autocomplete-v2-results').innerText = 'Suggestions: ' + JSON.stringify(data.v2);
      })
      .catch(error => console.error(error));
  }

  function handleAutocorrect() {
    const word = document.getElementById('autocorrect-input').value;
    google.colab.kernel.invokeFunction('get_correction_js', [word], {})
      .then(result => {
        const data = result.data['application/json'];
        document.getElementById('autocorrect-v1-results').innerText = 'Correction: ' + data.v1;
        document.getElementById('autocorrect-v2-results').innerText = 'Correction: ' + data.v2;
      })
      .catch(error => console.error(error));
  }
</script>
"""
display(HTML(html_code))