# project 7 : Autocomplete and Autocorrect Data Analytics

# Description:Explore the efficiency and accuracy of autocomplete and autocorrect algorithms in naturallanguage processing (NLP) through this data analytics project. The objective is to enhance userexperience and text prediction by analyzing large datasets and implementing or optimizingautocomplete and autocorrect functionalities

# Key Concepts and Challenges:
    
Dataset Collection: Gather 1. diverse text data.
    
2.NLP Preprocessing: Clean and prepare data for analysis.
    
3.Autocomplete: Implement algorithms for word/phrase predictions.
    
4.Autocorrect: Optimize algorithms for spelling error correction.
    
5.Metrics: Define and measure performance metrics.
    
6.User Experience: Assess impact through feedback and surveys.
    
7.Algorithm Comparison: Evaluate different models for efficiency and accuracy.
    
8.Visualization: Use tools for data visualization

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download()

  from pandas.core import (


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
data = pd.read_csv("creditcard.csv")

In [3]:
data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

# Example usage:
text = "This is an example sentence for NLP preprocessing."
clean_text = preprocess_text(text)
print("Original text:", text)
print("Preprocessed text:", clean_text)


Original text: This is an example sentence for NLP preprocessing.
Preprocessed text: example sentence nlp preprocessing


In [5]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from nltk.util import ngrams
import random

class Autocomplete:
    def __init__(self, n):
        self.n = n  # Specify the order of the n-gram model
        self.ngrams = defaultdict(list)

    def train(self, text):
        tokens = word_tokenize(text)
        for ngram in ngrams(tokens, self.n):
            prefix = ' '.join(ngram[:-1])
            self.ngrams[prefix].append(ngram[-1])

    def predict(self, prefix, num_predictions=3):
        if prefix not in self.ngrams:
            return None
        return random.choices(self.ngrams[prefix], k=num_predictions)

# Example usage:
text = reuters.raw()  # Using the Reuters corpus for training
autocomplete = Autocomplete(n=2)  # Using a bigram model
autocomplete.train(text)

prefix = 'economic growth'  # Example prefix for prediction
predictions = autocomplete.predict(prefix)
print("Autocomplete predictions for prefix '{}':".format(prefix))
print(predictions)


Autocomplete predictions for prefix 'economic growth':
None


In [6]:
from collections import Counter
import re

class Autocorrect:
    def __init__(self, vocabulary):
        self.vocabulary = Counter(vocabulary)

    def suggest_correction(self, word):
        candidates = self.generate_candidates(word)
        return max(candidates, key=self.vocabulary.get)

    def generate_candidates(self, word):
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right) > 1]
        replaces = [left + letter + right[1:] for left, right in splits if right for letter in letters]
        inserts = [left + letter + right for left, right in splits for letter in letters]
        return set(deletes + transposes + replaces + inserts)

# Example usage:
vocabulary = ["apple", "banana", "orange", "pear", "peach"]
autocorrect = Autocorrect(vocabulary)

word = "oragne"  # Example misspelled word
correction = autocorrect.suggest_correction(word)
print("Autocorrect suggestion for '{}' is '{}'.".format(word, correction))


TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'

In [None]:
class Metrics:
    def __init__(self, true_positives, false_positives, false_negatives):
        self.true_positives = true_positives
        self.false_positives = false_positives
        self.false_negatives = false_negatives
    
    def accuracy(self):
        total_predictions = self.true_positives + self.false_positives + self.false_negatives
        return self.true_positives / total_predictions
    
    def precision(self):
        return self.true_positives / (self.true_positives + self.false_positives)
    
    def recall(self):
        return self.true_positives / (self.true_positives + self.false_negatives)
    
    def f1_score(self):
        precision = self.precision()
        recall = self.recall()
        return 2 * (precision * recall) / (precision + recall)

# Example usage:
true_positives = 80
false_positives = 10
false_negatives = 5

metrics = Metrics(true_positives, false_positives, false_negatives)
print("Accuracy:", metrics.accuracy())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F1 Score:", metrics.f1_score())


In [None]:
from survey import Survey

# Define the survey questions
survey = Survey({
    "1. How satisfied are you with the autocomplete feature?": ["Very satisfied", "Satisfied", "Neutral", "Dissatisfied", "Very dissatisfied"],
    "2. How accurate do you find the autocomplete suggestions?": ["Very accurate", "Accurate", "Neutral", "Inaccurate", "Very inaccurate"],
    "3. How easy is it to use the autocorrect feature?": ["Very easy", "Easy", "Neutral", "Difficult", "Very difficult"],
    "4. Any additional comments or suggestions?": ""
})

# Collect responses
responses = survey.ask()

# Print responses
print("Survey Responses:")
for question, answer in responses.items():
    print(question + ":", answer)


In [None]:
from nltk.util import ngrams
from collections import defaultdict
import time

class NGramAutocomplete:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(int)

    def train(self, corpus):
        for sentence in corpus:
            tokens = sentence.split()
            for ngram in ngrams(tokens, self.n):
                self.ngram_counts[ngram] += 1

    def predict(self, prefix):
        predictions = []
        for ngram, count in self.ngram_counts.items():
            if ngram[:len(prefix)] == tuple(prefix.split()):
                predictions.append((ngram[-1], count))
        predictions.sort(key=lambda x: x[1], reverse=True)
        return [prediction[0] for prediction in predictions]

class TrieAutocomplete:
    def __init__(self):
        self.trie = defaultdict(dict)

    def insert(self, word):
        node = self.trie
        for char in word:
            if char not in node:
                node[char] = {}
            node = node[char]
        node['$'] = True

    def search(self, prefix):
        node = self.trie
        for char in prefix:
            if char not in node:
                return []
            node = node[char]
        return self._dfs(node, prefix)

    def _dfs(self, node, prefix):
        if '$' in node:
            return [prefix]
        results = []
        for char, child_node in node.items():
            if char != '$':
                results.extend(self._dfs(child_node, prefix + char))
        return results

# Example usage:
corpus = ["the quick brown fox jumps over the lazy dog", "the quick brown cat jumps over the lazy dog"]
prefix = "the quick"
n = 2

# Instantiate and train NGramAutocomplete
ngram_autocomplete = NGramAutocomplete(n)
ngram_autocomplete.train(corpus)

# Instantiate TrieAutocomplete
trie_autocomplete = TrieAutocomplete()
for sentence in corpus:
    for word in sentence.split():
        trie_autocomplete.insert(word)

# Test NGramAutocomplete
start_time = time.time()
predictions_ngram = ngram_autocomplete.predict(prefix)
print("NGramAutocomplete predictions:", predictions_ngram)
print("Time taken for NGramAutocomplete:", time.time() - start_time)

# Test TrieAutocomplete
start_time = time.time()
predictions_trie = trie_autocomplete.search(prefix)
print("TrieAutocomplete predictions:", predictions_trie)
print("Time taken for TrieAutocomplete:", time.time() - start_time)


In [None]:
import matplotlib.pyplot as plt

# Define the algorithms and their performance metrics
algorithms = ['Algorithm 1', 'Algorithm 2', 'Algorithm 3']
accuracy = [0.85, 0.90, 0.88]
precision = [0.82, 0.87, 0.85]
recall = [0.88, 0.92, 0.90]
f1_score = [0.85, 0.89, 0.87]

# Create subplots for each metric
fig, axs = plt.subplots(2, 2, figsize=(10, 8))

# Plot accuracy
axs[0, 0].bar(algorithms, accuracy, color='blue')
axs[0, 0].set_title('Accuracy')
axs[0, 0].set_ylabel('Score')

# Plot precision
axs[0, 1].bar(algorithms, precision, color='green')
axs[0, 1].set_title('Precision')
axs[0, 1].set_ylabel('Score')

# Plot recall
axs[1, 0].bar(algorithms, recall, color='orange')
axs[1, 0].set_title('Recall')
axs[1, 0].set_ylabel('Score')

# Plot F1 score
axs[1, 1].bar(algorithms, f1_score, color='red')
axs[1, 1].set_title('F1 Score')
ax

