In [None]:
import re
import nltk
from nltk.metrics import edit_distance
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import pandas as pd


# Download the NLTK data packages named below. The 'words' corpus is read via
# words.words() further down in this notebook.
# NOTE(review): no visible cell uses 'punkt' or 'wordnet' directly — confirm
# they are still needed by code outside this view.
nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# ***Regular Expressions***


1. time HH:MM pattern in string


In [None]:
def find_time(text):
    """Return every 24-hour ``HH:MM`` time found in ``text``.

    Hours must be two digits in 00-23 and minutes two digits in 00-59; the
    match must be delimited by word boundaries on both sides.

    Args:
        text: String to scan.

    Returns:
        List of matched time strings, in order of appearance.
    """
    time_regex = re.compile(r'\b(?:[01]\d|2[0-3]):[0-5]\d\b')
    return [hit.group() for hit in time_regex.finditer(text)]

# demo: two valid times, then an invalid hour (24) and an invalid minute (60)
text = "this is a correct date: 04:59 and this 17:45, but not 24:00 and 23:60."
print(find_time(text))

['04:59', '17:45']


2. handling phone formats: we support the patterns shown in the test input string below


In [None]:
def find_phone_numbers(text):
    """Return all substrings of ``text`` that match one of five phone layouts.

    The layouts are combined into one alternation and tried in this order at
    each position:
      1. 11 digits starting with ``0`` and a non-zero digit, grouped 4-3-2-2
      2. a 1-3 digit prefix in parentheses, then 7 digits grouped 3-2-2
      3. 2 digits followed by 7 digits
      4. digits grouped 3-3-4
      5. ``NN-NNN-NNNN`` with mandatory hyphens
    Where a separator is optional it may be a single hyphen or whitespace
    character (layout 2 also allows a dot right after the parenthesis).

    Args:
        text: String to scan.

    Returns:
        List of matched phone-number strings, in order of appearance.
    """
    layouts = (
        r'\b0[1-9]\d{2}[-\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2}\b',
        r'\(\d{1,3}\)[-.\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2}\b',
        r'\b\d{2}[-\s]?\d{7}\b',
        r'\b\d{3}[-\s]?\d{3}[-\s]?\d{4}\b',
        r'\b\d{2}-\d{3}-\d{4}\b',
    )
    return re.findall('|'.join(layouts), text)

# demo: one sample per supported way of writing a number
text = "(04) 999-9999,03-8888888, 04-777-7777, 036666666,  0544444444, 054-3333333, (053)2222222, (054)111-11-11, (054) 9999999,  0545321456 ."
print(find_phone_numbers(text))


['(04) 999-9999', '03-8888888', '04-777-7777', '036666666', '0544444444', '054-3333333', '(053)2222222', '(054)111-11-11', '(054) 9999999', '0545321456']


3. comments pattern handling

In [None]:
def find_comments(text):
    """Return every ``/* ... */`` block comment found in ``text``.

    Matching is non-greedy, so each comment ends at the first ``*/`` after its
    opening ``/*``. Comments may span several lines. An opening ``/*`` with no
    closing ``*/`` is not reported.

    Args:
        text: Source text to scan.

    Returns:
        List of comment strings (delimiters included), in order of appearance.
    """
    pattern = r'/\*.*?\*/'
    # re.DOTALL lets '.' match newlines; without it multi-line comments are skipped.
    return re.findall(pattern, text, re.DOTALL)

# test:
text = "/* This is a first comment */ Some text comes here /* Another comment */ ...*//**/"
print(find_comments(text))

['/* This is a first comment */', '/* Another comment */', '/**/']


4. extract spelled-out numbers between 30 and 40 (e.g. "thirty", "thirty-one")




In [None]:
def find_number_descriptions(text):
    """Return the spelled-out numbers thirty .. thirty-nine found in ``text``.

    A match is ``thirty`` optionally followed by ``-one`` .. ``-nine``. It must
    end at a word boundary and must not be followed by another hyphen, so
    ``thirty-blah`` yields nothing (not even ``thirty``). Trailing punctuation
    such as ``thirty,`` or ``thirty-two.`` is accepted. Matching is
    case-sensitive.

    Args:
        text: String to scan.

    Returns:
        List of matched number words, in order of appearance.
    """
    units = r'(?:one|two|three|four|five|six|seven|eight|nine)'
    # \b(?!-) replaces the old "(?:\s|$)\b" tail, which needed whitespace and
    # then a word character after the number and so missed "thirty," / "thirty."
    pattern = r'\bthirty(?:-' + units + r')?\b(?!-)'
    return re.findall(pattern, text)

# Example usage:
text = "I have thirty apples and thirty-one oranges and thirty-blah blah and thirty-nine carrots."
print(find_number_descriptions(text))

['thirty', 'thirty-one', 'thirty-nine']


5. date formats handling

In [None]:
def find_dates(text):
    """Return every ``YYYY-MM-DD`` date in ``text`` whose day fits its month.

    Months 01/03/05/07/08/10/12 accept days 01-31, months 04/06/09/11 accept
    days 01-30, and February accepts days 01-29 in every year (leap years are
    not checked). The year is any four digits.

    Args:
        text: String to scan.

    Returns:
        List of matched date strings, in order of appearance.
    """
    day_up_to_29 = r'(?:0[1-9]|[12]\d)'
    # The 30-day branch used to be "0[1-9]|1\d|30", which rejected days 20-29.
    day_up_to_30 = r'(?:0[1-9]|[12]\d|30)'
    day_up_to_31 = r'(?:0[1-9]|[12]\d|3[01])'
    pattern = ''.join([
        r'\b\d{4}-(?:',
        r'(?:0[13578]|1[02])-', day_up_to_31, '|',
        r'(?:0[469]|11)-', day_up_to_30, '|',
        r'02-', day_up_to_29,
        r')\b',
    ])
    return re.findall(pattern, text)

# test:
text = "this date is valid: 1990-12-25 and this one 2023-06-15, 2024-02-29 and this 2016-10-31, but not 2016-04-31, 2016-11-31, and 2016-13-33."
print(find_dates(text))


['1990-12-25', '2023-06-15', '2024-02-29', '2016-10-31']


# ***Similarity between strings***



In [None]:
# Read the whole misspellings corpus into `text` and preview its first 500 characters.
# NOTE(review): the absolute /content/ path looks like a Colab runtime location —
# confirm before running elsewhere. No explicit encoding is passed to open(), so the
# platform default is used.
file_name = "/content/misspellings_and_corrections.txt"
with open(file_name, 'r') as file:
    text = file.read()
print(text[:500])

1.
NIGEL THRUSH page 48 

 I have four in my Family Dad Mum and <ERR targ=sister> siter </ERR> .
My Dad works at Melton.
My <ERR targ=sister> siter </ERR> <ERR targ=goes> go </ERR> to Tonbury.
My Mum goes out <ERR targ=sometimes> some times </ERR> .
I go to Bridgebrook i go out <ERR targ=sometimes> some times </ERR> on Tuesday night i go to Youth <ERR targ=club> clob </ERR> .
On thursday nights I go <ERR targ=bellringing> bell ringing </ERR> on Saturdays I go down to the farm.
on sundays I go to


1. edit distance value for each error-correction word pair

In [None]:
def calculate_edit_distance(incorrect, correction):
    """Return the edit distance between ``incorrect`` and ``correction``.

    Thin wrapper around the ``edit_distance`` function imported from
    ``nltk.metrics``, called with its default settings.
    """
    distance = edit_distance(incorrect, correction)
    return distance

def percentage_errors_over_distance(text):
    """Build a table of misspelling/correction pairs with their edit distance.

    Every ``<ERR targ=CORRECT> WRITTEN </ERR>`` tag in ``text`` becomes one row.

    Args:
        text: Corpus text containing ``<ERR targ=...>...</ERR>`` markup.

    Returns:
        pandas.DataFrame with columns ``Error`` (the text as written, stripped
        of surrounding whitespace), ``Correction`` (the ``targ`` value) and
        ``Distance`` (edit distance between the two). The columns are present
        even when ``text`` contains no tags.
    """
    columns = ['Error', 'Correction', 'Distance']
    # Lazy groups: group 1 is the targ attribute, group 2 the tagged text.
    # re.DOTALL allows the tagged text to span line breaks.
    tagged_pairs = re.findall(r'<ERR targ=(.*?)>(.*?)</ERR>', text, re.DOTALL)

    rows = []
    for target, written in tagged_pairs:
        # The lazy match stops at the first "</ERR>", so only whitespace needs trimming.
        misspelling = written.strip()
        rows.append({
            'Error': misspelling,
            'Correction': target,
            'Distance': calculate_edit_distance(misspelling, target),
        })

    # Explicit columns keep the schema stable when no <ERR> tag was found.
    return pd.DataFrame(rows, columns=columns)

# Parse the loaded corpus text into the error/correction table and preview its first 10 rows.
errors_df = percentage_errors_over_distance(text)
print(errors_df.head(10))



          Error   Correction  Distance
0         siter       sister         1
1         siter       sister         1
2            go         goes         2
3    some times    sometimes         1
4    some times    sometimes         1
5          clob         club         1
6  bell ringing  bellringing         1
7          wakh        watch         2
8        frount        front         1
9        sexeon       second         3


In [None]:
# results validation
print(edit_distance("sister", "siter"))
print(edit_distance("goes", "go"))
print(edit_distance("sexeon", "second"))

1
2
3


2. error statistics

In [None]:
# Size of the full error table
total_errors = len(errors_df)
print(f"Total errors: {total_errors}")

# Rows whose edit distance is exactly 1 and exactly 2
distance_column = errors_df['Distance']
errors_distance_one = errors_df[distance_column == 1]
errors_distance_two = errors_df[distance_column == 2]

# Share of each subset among all errors, as a percentage
percentage_one = len(errors_distance_one) / total_errors * 100
percentage_two = len(errors_distance_two) / total_errors * 100

# Ten most frequent misspellings inside each subset
top_10_errors_distance_one = errors_distance_one['Error'].value_counts().iloc[:10]
top_10_errors_distance_two = errors_distance_two['Error'].value_counts().iloc[:10]

print(f"Percentage of errors with edit distance 1: {percentage_one}%")
print(f"Percentage of errors with edit distance 2: {percentage_two}%")

print("\nTop 10 errors for edit distance 1 (based on error frequency):")
print(top_10_errors_distance_one)

print("\nTop 10 errors for edit distance 2 (based on error frequency):")
print(top_10_errors_distance_two)

Total errors: 2600
Percentage of errors with edit distance 1: 55.42307692307692%
Percentage of errors with edit distance 2: 29.846153846153843%

Top 10 errors for edit distance 1 (based on error frequency):
to      27
Jame    21
dont    20
two     20
here    15
the     15
go      14
is      14
of      14
its     11
Name: Error, dtype: int64

Top 10 errors for edit distance 2 (based on error frequency):
there    22
their    15
hear     10
no        9
vethn     9
look      8
they      8
farm      6
lack      6
your      6
Name: Error, dtype: int64


3. list of English words at an edit distance of 1 from the error word

In [None]:
_ENGLISH_WORD_SET = None  # lazily-built cache of the NLTK 'words' corpus


def _english_word_set():
    """Return the NLTK word list as a frozenset, building it on first use only."""
    global _ENGLISH_WORD_SET
    if _ENGLISH_WORD_SET is None:
        _ENGLISH_WORD_SET = frozenset(words.words())
    return _ENGLISH_WORD_SET


def find_similar_words(error_word):
    """Return the corpus words at edit distance exactly 1 from ``error_word``.

    ``error_word`` is lower-cased before comparison; corpus words are compared
    as they are, so capitalised entries can appear in the result.

    Args:
        error_word: The misspelled word (or phrase) to look up.

    Returns:
        Sorted list of words from the NLTK ``words`` corpus whose edit distance
        to the lower-cased ``error_word`` is 1.
    """
    error_word_lower = error_word.lower()
    target_length = len(error_word_lower)

    similar_words = [
        word
        for word in _english_word_set()
        # An edit distance of 1 is impossible when the lengths differ by more
        # than one, so skip the expensive call for those words.
        if abs(len(word) - target_length) <= 1
        and edit_distance(error_word_lower, word) == 1
    ]

    # Set iteration order is not reproducible between runs; sort for stable output.
    similar_words.sort()
    return similar_words

# test: look up the dictionary words one edit away from a misspelling taken from the corpus
error_word = "sexeon"
similar_words = find_similar_words(error_word)

print(f"Words with edit distance 1 from '{error_word}':")
print(similar_words)

Words with edit distance 1 from 'sexeon':
['sexton', 'sexern']


4. analysis of the first 50 errors

In [None]:
# generates list of similar words proposals based on find_similar_words
def generate_proposals_df(errors_df, num_errors=50):
    proposals_data = []

    for index, row in errors_df.head(num_errors).iterrows():
        error_word = row['Error']
        correct_word = row['Correction']
        proposals = find_similar_words(error_word)
        print(f"Error: '{error_word}', Proposals: '{proposals}'")
        proposals_data.append({'Error': error_word, 'Correct': correct_word, 'Proposals': proposals})

    # dataFrame from proposals_data
    proposals_df = pd.DataFrame(proposals_data)

    return proposals_df
# Build proposals for the first 50 rows of errors_df. Repeated misspellings are not
# de-duplicated: each occurrence stays a separate row (original note: "we ignore duplicates").
proposals_df = generate_proposals_df(errors_df, num_errors=50)

Error: 'siter', Proposals: '['titer', 'siver', 'sizer', 'biter', 'site', 'diter', 'sider', 'sifter', 'skiter', 'sixer', 'sitar', 'citer', 'sinter', 'siper', 'sister', 'sier', 'sitter', 'smiter', 'miter', 'niter', 'iter', 'liter']'
Error: 'siter', Proposals: '['titer', 'siver', 'sizer', 'biter', 'site', 'diter', 'sider', 'sifter', 'skiter', 'sixer', 'sitar', 'citer', 'sinter', 'siper', 'sister', 'sier', 'sitter', 'smiter', 'miter', 'niter', 'iter', 'liter']'
Error: 'go', Proposals: '['geo', 'Po', 'gor', 'goo', 'ago', 'Jo', 'ko', 'Io', 'Mo', 'gos', 'do', 'Ko', 'No', 'Ro', 'got', 'ga', 'ge', 'yo', 'io', 'mo', 'to', 'wo', 'gio', 'goa', 'gob', 'goy', 'gog', 'goi', 'zo', 'god', 'Lo', 'bo', 'g', 'ho', 'Ho', 'gon', 'lo', 'Fo', 'ego', 'gol', 'Ao', 'so', 'o', 'jo', 'no', 'po']'
Error: 'some times', Proposals: '['sometimes']'
Error: 'some times', Proposals: '['sometimes']'
Error: 'clob', Proposals: '['flob', 'lob', 'chob', 'clomb', 'club', 'clod', 'cob', 'cloy', 'clop', 'clog', 'slob', 'blob', 'c

In [None]:
# Mean length of the per-row proposal lists
proposal_counts = proposals_df['Proposals'].apply(len)
average_proposals = proposal_counts.mean()
print(f"Average number of proposals: {average_proposals}")

def calculate_percentage_correct_in_proposals(proposals_df):
    """Return the percentage of rows whose correct word is in that row's proposals.

    Args:
        proposals_df: DataFrame with a ``Correct`` column (the intended word)
            and a ``Proposals`` column (a list of candidate words per row).

    Returns:
        Percentage in the range 0-100 as a float; ``0.0`` for an empty frame.
    """
    total_cases = len(proposals_df)
    if total_cases == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Series.isin(<Series of lists>) compares each word against the list
    # objects themselves, not their contents, so membership has to be
    # checked row by row.
    correct_in_proposals = sum(
        correct in proposals
        for correct, proposals in zip(proposals_df['Correct'], proposals_df['Proposals'])
    )
    percentage = (correct_in_proposals / total_cases) * 100
    return percentage

# Run the hit-rate calculation on proposals_df and print the result with two decimals
percentage_correct = calculate_percentage_correct_in_proposals(proposals_df)
print(f"Percentage of cases where the correct word is among the proposals list: {percentage_correct:.2f}%")



Average number of proposals: 14.86
Percentage of cases where the correct word is among the proposals list: 0.00%


In [None]:
def check_cases_correct_not_in_proposals(proposals_df):
    """Print the rows whose correct word is missing from that row's proposals.

    Args:
        proposals_df: DataFrame with a ``Correct`` column (the intended word)
            and a ``Proposals`` column (a list of candidate words per row).

    Returns:
        None. The selected rows are printed.
    """
    # Membership has to be tested per row; Series.isin(<Series of lists>)
    # does not look inside the lists.
    missing_mask = pd.Series(
        [
            correct not in proposals
            for correct, proposals in zip(proposals_df['Correct'], proposals_df['Proposals'])
        ],
        index=proposals_df.index,
        dtype=bool,
    )
    incorrect_cases = proposals_df.loc[missing_mask]

    print("Cases where correct word isn't among the proposals:")
    print(incorrect_cases)

# Requires proposals_df from the generate_proposals_df cell; the call prints its findings and returns nothing.
check_cases_correct_not_in_proposals(proposals_df)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score
import numpy as np

# fetch_20newsgroups was used below without ever being imported (NameError on a fresh kernel)
from sklearn.datasets import fetch_20newsgroups

# A. Prepare the training data
train_data = fetch_20newsgroups(subset='train')
train_documents = train_data.data
train_labels = train_data.target

# B. Prepare the test data
test_data = fetch_20newsgroups(subset='test')
test_documents = test_data.data
test_labels = test_data.target

# C. Choose representation methods (TF and TF-IDF) and similarity measures (dot-product and cosine similarity)
# Each vectorizer is fitted on the training documents only and then applied
# to the test documents, so both matrices share one vocabulary.
# C.1 TF representation
tf_vectorizer = CountVectorizer()
train_tf_matrix = tf_vectorizer.fit_transform(train_documents)
test_tf_matrix = tf_vectorizer.transform(test_documents)

# C.2 TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_documents)
test_tfidf_matrix = tfidf_vectorizer.transform(test_documents)

# D. Define a function to predict using KNN
def knn_predict(train_matrix, test_matrix, k, similarity_measure, labels=None):
    """Classify each row of ``test_matrix`` by majority vote among its k nearest training rows.

    Args:
        train_matrix: 2-D matrix of training vectors (dense array or scipy
            sparse matrix), one row per training document.
        test_matrix: 2-D matrix of test vectors with the same number of columns.
        k: Number of neighbours that vote.
        similarity_measure: ``'dot_product'`` (larger dot product means nearer)
            or ``'cosine_similarity'`` (smaller cosine distance means nearer).
        labels: Non-negative integer class label per training row. When
            omitted, the notebook-level ``train_labels`` is used.

    Returns:
        List with one predicted label per test row. Vote ties go to the
        smallest label (``np.argmax`` over ``np.bincount``).

    Raises:
        ValueError: If ``similarity_measure`` is not one of the two supported names.
    """
    if similarity_measure not in ('dot_product', 'cosine_similarity'):
        raise ValueError(f"Unknown similarity_measure: {similarity_measure!r}")
    if labels is None:
        labels = train_labels  # backward-compatible fallback for existing callers
    labels = np.asarray(labels)

    predictions = []
    for test_doc in test_matrix:
        if similarity_measure == 'dot_product':
            # .dot works for both ndarray and scipy sparse inputs; np.dot does
            # not understand sparse matrices.
            scores = train_matrix.dot(test_doc.T)
            if hasattr(scores, 'toarray'):
                scores = scores.toarray()
            # Negate so that the ascending sort below puts the largest dot product first.
            ranking_key = -np.asarray(scores, dtype=float).ravel()
        else:
            # pairwise_distances already returns a distance: smaller means nearer.
            ranking_key = pairwise_distances(train_matrix, test_doc, metric='cosine').ravel()

        # Find the indices of the k nearest training examples (stable sort for reproducible ties)
        nearest_indices = np.argsort(ranking_key, kind='stable')[:k]
        # Predict the label based on the majority class among the k nearest neighbors
        predicted_label = np.argmax(np.bincount(labels[nearest_indices]))
        predictions.append(predicted_label)

    return predictions

# E. Score 1-NN for every (representation, similarity measure) pairing
methods = [
    ('TF', train_tf_matrix, test_tf_matrix),
    ('TF-IDF', train_tfidf_matrix, test_tfidf_matrix),
]
similarity_measures = ['dot_product', 'cosine_similarity']

# Flatten the two nested loops into one ordered list of runs
experiment_grid = [
    (representation, measure)
    for representation in methods
    for measure in similarity_measures
]

for (method_name, train_matrix, test_matrix), similarity_measure in experiment_grid:
    k = 1  # single nearest neighbour
    predictions = knn_predict(train_matrix, test_matrix, k, similarity_measure)

    # Fraction of test documents whose predicted label equals the true label
    accuracy = accuracy_score(test_labels, predictions)
    print(f'{method_name} representation with {similarity_measure}: Accuracy = {accuracy:.4f}')