In [19]:
import numpy as np
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

text = "The quick brown foxes jumped over the lazy dogs, running and running faster than they ever had before!"


corpus1 = [
    "Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data.",
    "Machine learning is an application of artificial intelligence that provides systems the ability to automatically learn and improve from experience.",
    "Deep learning is a subset of machine learning that uses neural networks to analyze various factors of data.",
    "Artificial intelligence is the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.",
    "Natural language processing is a branch of artificial intelligence that helps computers understand, interpret, and generate human language.",
    "Big data analytics involves examining large data sets to uncover hidden patterns, correlations, and other insights.",
    "Data mining is the process of discovering patterns in large data sets using methods that blend machine learning, statistics, and database systems.",
    "Computer vision is an interdisciplinary field that trains computers to interpret and understand the visual world.",
    "Predictive modeling is a process that uses statistical techniques and machine learning algorithms to predict future outcomes based on historical data.",
    "Robotics is a branch of technology that involves the design, construction, and operation of robots, which are automated machines."
]


In [2]:
#Phase 1 Text Preprocessing
def tokenize_str(input_str):
    input_str = input_str.strip()
    #print(input_str)
    tokens_array = []
    token = ""
    for char in input_str :
        #print(char)
        if char.isspace() : 
            #print("appending new token to tokens array")
            if token:                
                tokens_array.append(token)
                token = ""
        else :
            #print("appending char to token")
            token += char
            #print(token)
    if token :
        tokens_array.append(token)
    return tokens_array

#print(tokenize_str(text))

In [3]:
def remove_special_chars(input_str):
    cleaned_str = []
    for char in input_str :
        if char.isalnum() or char.isspace():
            cleaned_str.append(char)
    return "".join(cleaned_str)

In [4]:
def remove_stopwords(input_tokens):
    stop_words = set(stopwords.words('english')).union(set(stopwords.words('french')))
    #print(stop_words)
    return [token for token in input_tokens if token not in stop_words]
tokens = tokenize_str(remove_special_chars(text))
#print(tokens)
processed_tokens = remove_stopwords(tokens)
print(processed_tokens)

['The', 'quick', 'brown', 'foxes', 'jumped', 'lazy', 'dogs', 'running', 'running', 'faster', 'ever']


In [5]:
def stem_tokens(input_tokens):
    ps = PorterStemmer()
    return [ps.stem(token) for token in input_tokens]

In [6]:
def preprocess(input_str):
    input_str=input_str.strip().lower()
    return stem_tokens(remove_stopwords(tokenize_str(remove_special_chars(input_str))))
processed_tokens = preprocess(text)
print("Input text :\n" ,text)
print("Output tokens :\n", processed_tokens)

Input text :
 The quick brown foxes jumped over the lazy dogs, running and running faster than they ever had before!
Output tokens :
 ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'run', 'run', 'faster', 'ever']


In [7]:
# Phase 2 vectorization
def compute_tf(tokens):
    tf = {}
    for i in tokens:
        count = 0
        for j in tokens:
            if i == j:
                count += 1
        # Check if token is not already in the dictionary
        if i not in tf:
            tf[i] = count / len(tokens)
    return tf

In [8]:
def compute_df(token,corpus):
    count = 0
    for doc in corpus :
        if token in doc :
            count+=1
    return count

In [9]:
def compute_idf(token, corpus):
    return math.log(len(corpus) / (1 + compute_df(token, corpus)))

In [10]:
def tf_idf(tokens, corpus):
    tf_idf_scores = {}
    unique_tokens = set(tokens)
    term_frequencies = compute_tf(tokens)
    for token in unique_tokens:
        tf_idf_scores[token] = term_frequencies[token] * compute_idf(token, corpus)
    return dict(sorted(tf_idf_scores.items()))


In [11]:
#Phase3, Compute similarity (cosine methode)

def cosine_similarity(vector_a,vector_b):
    set_keys = sorted(set(vector_a.keys()).union(vector_b.keys()))
    #print(len(set_keys))
    #print(set_keys)
    vals_a = [vector_a.get(key,0) for key in set_keys]
    vals_b = [vector_b.get(key, 0) for key in set_keys] #aacount for unified keys(volcabularies)

    dot_product = sum(a*b for a,b in zip(vals_a,vals_b))
    magn_a = math.sqrt(sum(a*a for a in vals_a))
    magn_b = math.sqrt(sum(b*b for b in vals_b))

    if magn_a == 0 or magn_b == 0:
        return 0.0

    return dot_product / (magn_a * magn_b)
    
#type(vectorized_doc0)



In [21]:
#Testing 
processed_docs = [preprocess(doc) for doc in corpus1]
vectorized_doc0 = tf_idf(processed_docs[0], processed_docs)
vectorized_doc1 = tf_idf(processed_docs[1], processed_docs)
print(cosine_similarity(vectorized_doc0,vectorized_doc1))

0.04139854942538008


In [None]:
def matrix_similarity(corpus_a,corpus_b):
    unified_corpus = corpus_a.union(corpus_b)
    output_matrix = []
    