In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this notebook needs: 'punkt' (tokenizer data;
# presumably what word_tokenize relies on -- confirm for the installed NLTK
# version) and the 'stopwords' corpus read via stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

# Example text input

[nltk_data] Downloading package punkt to /home/melak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/melak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:

# Demo input: one paragraph about artificial intelligence, repeated five times so
# the text is long enough to be cut into several slices. The copies are joined
# with a single space; plain string repetition would glue the final sentence of
# one copy to the first word of the next ("technology.In"), corrupting the
# sentence and word boundaries at every seam.
long_text = " ".join([
    ("In computer science, artificial intelligence (AI), sometimes called machine intelligence, "
     "is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans "
     "and animals. Leading AI textbooks define the field as the study of 'intelligent agents': any device that "
     "perceives its environment and takes actions that maximize its chance of successfully achieving its goals. "
     "Colloquially, the term 'artificial intelligence' is often used to describe machines (or computers) that "
     "mimic 'cognitive' functions that humans associate with the human mind, such as 'learning' and 'problem solving'. "
     "As machines become increasingly capable, tasks considered to require 'intelligence' are often removed from the "
     "definition of AI, a phenomenon known as the AI effect. A quip in Tesler's Theorem says 'AI is whatever hasn't "
     "been done yet.' For instance, optical character recognition is frequently excluded from things considered to be AI, "
     "having become a routine technology.")
] * 5)

#Preprocess 
def preprocess_text(text):
    """Tokenize ``text``, drop English stop words, and stem what remains.

    Args:
        text: Raw input string.

    Returns:
        A single string holding the Porter-stemmed, non-stop-word tokens
        separated by single spaces, in their original order.
    """
    # Build the stop-word set once per call: evaluating stopwords.words('english')
    # inside the filter rebuilds the list and scans it linearly for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # NLTK's English stop list is lowercase, so compare on the lowercased token;
    # otherwise capitalised stop words such as "In", "As" or "A" slip through.
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

def calculate_cosine_similarity(slice1, slice2):
    """Return the cosine similarity between the term counts of two texts.

    A CountVectorizer vocabulary is learned from just the two inputs, both are
    converted to term-count vectors, and the entry of the pairwise similarity
    matrix that compares the first text with the second is returned.
    """
    documents = [slice1, slice2]
    count_matrix = CountVectorizer().fit(documents).transform(documents)
    pairwise = cosine_similarity(count_matrix)
    # Row 0, column 1: first document compared against the second.
    return pairwise[0][1]

def slice_input(input_text, slice_size=500, overlap=100, verbose=True):
    """Cut ``input_text`` into fixed-size character windows that overlap.

    Each window starts ``slice_size - overlap`` characters after the previous
    one, so consecutive slices share ``overlap`` characters. Slicing stops as
    soon as a window reaches the end of the text, so no trailing slice made up
    solely of already-covered overlap characters is produced.

    Args:
        input_text: The string to slice.
        slice_size: Maximum number of characters per slice; must be positive.
        overlap: Number of characters shared by consecutive slices; must be
            smaller than ``slice_size`` so that every window moves forward.
        verbose: When True (the default), print each slice as it is created.

    Returns:
        The slices in order of appearance; an empty list for empty input.

    Raises:
        ValueError: If ``slice_size`` is not positive, or if ``overlap`` is not
            smaller than ``slice_size`` (the window would never advance and the
            loop would not terminate).
    """
    if slice_size <= 0:
        raise ValueError(f"slice_size must be positive, got {slice_size}")
    if overlap >= slice_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than slice_size ({slice_size})"
        )

    step = slice_size - overlap
    text_length = len(input_text)
    slices = []
    start = 0

    while start < text_length:
        end = start + slice_size
        current_slice = input_text[start:end]
        slices.append(current_slice)
        if verbose:
            print(f"Slice {len(slices)}:\n{current_slice}\n")

        # This window already covers the tail of the text; another iteration
        # would only repeat characters that are part of this slice.
        if end >= text_length:
            break
        start += step

    return slices

# Slice the demo text using slice_input's default window size and overlap;
# slice_input prints every slice as it is produced.
sliced_demo_text = slice_input(input_text=long_text)
# Quick inspection (uncomment to use):
# len(sliced_demo_text), sliced_demo_text[:3]



Slice 1:
In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of 'intelligent agents': any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term 'artificial intelligence' is often used to describe machines (or computers

Slice 2:
s. Colloquially, the term 'artificial intelligence' is often used to describe machines (or computers) that mimic 'cognitive' functions that humans associate with the human mind, such as 'learning' and 'problem solving'. As machines become increasingly capable, tasks considered to require 'intelligence' are often removed from the definition of AI, a phenomenon known as the AI effect. A quip in Tesler's Theorem says 'AI is whatever hasn't been done yet.' For instance, optical c