<a href="https://colab.research.google.com/github/AnahitaNouri/NLP-assignment-No.2/blob/main/Assignment_No_2_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker

Collecting faker
  Downloading Faker-22.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-22.2.0


In [3]:
import random
import string
from faker import Faker
import os

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

fake = Faker()
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Generates a random sentence using the Faker library.
def generate_random_sentence():
    return fake.sentence()

# Creates a dummy text file with a specified size
def create_dummy_text_file(file_path, file_size_mb):
    print(f"Creating a dummy text file with a size of {file_size_mb} MB...")
    target_size_bytes = file_size_mb * 1024 * 1024
    current_size_bytes = 0

    with open(file_path, 'w', encoding='utf-8') as file:
        while current_size_bytes < target_size_bytes:
            random_sentence = generate_random_sentence()
            file.write(random_sentence + ' ')
            current_size_bytes += len(random_sentence.encode('utf-8'))

    print("Dummy text file creation completed.")


# Performs text preprocessing, including tokenization, stopword removal, and stemming
def preprocess_text(text):
    print("Performing text preprocessing:")

    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Tokenization
    print(" - Tokenizing...")
    words = word_tokenize(text)

    # Stopword removal and stemming
    print(" - Removing stopwords and stemming...")
    filtered_words = [porter.stem(word) for word in words if word.lower() not in stop_words]

    # Joining the processed words
    processed_text = " ".join(filtered_words)

    print("Text preprocessing completed.")

    return processed_text


# Calculates the cosine distance between two input texts
def calculate_cosine_distance(text1, text2):
    print("Calculating cosine distance:")

    # Preprocess both texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)

    # Vectorization
    print(" - Vectorizing texts...")
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()

    # Cosine similarity calculation
    print(" - Calculating cosine similarity...")
    similarity_matrix = cosine_similarity(vectors)
    cosine_distance = similarity_matrix[0][1]

    print("Cosine distance calculation completed.")

    return cosine_distance


# Generates slices of input text based on specified criteria
def generate_slices(input_text, context_window_size_mb=128, overlap_threshold=0.2):
    print("Generating slices...")

    context_window_size = context_window_size_mb * 1024 * 1024  # Convert to bytes
    input_size = len(input_text)

    print(f"Input Size: {input_size} bytes")
    if input_size <= context_window_size:
        # Below the context window size, pass it as it is
        print("Input is below context window size. Passing as it is.")
        return [input_text]

    print("Input is above context window size. Generating slices...")

    slices = []
    start = 0
    end = context_window_size

    while start < input_size:
        slice_text = input_text[start:end]

        if end < input_size:
            next_start = end - int(overlap_threshold * context_window_size)
            next_end = min(next_start + context_window_size, input_size)
            next_slice_text = input_text[next_start:next_end]

            distance = calculate_cosine_distance(slice_text, next_slice_text)

            if distance <= overlap_threshold:
                # Include the overlapping part in the next slice
                slice_text = input_text[start:next_end]
                start = next_end
            else:
                start = end
        else:
            # Last slice, include the remaining part
            start = end

        slices.append(slice_text)
        print(f"Generated Slice - Size: {len(slice_text)} bytes", end='\r')

        end = min(start + context_window_size, input_size)

    print("\nSlice generation completed.")
    return slices


# Saves slices to individual files in the specified directory
def save_slices_to_files(slices, output_directory):
    print(f"Saving slices to files in the directory: {output_directory}...")

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for i, slice_text in enumerate(slices):
        print(f"Saving Slice {i + 1} to file...")
        file_path = os.path.join(output_directory, f'slice_{i + 1}.txt')
        with open(file_path, 'w+', encoding='utf-8') as file:
            file.write(slice_text)

    print("Slice saving completed.")

#usage
file_path = '/content/drive/MyDrive/Dummy/dummy_file.txt'
output_directory = '/content/drive/MyDrive/slices_output'
file_size_mb = 130

create_dummy_text_file(file_path, file_size_mb)

with open(file_path, 'r', encoding='utf-8') as file:
    input_text = file.read()

slices = generate_slices(input_text)
print("\nNumber of slices:", len(slices))
print("\nSlices:")
for i, slice_text in enumerate(slices):
    print(f"Slice {i + 1} - Size: {len(slice_text)} bytes")

save_slices_to_files(slices, output_directory)


Creating a dummy text file with a size of 130 MB...
Dummy text file creation completed.
Generating slices...
Input Size: 140101150 bytes
Input is above context window size. Generating slices...
Calculating cosine distance:
Performing text preprocessing:
 - Tokenizing...
 - Removing stopwords and stemming...
Text preprocessing completed.
Performing text preprocessing:
 - Tokenizing...
 - Removing stopwords and stemming...
Text preprocessing completed.
 - Vectorizing texts...
 - Calculating cosine similarity...
Cosine distance calculation completed.
Generated Slice - Size: 5883422 bytes
Slice generation completed.

Number of slices: 2

Slices:
Slice 1 - Size: 134217728 bytes
Slice 2 - Size: 5883422 bytes
Saving slices to files in the directory: /content/drive/MyDrive/slices_output...
Saving Slice 1 to file...
Saving Slice 2 to file...
Slice saving completed.
