In [None]:
# managing the LLMs

"""

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/LSSP_code/project")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

! pip install --quiet apache-beam
! pip install --quiet transformers


def make_model(model_name):
    '''Instantiate a seq2seq model and its tokenizer from the same identifier.

    Args:
        model_name: value forwarded unchanged to both from_pretrained calls.

    Returns:
        A (model, tokenizer) tuple.
    '''
    # The model is loaded before the tokenizer, as in the original order.
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tok = AutoTokenizer.from_pretrained(model_name)
    return seq2seq, tok

def save_model(model, token, folder):
    '''Save a model and its tokenizer into sub-folders of folder.

    Args:
        model: object exposing save_pretrained(path); written to <folder>/model.
        token: tokenizer exposing save_pretrained(path); written to
            <folder>/tokenizer.
        folder: parent directory for both sub-folders.
    '''
    model.save_pretrained(os.path.join(folder, "model"))
    # Use the tokenizer that was passed in. The previous body referenced a
    # global named "tokenizer", silently ignoring this argument (or raising
    # NameError when no such global existed).
    token.save_pretrained(os.path.join(folder, "tokenizer"))

def load_model(folder):
    '''Load the model and tokenizer stored under folder.

    Reads from the "model" and "tokenizer" sub-folders of folder, the same
    layout that save_model writes.

    Args:
        folder: parent directory containing both sub-folders.

    Returns:
        A (model, tokenizer) tuple.
    '''
    model_dir = os.path.join(folder, "model")
    tokenizer_dir = os.path.join(folder, "tokenizer")

    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    return loaded_model, loaded_tokenizer

# Checkpoint identifier handed to make_model. Kept under its own name so that
# "model" only ever refers to the loaded model object, never to a string.
MODEL_NAME = 'facebook/nllb-200-distilled-1.3B'

# Folder (relative to the current working directory) used for the round trip.
SAVE_DIR = "data"

model, tokenizer = make_model(MODEL_NAME)

save_model(model, tokenizer, SAVE_DIR)

# Reload from SAVE_DIR so the objects used afterwards come from the saved copy.
model, tokenizer = load_model(SAVE_DIR)

"""

In [None]:
# evaluation metrics for LLMs

"""
from collections import Counter
from datetime import datetime

import numpy as np

def get_default_entry():
    '''Return the current local time as an upper-cased timestamp string.

    The layout is "%a %d/%m/%Y %H:%M:%S %Z", i.e. abbreviated weekday,
    day/month/year, time of day and time-zone name.

    Returns:
        The formatted timestamp, upper-cased, without trailing whitespace.
    '''
    # datetime.now() alone is naive, so %Z expanded to an empty string and the
    # result ended in a dangling space. astimezone() attaches the local zone.
    now = datetime.now().astimezone()
    # rstrip() keeps the result clean should the platform report no zone name.
    return now.strftime("%a %d/%m/%Y %H:%M:%S %Z").upper().rstrip()


class Evaluate:
    '''Bag-of-words text metrics with a simple line-based log.

    Each metric method lower-cases both texts, splits them on single spaces,
    computes a score rounded to self.decimal places, appends the score to the
    entry currently being built (self.temp) and returns it. Finished entries
    are kept in self.data and written out by save_data().

    Attributes:
        path: file that save_data() writes to.
        verbose: when true, the entry under construction is printed after
            every metric call.
        decimal: number of decimal places kept for each score.
        data: list of finished log entries (strings).
        temp: the entry currently being built ("" when there is none).
    '''

    # NOTE: docstrings in this class use ''' on purpose: the cell holding this
    # code is wrapped in a triple-double-quoted string.

    # Separator placed between the fields of one log entry.
    _FIELD_SEP = " >> "

    def __init__(self, save_path="./evaluation_metrics.txt", verbose=False, decimals=5):
        self.path = save_path
        self.verbose = verbose
        self.decimal = decimals

        self.data = []
        self.temp = ""

    def make_new_entry(self, data=""):
        '''Finish the entry in progress (if any) and start a new one.

        The new entry starts with the get_default_entry() timestamp followed
        by data, each terminated by the field separator.
        '''
        if self.temp != "":
            self.data.append(self.temp)

        self.temp = f"{get_default_entry()}{self._FIELD_SEP}{data}{self._FIELD_SEP}"

    @staticmethod
    def _tokenize(text):
        '''Lower-case text and split it on single spaces.'''
        return text.lower().split(" ")

    def _count_vectors(self, text1, text2):
        '''Return word-count vectors of both texts over their joint vocabulary.'''
        counts1 = Counter(self._tokenize(text1))
        counts2 = Counter(self._tokenize(text2))
        # Sorted only to make the component order deterministic; the metrics
        # themselves do not depend on it.
        vocabulary = sorted(counts1.keys() | counts2.keys())

        vector1 = np.array([counts1[word] for word in vocabulary])
        vector2 = np.array([counts2[word] for word in vocabulary])
        return vector1, vector2

    def _record(self, value):
        '''Append value to the entry in progress, then print it if verbose.'''
        # Previously successive scores were glued together ("0.51.0"); insert
        # the field separator unless the entry is empty or already ends in one.
        if self.temp != "" and not self.temp.endswith(self._FIELD_SEP):
            self.temp += self._FIELD_SEP
        self.temp += str(value)
        if self.verbose:
            print(self.temp)

    def jaccard_similarity(self, text1, text2):
        '''Return |A & B| / |A | B| for the word sets A, B of the two texts.'''
        words1 = set(self._tokenize(text1))
        words2 = set(self._tokenize(text2))

        # str.split(" ") always yields at least one token (possibly ""), so
        # the union is never empty.
        j_similarity = round(len(words1 & words2) / len(words1 | words2), self.decimal)
        self._record(j_similarity)

        return j_similarity

    def cosine_similarity(self, text1, text2):
        '''Return the cosine of the angle between the two word-count vectors.'''
        vector1, vector2 = self._count_vectors(text1, text2)

        dot_product = np.dot(vector1, vector2)
        magnitude1 = np.sqrt(np.sum(vector1 ** 2))
        magnitude2 = np.sqrt(np.sum(vector2 ** 2))

        cosine_sim = round(float(dot_product / (magnitude1 * magnitude2)), self.decimal)
        self._record(cosine_sim)

        return cosine_sim

    def eucledian_distance(self, text1, text2):
        '''Return the Euclidean distance between the two word-count vectors.'''
        vector1, vector2 = self._count_vectors(text1, text2)

        eucledian_dist = round(float(np.sqrt(np.sum((vector1 - vector2) ** 2))), self.decimal)
        self._record(eucledian_dist)

        return eucledian_dist

    # Correctly spelled alias; the original method name is kept for callers.
    euclidean_distance = eucledian_distance

    def save_data(self):
        '''Write every entry, one per line, to self.path (overwriting it).

        The entry still under construction is written as the last line. It
        used to be dropped unless make_new_entry() was called again first.
        Object state is left untouched, so repeated calls rewrite the same file.
        '''
        lines = list(self.data)
        if self.temp != "":
            lines.append(self.temp)

        # Explicit encoding so logged text is written the same on any platform.
        with open(self.path, "w", encoding="utf-8") as file:
            for line in lines:
                file.write(line + '\n')

"""

'THU 11/04/2024 19:07:20 '

In [1]:
import os

from google.colab import drive

# Attach Google Drive to the runtime's file system.
drive.mount('/content/drive')

# Work from the project folder on Drive so that relative paths in later cells
# (testing.py, dataset.txt, ...) resolve there.
os.chdir("/content/drive/MyDrive/Colab Notebooks/LSSP_code/project")

Mounted at /content/drive


In [2]:
! pip install --quiet apache-beam

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.8/144.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m65.3 MB/s[0m

In [14]:
! python testing.py

output-00000-of-00001.txt


In [12]:
! cat dataset.txt

In the vast expanse of the cosmos, among the countless stars and galaxies that stretch beyond the reaches of imagination, there exists a tapestry of wonder and mystery.
Within this cosmic quilt, celestial bodies dance to the rhythm of gravity's embrace, weaving intricate patterns across the velvet canvas of space.
Nebulas swirl with colors unseen on Earth, their ethereal beauty captivating the gaze of distant observers.
Planets, each a world unto itself, orbit their parent stars in a delicate ballet of motion and time.
Moons, silent sentinels in the night sky, watch over their planetary companions with quiet vigilance.
Comets streak through the darkness, leaving trails of stardust in their wake as they journey through the cold depths of space.
Asteroids, remnants of ancient collisions, drift silently through the void, silent witnesses to the tumultuous history of the cosmos.
In the depths of the cosmos, black holes lurk like cosmic predators, their gravitational pull devouring everythi

In [15]:
! cat output-00000-of-00001.txt

[[[[[[[[[[[[[[[[[['In the vast expanse of the cosmos, among the countless stars and galaxies that stretch beyond the reaches of imagination, there exists a tapestry of wonder and mystery.', "Within this cosmic quilt, celestial bodies dance to the rhythm of gravity's embrace, weaving intricate patterns across the velvet canvas of space."], 'Nebulas swirl with colors unseen on Earth, their ethereal beauty captivating the gaze of distant observers.'], 'Planets, each a world unto itself, orbit their parent stars in a delicate ballet of motion and time.'], 'Moons, silent sentinels in the night sky, watch over their planetary companions with quiet vigilance.'], 'Comets streak through the darkness, leaving trails of stardust in their wake as they journey through the cold depths of space.'], 'Asteroids, remnants of ancient collisions, drift silently through the void, silent witnesses to the tumultuous history of the cosmos.'], 'In the depths of the cosmos, black holes lurk like cosmic predator