In [None]:
! pip install langchain

### Imports and Downloads

In [None]:
import re
import os
import math
import nltk
import torch
import spacy
import getpass
import warnings
import transformers
from nltk.corpus import stopwords
from langchain import PromptTemplate
from nltk.tokenize import word_tokenize
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline

warnings.filterwarnings("ignore", category=DeprecationWarning)
transformers.logging.set_verbosity_warning()

nltk.download('punkt')
nltk.download('stopwords')

Lemmatizer = spacy.load("en_core_web_sm")

### Setting Up and Using the Hugging Face API for Text Generation

In [3]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Place your token here: ")
Template = """given the Topic below, generate text about this topic in details.

Current conversation:
{history}

Topic:
{input}

Response: """

PROMPT = PromptTemplate(
                        template=Template,
                        input_variables=["history", "input"])

Place your token here:  ·····································


### Loading and Configuring the Vicuna Model for Text Generation

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_id = "/kaggle/input/vicuna/pytorch/7b-v1pt5/1"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, temperature=1.0, torch_dtype = torch.float16, device_map = "auto", offload_folder = "./cache")
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512,streamer=streamer, return_full_text=False)
    llm = HuggingFacePipeline(pipeline=pipe)

### Creating a Conversation Chain for Text Generation with Memory

In [5]:
generation = ConversationChain(
    llm=llm,
    prompt=PROMPT,
    verbose=False, 
    memory=ConversationBufferMemory(ai_prefix="Response")
)

### Generating Three Different Topic Documents

In [6]:
doc1 = generation("cars")['response']
doc2 = generation("shopping")['response']
doc3 = generation("anime")['response']
Docs = [doc1, doc2, doc3]









Shopping is a fun and exciting experience for many people. It allows us to explore new products and brands, and to find items that we need or want for our lives. Whether we are shopping in person or online, there are endless options available to us. From clothing and accessories to electronics and home goods, the possibilities are endless. Some people enjoy shopping as a leisure activity, while others use it as a practical way to get the things they need. No matter what our reasons for shopping are, it is always a great way to treat ourselves or to find unique gifts for others.</s>





Anime is a genre of Japanese animation that has gained popularity around the world. It is known for its unique style, which often features exaggerated facial expressions, colorful graphics, and fantastical themes. Many anime series are based on manga, or Japanese comics, and often explore themes of friendship, love, and personal growth. Some of the most popular anime series include "Naruto," "Dragon Ball Z," and "One Piece." In addition to television shows and movies, anime has also become popular in other forms of media, such as video games and merchandise. As someone who loves anime, I am always excited to see what new series or movies will be released in the future.</s>


### Defining Preprocessing Functions

In [74]:
def Clean_Text(Text):
    Cleaned_Text = re.sub(r'[^\w\s]', '', Text)
    Cleaned_Text = re.sub(r'\d+', '', Cleaned_Text)
    Cleaned_Text = Cleaned_Text.lower()
    return Cleaned_Text

def Tokenize_Text(Text):
    tokens = word_tokenize(Text)
    return tokens

def Lemmatize_Text(tokens):
    Lemmatized_Tokens = []
    for token in tokens:
        if Lemmatizer(token)[0].lemma_ != '-PRON-':
            Lemmatized_Token = Lemmatizer(token)[0].lemma_
        else:
            Lemmatized_Token = token
        Lemmatized_Tokens.append(Lemmatized_Token)
    return Lemmatized_Tokens

def Remove_Stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    custom_stop_words = ['I', 'z']
    stop_words.update(custom_stop_words)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

def Get_Unique_Words(text):
    Unique_Words = set(text)
    Sorted_Words = sorted(Unique_Words)
    return ' '.join(Sorted_Words)

### Filtering the Generated Documents

In [75]:
Filtered_Docs = []
for i in range (len(Docs)):
    Cleaned_Text = Clean_Text(Docs[i])
    Tokens = Tokenize_Text(Cleaned_Text)
    Lemmatized_Tokens = Lemmatize_Text(Tokens)
    Filtered_Tokens = Remove_Stopwords(Lemmatized_Tokens)
    Unique_Words = Get_Unique_Words(Filtered_Tokens)
    Unique_Words = Unique_Words.split('\n')
    Filtered_Unique = [word for word in Unique_Words if len(word) > 2] # Remove words with less than 2 characters
    Filtered_Text = ' '.join(Filtered_Unique)
    Filtered_Docs.append(Filtered_Text)

### Calculating the TF-IDF for Each Document Using Built-in TfidfVectorizer Function

In [48]:
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(Filtered_Docs)

tfidf_matrix_dense = tfidf_matrix.toarray()

feature_names = vectorizer.get_feature_names_out()

for i, doc in enumerate(Filtered_Docs):
    print(f"TF-IDF for Document {i+1}:")
    for j, feature in enumerate(feature_names):
        tfidf_value = tfidf_matrix_dense[i, j]
        if tfidf_value > 0:
            print(f"{feature}: {tfidf_value:.4f}")
    print()

TF-IDF for Document 1:
add: 0.1483
advance: 0.1483
also: 0.1128
always: 0.0876
automatic: 0.1483
brake: 0.1483
car: 0.1483
collect: 0.1483
come: 0.1483
departure: 0.1483
emergency: 0.1483
environmentally: 0.1483
ever: 0.1483
evolve: 0.1483
example: 0.1483
excite: 0.1128
feature: 0.1128
freedom: 0.1483
friendly: 0.1483
fuelefficient: 0.1483
future: 0.1128
hobby: 0.1483
independence: 0.1483
innovation: 0.1483
introduce: 0.1483
lane: 0.1483
like: 0.1483
love: 0.1128
new: 0.0876
offroad: 0.1483
people: 0.1128
practical: 0.1128
purpose: 0.1483
rugged: 0.1483
safety: 0.1483
see: 0.1128
shape: 0.1483
size: 0.1483
sleek: 0.1483
someone: 0.1128
sport: 0.1483
symbol: 0.1483
technology: 0.1483
time: 0.1483
today: 0.1483
transportation: 0.1483
use: 0.1128
vehicle: 0.1483
warn: 0.1483
work: 0.1483
year: 0.1483

TF-IDF for Document 2:
accessory: 0.1598
activity: 0.1598
allow: 0.1598
always: 0.0944
available: 0.1598
brand: 0.1598
clothing: 0.1598
electronic: 0.1598
endless: 0.1598
enjoy: 0.1598
excit

### Implementing TF-IDF from Scratch

In [None]:
def calculate_tf(documents):
    tf_dicts = []
    for doc in documents:
        tf_dict = {}
        for word in doc.split():
            tf_dict[word] = tf_dict.get(word, 0) + 1
        tf_dicts.append(tf_dict)
    return tf_dicts

def calculate_idf(documents):
    num_documents = len(documents)
    idfs = {}
    for doc in documents:
        for word in doc.split():
            idfs[word] = idfs.get(word, 0) + 1

    for word, df in idfs.items():
        idfs[word] = math.log(1 + num_documents / (1 + df)) + 1 
    return idfs

def calculate_tfidf(tf_dicts, idfs):
    tfidf_matrix = []
    for tf_dict in tf_dicts:
        tfidf_doc = {}
        for word, tf in tf_dict.items():
            tfidf_doc[word] = tf * idfs[word]
        tfidf_matrix.append(tfidf_doc)
    return tfidf_matrix

def normalize_tfidf(tfidf_matrix):
    for doc_tfidf in tfidf_matrix:
        squared_sum = sum(value**2 for value in doc_tfidf.values())
        doc_norm = math.sqrt(squared_sum)
        for word, tfidf in doc_tfidf.items():
            doc_tfidf[word] = tfidf / doc_norm
    return tfidf_matrix

### Calculating TF-IDF for Each Document Using the Previous Handcrafted Functions

In [76]:
tf_values = calculate_tf(Filtered_Docs)
idf_values = calculate_idf(Filtered_Docs)
tfidf_matrix = calculate_tfidf(tf_values, idf_values)
normalzied_tfidf_matrix = normalize_tfidf(tfidf_matrix)

# Get feature names
feature_names = []
for doc in Filtered_Docs:
    feature_names.extend(doc.split())
feature_names = list(set(feature_names)) 
feature_names.sort()

# Print TF-IDF results 
for i, doc in enumerate(Filtered_Docs):
    print(f"TF-IDF for Document {i+1}:")
    for j, feature in enumerate(feature_names):
        tfidf_value = tfidf_matrix[i].get(feature, 0)
        if tfidf_value > 0:
            print(f"{feature}: {tfidf_value:.4f}")
    print()

TF-IDF for Document 1:
add: 0.1441
advance: 0.1441
also: 0.1274
always: 0.1173
automatic: 0.1441
brake: 0.1441
car: 0.1441
collect: 0.1441
come: 0.1441
departure: 0.1441
emergency: 0.1441
environmentally: 0.1441
ever: 0.1441
evolve: 0.1441
example: 0.1441
excite: 0.1274
feature: 0.1274
freedom: 0.1441
friendly: 0.1441
fuelefficient: 0.1441
future: 0.1274
hobby: 0.1441
independence: 0.1441
innovation: 0.1441
introduce: 0.1441
lane: 0.1441
like: 0.1441
love: 0.1274
new: 0.1173
offroad: 0.1441
people: 0.1274
practical: 0.1274
purpose: 0.1441
rugged: 0.1441
safety: 0.1441
see: 0.1274
shape: 0.1441
size: 0.1441
sleek: 0.1441
someone: 0.1274
sport: 0.1441
symbol: 0.1441
technology: 0.1441
time: 0.1441
today: 0.1441
transportation: 0.1441
use: 0.1274
vehicle: 0.1441
warn: 0.1441
work: 0.1441
year: 0.1441

TF-IDF for Document 2:
accessory: 0.1562
activity: 0.1562
allow: 0.1562
always: 0.1271
available: 0.1562
brand: 0.1562
clothing: 0.1562
electronic: 0.1562
endless: 0.1562
enjoy: 0.1562
excit