In [7]:
import requests
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [1]:
import concurrent

## Crawl data from Goodreads

In [7]:
# Load the Goodreads book catalogue. Rows pandas cannot parse are skipped
# (on_bad_lines='skip') instead of aborting the read.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere as-is.
goodread_df = pd.read_csv(
    r'D:\Online_Learning\Practical_DL\final_project\books.csv', on_bad_lines='skip')

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_data_for_isbn(isbn):
    """Fetch the review table for a single ISBN.

    Single-argument adapter around ``get_reviews_from_isbn``; the base URL is
    taken from the module-level name ``url``, which must exist when this runs.

    Args:
        isbn: ISBN forwarded to ``get_reviews_from_isbn``.

    Returns:
        Whatever ``get_reviews_from_isbn(url, isbn)`` returns.
    """
    reviews_for_isbn = get_reviews_from_isbn(url, isbn)
    return reviews_for_isbn


def get_reviews_from_isbn(url, isbn):
    """Download the page for one ISBN and return its reviews as a DataFrame.

    The request is attempted up to three times. A network error, HTTP 429 or a
    5xx status triggers a retry with exponential backoff; any other non-200
    status is treated as permanent and ends the attempts immediately.

    Args:
        url: Base URL; the ISBN is appended to it verbatim.
        isbn: ISBN of the book whose page is fetched.

    Returns:
        A DataFrame with one row per ``article.ReviewCard`` element (row
        content comes from ``parse_review``), or an empty DataFrame when no
        attempt produced an HTTP 200 response.
    """
    full_url = url+f"{isbn}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    max_attempts = 3
    soup = None          # stays None unless an attempt succeeds
    status_code = None   # last HTTP status seen; None if every attempt raised

    for attempt in range(max_attempts):
        try:
            response = requests.get(
                full_url, headers=headers, allow_redirects=True, timeout=10)
        except requests.RequestException as e:
            print(f"Error fetching data for ISBN {isbn}: {e}")
        else:
            status_code = response.status_code
            if status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                break
            if status_code != 429 and status_code < 500:
                # Permanent failure (e.g. 404 for an unknown ISBN): retrying cannot help.
                break
        if attempt < max_attempts - 1:
            time.sleep(2 ** attempt)  # Exponential backoff between attempts

    if soup is None:
        print(
            f"Failed to fetch data after {attempt + 1} attempts. Status Code: {status_code}")
        return pd.DataFrame()  # Return empty DataFrame if unsuccessful

    reviews = soup.find_all('article', class_='ReviewCard')
    return pd.DataFrame([parse_review(review, isbn) for review in reviews])


def parse_review(review, isbn):
    """Extract reviewer, rating and comment from one review element.

    Args:
        review: Parsed review element exposing ``find``.
        isbn: ISBN copied into the resulting record.

    Returns:
        A dict with keys ``isbn``, ``reviewer``, ``rating`` and ``comment``.
        ``rating`` is the second whitespace-separated token of the rating
        span's ``aria-label``. If anything goes wrong while extracting, the
        error is printed and the three extracted fields are empty strings.
    """
    record = {'isbn': isbn, 'reviewer': '', 'rating': '', 'comment': ''}

    try:
        name_node = review.find('div', {'data-testid': 'name'})
        reviewer_name = name_node.get_text(strip=True)

        shelf_node = review.find('div', class_='ShelfStatus')
        star_label = shelf_node.find('span', {'role': 'img'}).get('aria-label', '')
        rating = star_label.split()[1]

        body_node = review.find('div', {'data-testid': 'contentContainer'})
        if body_node:
            comment = body_node.get_text(strip=True)
        else:
            comment = "No comment provided"
    except Exception as e:
        print(f"Error parsing review for ISBN {isbn}: {e}")
        return record

    record.update(reviewer=reviewer_name, rating=rating, comment=comment)
    return record


def fetch_reviews_parallel(isbn_list):
    """Fetch reviews for many ISBNs on a pool of five worker threads.

    Args:
        isbn_list: Iterable of ISBNs; one ``fetch_data_for_isbn`` task is
            submitted per element.

    Returns:
        A list (not a DataFrame) of the per-ISBN results, in completion order.
        Tasks that raise are reported via ``print`` and left out.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = {}
        for code in isbn_list:
            pending[pool.submit(fetch_data_for_isbn, code)] = code

        for finished in as_completed(pending):
            isbn = pending[finished]
            try:
                frame = finished.result()
                collected.append(frame)
                print(f"Data fetched for ISBN {isbn} with shape {frame.shape}")
                # Pauses only this result-collecting loop; tasks are already submitted.
                time.sleep(1)
            except Exception as exc:
                print(f"{isbn} generated an exception: {exc}")
    return collected

In [None]:
# Base URL each ISBN is appended to; fetch_data_for_isbn reads this module-level name.
url = f"https://www.goodreads.com/book/isbn/"
# One fetch per distinct value of the 'isbn' column.
isbn_list = goodread_df['isbn'].unique()

# Result is a list of per-ISBN DataFrames, not a single frame.
reviews_dataframes = fetch_reviews_parallel(isbn_list)

In [35]:
# Stack the per-ISBN frames, drop the placeholder rows that parse_review emits
# on a parse failure (reviewer == ""), and persist the result.
test = pd.concat(reviews_dataframes)
test = test[test['reviewer'] != ""]
test.to_csv('reviews_goodread.csv', index=False)

In [36]:
# Reload the saved reviews and show the frame's (rows, columns) shape.
# NOTE(review): this reads an absolute path while the previous cell wrote to a
# relative one - presumably the same file; confirm the working directory.
test = pd.read_csv(
    r'D:\Online_Learning\Practical_DL\final_project\reviews_goodread.csv')
test.shape

(273442, 4)

In [37]:
# Preview the first rows of the reloaded reviews.
test.head()

Unnamed: 0,isbn,reviewer,rating,comment
0,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:An angsty apprehen...
1,439358078,Navessa,5,"Seriously, don't read this review if you haven..."
2,439358078,Diane ϟ [ Lestrange ],5,Interview with JK Rowling...Stephen Fry:Can we...
3,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:It's a transitiona...
4,439358078,Hannah Azerang,5,I had to re read it. I was in such a nostalgic...


## Crawl news from Saigon Times, CNN, and BBC

In [6]:
# Authenticate this notebook session with the Hugging Face Hub (renders a login widget).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.robotparser import RobotFileParser
import random
import numpy as np

# Pool of browser User-Agent strings. The main block below picks ONE at random
# per run and reuses it for every request; it is not rotated per request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0',
]

def check_robots_txt(url):
    """Download and parse the robots.txt that lives under ``url``.

    Args:
        url: Site root without a trailing slash; ``"/robots.txt"`` is appended.

    Returns:
        A ``RobotFileParser`` that has already read the file, ready for
        ``can_fetch`` queries.
    """
    robots_parser = RobotFileParser(url + "/robots.txt")
    robots_parser.read()
    return robots_parser

def get_links_from_sgtimes(category_sgt, headers, num_limit, rp):
    """Collect article links from a Saigon Times category page.

    Args:
        category_sgt: Category path appended to ``https://thesaigontimes.vn``.
        headers: Request headers; must contain ``'User-Agent'``.
        num_limit: Maximum number of headline elements to inspect.
        rp: robots.txt parser exposing ``can_fetch(user_agent, url)``.

    Returns:
        Unique ``href`` values in page order. Empty when robots.txt disallows
        the page or the response status is not 200.
    """
    full_url = "https://thesaigontimes.vn" + category_sgt

    if not rp.can_fetch(headers['User-Agent'], full_url):
        return []

    response = requests.get(full_url, headers=headers, timeout=10)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    headlines = soup.find_all(
        'h3', class_='entry-title td-module-title', limit=num_limit)
    href_link = []
    for headline in headlines:
        # Headlines without an <a>, or anchors without href, are skipped
        # explicitly rather than through a bare ``except``.
        anchor = headline.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            href_link.append(href)

    # dict.fromkeys de-duplicates while keeping page order (a set would not).
    return list(dict.fromkeys(href_link))

def get_links_from_cnn(category, headers, num_limit, rp):
    """Collect up to ``num_limit`` article links from a CNN category page.

    The page is fetched exactly once. Re-requesting the same URL in a loop
    until enough links appear would never terminate when the page holds fewer
    than ``num_limit`` distinct links.

    Args:
        category: Category path appended to ``https://edition.cnn.com``.
        headers: Request headers; must contain ``'User-Agent'``.
        num_limit: Maximum number of links to return.
        rp: robots.txt parser exposing ``can_fetch(user_agent, url)``.

    Returns:
        Up to ``num_limit`` unique absolute URLs in page order. Empty when
        robots.txt disallows the page, ``num_limit`` is not positive, or the
        response status is not 200.
    """
    from urllib.parse import urljoin

    base_url = "https://edition.cnn.com"
    full_url = base_url + category

    if not rp.can_fetch(headers['User-Agent'], full_url):
        return []
    if num_limit <= 0:
        return []

    response = requests.get(full_url, headers=headers, timeout=10)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()
    for item in soup.find_all('a', class_='container__link'):
        href = item.get('href')
        if not href:
            continue  # anchor without a target
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        link = urljoin(base_url, href)
        if link in seen:
            continue
        seen.add(link)
        links.append(link)
        if len(links) >= num_limit:
            break

    return links

def get_links_from_bbc(category, headers, num_limit, rp):
    """Collect article links from a BBC category page.

    Args:
        category: Category path appended to ``https://www.bbc.com``.
        headers: Request headers; must contain ``'User-Agent'``.
        num_limit: Maximum number of anchor elements to inspect.
        rp: robots.txt parser exposing ``can_fetch(user_agent, url)``.

    Returns:
        Unique absolute URLs in page order. Empty when robots.txt disallows
        the page or the response status is not 200.
    """
    from urllib.parse import urljoin

    base_url = "https://www.bbc.com"
    full_url = base_url + category

    if not rp.can_fetch(headers['User-Agent'], full_url):
        return []

    response = requests.get(full_url, headers=headers, timeout=10)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    anchors = soup.find_all('a', {'data-testid': 'internal-link'}, limit=num_limit)
    href_link = []
    for anchor in anchors:
        href = anchor.get('href')
        if href:
            # urljoin leaves absolute hrefs untouched and resolves relative ones.
            href_link.append(urljoin(base_url, href))

    # dict.fromkeys de-duplicates while keeping page order (a set would not).
    return list(dict.fromkeys(href_link))

def get_text_from_link(url, headers, failed_links):
    """Return the text of all ``<p>`` elements of a page joined by spaces.

    Args:
        url: Page to download.
        headers: Request headers passed to ``requests.get``.
        failed_links: List that receives ``url`` when the page could not be
            read (non-200 status or any exception).

    Returns:
        The joined paragraph text, or ``np.nan`` on failure.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        if page.status_code == 200:
            parsed = BeautifulSoup(page.text, 'html.parser')
            pieces = [node.get_text(strip=True) for node in parsed.find_all('p')]
            return ' '.join(pieces)
    except Exception:
        # Any error is handled exactly like a non-200 response below.
        pass

    failed_links.append(url)
    return np.nan

def get_texts_from_links_parallel(links, headers):
    """Download the text of many pages on a pool of ten worker threads.

    Args:
        links: URLs to fetch; one ``get_text_from_link`` task per element.
        headers: Request headers forwarded to every task.

    Returns:
        ``(texts, failed_links)``. ``texts`` is aligned with ``links``
        (``np.nan`` where a page failed); ``failed_links`` lists the URLs that
        could not be read.
    """
    failed_links = []
    text_by_url = {}

    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {}
        for link in links:
            task = pool.submit(get_text_from_link, link, headers, failed_links)
            pending[task] = link

        for finished in as_completed(pending):
            link = pending[finished]
            try:
                text_by_url[link] = finished.result()
                # Random pause between collecting results.
                time.sleep(random.uniform(1, 3))
            except Exception:
                failed_links.append(link)
                text_by_url[link] = np.nan

    ordered_texts = [text_by_url[link] for link in links]
    return ordered_texts, failed_links

def run_parallel_processing(links, headers):
    """Fetch page texts for ``links`` and package them as a DataFrame.

    Args:
        links: URLs to download.
        headers: Request headers forwarded to the downloader.

    Returns:
        ``(df, failed_links)`` where ``df`` has columns ``url`` and ``txt``
        aligned with ``links``.
    """
    page_texts, failed_links = get_texts_from_links_parallel(links, headers)
    table = pd.DataFrame({'url': links, 'txt': page_texts})
    return table, failed_links

# Script-style entry point: scrape a few articles from each site and save them.
if __name__ == "__main__":
    category_cnn = '/business'  # Example category for CNN
    category_sgt = '/tai-chinh-ngan-hang'  # Example category for Saigon Times
    category_bbc = '/innovation'  # Example category for BBC

    # A single User-Agent is picked here and reused for every request in this run.
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    # Per-site limit passed to each link collector.
    num_limit = 10

    # Check robots.txt
    cnn_rp = check_robots_txt("https://edition.cnn.com")
    sgt_rp = check_robots_txt("https://thesaigontimes.vn")
    bbc_rp = check_robots_txt("https://www.bbc.com")

    # Collect article links per site; each collector consults its robots.txt parser.
    links_cnn = get_links_from_cnn(category_cnn, headers, num_limit, cnn_rp)
    links_sgt = get_links_from_sgtimes(category_sgt, headers, num_limit, sgt_rp)
    links_bbc = get_links_from_bbc(category_bbc, headers, num_limit, bbc_rp)

    # Download article text (Saigon Times first, then CNN, then BBC) and persist it.
    # failed_links holds the URLs that could not be read; it is not saved.
    df_txt, failed_links = run_parallel_processing(links_sgt+links_cnn+links_bbc, headers)
    df_txt.to_csv('scraped_texts.csv', index=False)


In [2]:
# Reload the scraped articles (columns: url, txt) and preview them.
test = pd.read_csv('scraped_texts.csv')
test.head()

Unnamed: 0,url,txt
0,https://thesaigontimes.vn/cong-ty-chung-khoan-...,Kinh tế Sài Gòn Online Kinh tế Sài Gòn Online ...
1,https://thesaigontimes.vn/chinh-thuc-chuyen-gi...,Kinh tế Sài Gòn Online Kinh tế Sài Gòn Online ...
2,https://thesaigontimes.vn/do-dau-vang-nhan-vuo...,Kinh tế Sài Gòn Online Kinh tế Sài Gòn Online ...
3,https://thesaigontimes.vn/pho-thu-tuong-ho-duc...,Kinh tế Sài Gòn Online Kinh tế Sài Gòn Online ...
4,https://thesaigontimes.vn/vi-sao-chinh-phu-kho...,Kinh tế Sài Gòn Online Kinh tế Sài Gòn Online ...


In [7]:
# Spot-check one scraped article: its URL, a blank line, then the extracted text.
print(test['url'][12])
print()
print(test['txt'][12])

https://edition.cnn.com/2024/09/17/economy/us-retail-sales-august/index.html

Markets Hot Stocks Fear & Greed Index Latest Market News Hot Stocks A key driver of the US economy remains solid. Spending at US retailers rose 0.1% in August from the prior month, the Commerce Department reported Tuesday. That’s a much slower pace than July’s upwardly revised 1.1% gain, but well above the 0.2% decline economists projected in a FactSet poll. The figures are adjusted for seasonal swings but not inflation. It’s an encouraging sign for America’s economy, since consumer spending represents two-thirds of US economic output. Retail sales make up a sizable chunk of overall spending. Tuesday’s report is the final major economic release beforethe Federal Reserve announces its latest interest-rate moveon Wednesday. The numbers do little to influence the size of the expected rate cut. The debate over whether the Fed will roll out a quarter-point rate cut, or a larger, half-point cut has intensified rece

In [9]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Base Model
# Local directory holding the BART checkpoint.
# NOTE(review): absolute, machine-specific path.
MODEL_NAME = r"D:\Online_Learning\Practical_DL\bart_large_cnn"

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Inference mode.
model.eval()

# Move the model to GPU if available
# `device` is also read as a module-level name by generate_summary below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summary
def generate_summary(model, tokenizer, text, max_length=400, min_length=130):
    """Summarise ``text`` with a seq2seq model using 4-beam search.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        text: Text to summarise; tokenised with truncation at 1024 tokens.
        max_length: Upper bound passed to ``generate``.
        min_length: Lower bound passed to ``generate``.

    Returns:
        The decoded first generated sequence, special tokens removed.

    Note:
        Inputs are moved to the module-level ``device``.
    """
    encoded = tokenizer(
        text, return_tensors="pt", max_length=1024, truncation=True
    ).to(device)

    generation_options = dict(
        attention_mask=encoded['attention_mask'],
        num_beams=4,
        early_stopping=True,
        max_length=max_length,
        min_length=min_length,
    )
    generated = model.generate(encoded['input_ids'], **generation_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [12]:
import textwrap

# Summarise every scraped article and keep the result in a new column.
# NOTE(review): the Saigon Times summaries shown in the output are garbled -
# presumably because the checkpoint targets English text; confirm.
generated_summaries = [generate_summary(model, tokenizer, text) for text in test['txt']]
test['summaried'] = generated_summaries

# Print each summary wrapped at 100 columns.
for i in range(len(generated_summaries)):
    # NOTE(review): despite the label, this prints the article URL (as the list
    # returned by textwrap.wrap), not the original text.
    print(f"Original Text: {textwrap.wrap(test['url'][i], width=100)}")
    print("Generated Summary:")
    summary = test['summaried'][i]
    paragraphs = summary.split('\n')
    for paragraph in paragraphs:
        lines = textwrap.wrap(paragraph, width=100)
        for line in lines:
            print(line)
    print("\n")

Original Text: ['https://thesaigontimes.vn/de-xuat-ho-tro-tai-chinh-linh-hoat-cho-dia-phuong-bi-anh-huong-bao/']
Generated Summary:
Bộ Tài chính sẽ xem xét và đề xuất phương án hỗ trợ tài. Bão do bão và hoàn lưu bão, ăn bị thiệt hại
cho các  các  ĉnh giá lửa 2024. Các āXVN đã thực hiỉn đông cân  năm 2024. Trong tổng số 26
 ĉn hưởng, có 12  ‘’’ ‘”’, “”, ” ”   ,  h”.


Original Text: ['https://thesaigontimes.vn/vang-nhan-tung-buoc-xo-do-cac-ky-luc-ve-gia/']
Generated Summary:
Kinh tế Sài Gòn Online (KTSG Online) – Kim loại quý toàn cục mới. Giá vàng cũng duy trì vào sáng hôm
nay (15-9) Tỉnh rõ ràng về lãi suất. Hồng niễm yἿt giá  vàng nhẫn 77,88 – 79,08 triệu  mua  bán ra.


Original Text: ['https://thesaigontimes.vn/ty-gia-bien-dong-kho-luong-he-qua-nao-cho-doanh-nghiep/']
Generated Summary:
Kinh tế Sài Gòn Online (KTSG) – Trong bối cảnh đô la Mỹ, giá trở lại trong hai tháng qua. Ngoài ra,
sự biỉn động khó lường của yen Nhật cũng công cây lớn. 27,7% là mức tăng trưng ứng vồng và tơn
124

# Test with another model

In [1]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.cuda.amp import autocast

# Set CUDA_LAUNCH_BLOCKING for debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Define the model name
# NOTE(review): absolute, machine-specific path.
MODEL_NAME = r"D:\Online_Learning\Practical_DL\models\gemma_2_2b_it"

# Load the model and tokenizer
model_gemma = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16,)
tokenizer_gemma = AutoTokenizer.from_pretrained(MODEL_NAME)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the Gemma model loaded in this cell. The previous `model.to(device)`
# referred to a name this cell never defines, leaving Gemma off `device`
# while generate_response moves its inputs there.
model_gemma.to(device)
print(f"Using device: {device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using device: cuda


In [25]:
# Function to generate a response from the model
def generate_response(model, tokenizer, prompt, max_length=2048):
    """Ask a causal LM to summarise ``prompt`` and extract the summary text.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to summarise; an instruction prefix is prepended.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The text between the first ``**Summary:**`` marker and the next
        ``<eos>`` (or the end of the output), stripped. When the marker is
        absent, the literal ``"Summary not found in the response."``.

    Note:
        Inputs are moved to the module-level ``device``; the input tensor's
        shape and dtype are printed.
    """
    instruction = "Summarize this text, show only result: " + prompt

    encoded = tokenizer(instruction, return_tensors="pt").to(device)
    print(f"Input tensor shape: {encoded['input_ids'].shape}")
    print(f"Input tensor dtype: {encoded['input_ids'].dtype}")

    generated = model.generate(
        **encoded, max_length=max_length, num_beams=2, early_stopping=True)

    decoded = tokenizer.decode(generated[0])

    marker = "**Summary:**"
    marker_at = decoded.find(marker)
    if marker_at == -1:
        return "Summary not found in the response."

    body_start = marker_at + len(marker)
    eos_at = decoded.find("<eos>", body_start)
    body = decoded[body_start:] if eos_at == -1 else decoded[body_start:eos_at]
    return body.strip()

# Example question to ask the model
# NOTE(review): `texts` is not defined in this cell; it presumably comes from
# another cell's kernel state - confirm it exists before running.
text = texts[-1]
# Generate response
response = generate_response(model_gemma, tokenizer_gemma, text)
print("Response:", response)

Input tensor shape: torch.Size([1, 1266])
Input tensor dtype: torch.int64
Response: Instagram is implementing new "teen account" settings to protect young users from online dangers. These settings will automatically make millions of teen accounts private, restrict content, and require parental approval for changes. The changes are a response to growing concerns about the platform's impact on teens and criticism of Meta for not doing enough to protect them.


In [1]:
import torch
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from torch.cuda.amp import autocast
from rouge_score import rouge_scorer

# Define the model names
# NOTE(review): absolute, machine-specific paths.
MODEL_NAME_BART = r"D:\Online_Learning\Practical_DL\models\bart_large_cnn"
MODEL_NAME_GEMMA = r"D:\Online_Learning\Practical_DL\models\gemma_2_2b_it"

# Load the BART model and tokenizer
model_bart = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME_BART)
tokenizer_bart = AutoTokenizer.from_pretrained(MODEL_NAME_BART)
model_bart.eval()

# Load the GEMMA model and tokenizer
# Loaded in float16 here (an earlier cell loads the same checkpoint in bfloat16).
model_gemma = AutoModelForCausalLM.from_pretrained(MODEL_NAME_GEMMA, torch_dtype=torch.float16)
tokenizer_gemma = AutoTokenizer.from_pretrained(MODEL_NAME_GEMMA)
model_gemma.eval()

# Move models to GPU if available
# `device` is also read as a module-level name by the generate_* functions below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bart.to(device)
model_gemma.to(device)
print(f"Using device: {device}")

# Function to generate summary using BART
def generate_summary_bart(model, tokenizer, text, max_length=1024):
    """Summarise ``text`` with a seq2seq model using 2-beam search.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        text: Text to summarise; tokenised with truncation at 1024 tokens.
        max_length: Upper bound passed to ``generate``.

    Returns:
        The decoded first generated sequence, special tokens removed.

    Note:
        Inputs are moved to the module-level ``device``.
    """
    encoded = tokenizer(
        text, return_tensors="pt", max_length=1024, truncation=True
    ).to(device)

    generation_options = dict(
        attention_mask=encoded['attention_mask'],
        num_beams=2,
        early_stopping=True,
        max_length=max_length,
    )
    generated = model.generate(encoded['input_ids'], **generation_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to generate summary using GEMMA
def generate_summary_gemma(model, tokenizer, prompt, max_length=2048):
    """Ask a causal LM to summarise ``prompt`` and extract the summary text.

    Generation runs inside ``autocast()`` with 2-beam search.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to summarise; an instruction prefix is prepended.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The text following the first ``**Summary:**`` or ``## Summary:``
        marker up to the next ``<eos>`` (or the end of the output), stripped.
        When no marker is found, the literal
        ``"Summary not found in the response."``.

    Note:
        Inputs are moved to the module-level ``device``.
        NOTE(review): ``truncation=True`` is passed without a ``max_length``;
        the warning in this notebook's output says that defaults to no
        truncation for this tokenizer.
    """
    instruction = "Summarize this text, show only result: " + prompt

    encoded = tokenizer(instruction, return_tensors="pt", truncation=True).to(device)

    with autocast():
        generated = model.generate(
            **encoded, max_length=max_length, num_beams=2, early_stopping=True)

    decoded = tokenizer.decode(generated[0])

    marker = re.search(r'\*\*Summary:\*\*|## Summary:', decoded)
    if not marker:
        return "Summary not found in the response."

    body_start = marker.end()
    eos_at = decoded.find("<eos>", body_start)
    body = decoded[body_start:] if eos_at == -1 else decoded[body_start:eos_at]
    return body.strip()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using device: cuda


In [2]:
import pandas as pd
# test = pd.read_csv('scraped_texts.csv')
# test.head()

# Load the scraped texts, drop the Saigon Times articles (URL contains
# 'thesaigontimes'), and keep the first 10 remaining texts.
df = pd.read_csv('scraped_texts.csv')
df = df.loc[~df['url'].str.contains('thesaigontimes')]
texts = df['txt'][:10].tolist()
# Generate reference summaries using BART
# These model outputs are later used as the ROUGE reference, not human-written summaries.
reference_summaries = [generate_summary_bart(model_bart, tokenizer_bart, text) for text in texts]
print("Reference Summaries Finsihsed")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Reference Summaries Finsihsed


In [3]:
# The commented lines below show how 'reference_summaries.csv' was produced
# (first 10 rows of df plus the BART summaries); this cell only reloads it.
# df_check = df.head(10).copy()
# df_check['reference_summaries'] = reference_summaries
# df_check.to_csv('reference_summaries.csv')
import pandas as pd
df_check = pd.read_csv('reference_summaries.csv')

In [5]:
# Function to evaluate summaries using ROUGE
def evaluate_summaries(reference, generated):
    """Score ``generated`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference summary text.
        generated: Candidate summary text.

    Returns:
        The result of ``RougeScorer.score(reference, generated)`` for the
        three metrics, computed with stemming enabled.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference, generated)


# Initialize lists to store scores
rouge1_gemma_scores = []
rouge2_gemma_scores = []
rougeL_gemma_scores = []

# Gemma summaries in the same order as the rows of df_check.
summaries = []

# Generate and evaluate summaries for the first 10 texts
# NOTE(review): the row count is hard-coded; this assumes df_check has at least 10 rows.
for i in range(10):
    print(i)
    text = df_check['txt'].values[i]
    reference_summary =  df_check['reference_summaries'].values[i]

    # Generate summary using GEMMA
    summary_gemma = generate_summary_gemma(model_gemma, tokenizer_gemma, text)
    summaries.append(summary_gemma)

    # Evaluate summary
    # NOTE(review): when generate_summary_gemma finds no marker it returns a
    # fixed "Summary not found..." sentence, which would be scored here like a
    # real summary.
    scores_gemma = evaluate_summaries(reference_summary, summary_gemma)

    # Store scores
    rouge1_gemma_scores.append(scores_gemma['rouge1'].fmeasure)
    rouge2_gemma_scores.append(scores_gemma['rouge2'].fmeasure)
    rougeL_gemma_scores.append(scores_gemma['rougeL'].fmeasure)

# Calculate average ROUGE scores
avg_rouge1_gemma = sum(rouge1_gemma_scores) / len(rouge1_gemma_scores)
avg_rouge2_gemma = sum(rouge2_gemma_scores) / len(rouge2_gemma_scores)
avg_rougeL_gemma = sum(rougeL_gemma_scores) / len(rougeL_gemma_scores)

# Print average ROUGE scores
print("Average ROUGE scores for GEMMA compared to BART reference summaries:")
print(f"ROUGE-1: {avg_rouge1_gemma}")
print(f"ROUGE-2: {avg_rouge2_gemma}")
print(f"ROUGE-L: {avg_rougeL_gemma}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


1


KeyboardInterrupt: 

In [46]:
# Function to preprocess input text
def preprocess_text(text):
    """Collapse whitespace runs and remove square-bracketed spans.

    Args:
        text: Raw input text.

    Returns:
        ``text`` with every whitespace run replaced by one space and every
        ``[...]`` span removed. Leading or trailing spaces are not stripped.
    """
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (incl. newlines)
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in brackets
    # Deleting "[...]" from "a [x] b" leaves two adjacent spaces; collapse again
    # so the result really has no extra whitespace.
    text = re.sub(r' {2,}', ' ', text)
    return text

# Function to post-process generated summaries
def postprocess_summary(summary):
    """Collapse whitespace runs and remove square-bracketed spans from a summary.

    Args:
        summary: Generated summary text.

    Returns:
        ``summary`` with every whitespace run replaced by one space and every
        ``[...]`` span removed. Leading or trailing spaces are not stripped.
    """
    summary = re.sub(r'\s+', ' ', summary)  # Collapse whitespace runs (incl. newlines)
    summary = re.sub(r'\[.*?\]', '', summary)  # Remove content in brackets
    # Deleting "[...]" from "a [x] b" leaves two adjacent spaces; collapse again
    # so the result really has no extra whitespace.
    summary = re.sub(r' {2,}', ' ', summary)
    return summary

# Function to generate summary using GEMMA
def generate_summary_gemma(model, tokenizer, prompt, max_length=1024, min_length=100, num_beams=2):
    """Ask a causal LM to summarise ``prompt`` and extract the summary text.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to summarise; an instruction prefix is prepended.
        max_length: ``max_length`` passed to ``generate``.
        min_length: ``min_length`` passed to ``generate``.
        num_beams: Beam count passed to ``generate``.

    Returns:
        The text following the first ``**Summary:**`` or ``## Summary:``
        marker up to the next ``<eos>`` (or the end of the output), stripped.
        When no marker is found, the literal
        ``"Summary not found in the response."``.

    Note:
        Inputs are moved to the module-level ``device``.
        NOTE(review): an earlier cell of this notebook defines a function with
        the same name; whichever cell ran last wins.
    """
    instruction = "Summarize this text, show only result: " + prompt

    encoded = tokenizer(instruction, return_tensors="pt", truncation=True).to(device)

    generated = model.generate(
        **encoded,
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    decoded = tokenizer.decode(generated[0])

    # Locate the summary marker, then cut at the end-of-sequence token if present.
    marker = re.search(r'\*\*Summary:\*\*|## Summary:', decoded)
    if not marker:
        return "Summary not found in the response."

    body_start = marker.end()
    eos_at = decoded.find("<eos>", body_start)
    body = decoded[body_start:] if eos_at == -1 else decoded[body_start:eos_at]
    return body.strip()

## Test with Ollama

In [13]:
# Step 2: Import necessary modules
import ollama

# Step 3: Load the LLaMA model
# Only the model tag and a client object are created here; the model itself is
# referenced by name in each generate call.
model_name = "llama3.2:3b"
client = ollama.Client()
# NOTE(review): this just sets an attribute on the client object; whether the
# ollama client reads `use_gpu` cannot be confirmed from this notebook.
client.use_gpu = True

# Step 4: Define the summarization function
def summarize_text(text, model_name):
    """Summarise ``text`` through the module-level Ollama ``client``.

    Args:
        text: Text to summarise; sent as ``"summarize: <text>"``.
        model_name: Model identifier passed to ``client.generate``.

    Returns:
        The ``'response'`` entry of the non-streaming generate result.
    """
    reply = client.generate(
        prompt=f"summarize: {text}", model=model_name, stream=False)
    return reply['response']

# Example usage
# Summarise one hard-coded news excerpt with the model selected above.
if __name__ == "__main__":
    text = """
    Iran on Tuesday launched a ballistic missile attack on Israel in retaliation for its recent killing of Hezbollah leader Hassan Nasrallah and an Iranian commander in Lebanon.

    The attack came on the heels of Israel’s deployment of ground forces into south Lebanon, escalating its offensive on Hezbollah, a militant group backed by Iran.

    “In response to the martyrdom of Martyr Haniyeh, Seyyed Hassan Nasrallah, and Martyr Nilforoushan, we have targeted the heart of the occupied territories,” Iran’s Revolutionary Guard said in a statement after missiles began appearing in the skies over Israel.

    “Should the Zionist regime respond to Iran’s operation, it will face crushing attack,” said the group, Iran’s paramilitary organization.

    Abbas Nilforoushan was an Iranian Revolutionary Guards deputy commander, who was killed with Nasrallah in a bombing by Israel last Friday in Beirut.

    Ismail Haniyeh was the political commander of the Hamas terror group, who was killed in July by an Israeli strike on Tehran, the capital of Iran. Israel has been engaged in a brutal war on Hamas since the group launched the Oct. 7 terror attack on Israel from Gaza.
    """
    summary = summarize_text(text, model_name)
    print("Summary:", summary)

Summary: Here's a summary:

Iran retaliated against Israel with a ballistic missile attack after the killing of two high-profile targets: Hassan Nasrallah, leader of Hezbollah, and Abbas Nilforoushan, an Iranian Revolutionary Guards deputy commander. The attacks were in response to Israel's deployment of ground forces into south Lebanon as part of its ongoing war against Hamas. Iran threatened further retaliation if Israel responded, labeling them "crushing". The conflict is escalating between Israel and Iran's allies, with tensions running high after a string of recent deadly strikes on Iranian targets in Lebanon and Gaza.


In [16]:
import os
import torch
import pinecone
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, pipeline
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_ollama import ChatOllama

# Load environment variables
# NOTE(review): absolute, machine-specific path to the .env file.
load_dotenv(r"D:\Online_Learning\Practical_DL\langchain_udemy\ice_breaker\.env")

# Retrieve the API key and index name from environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")

# Chat model served by a local Ollama instance.
# NOTE(review): confirm that ChatOllama honours `use_gpu` and `max_tokens`.
llm = ChatOllama(model="llama3.2:3b", use_gpu=True, max_tokens=1024, temperature=0.7, top_p=0.9)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize Pinecone vector store
docsearch = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

# Initialize conversation memory with summary
# (This setup used to be pasted twice in this cell, building the LLM,
# embeddings and vector store twice; it now runs once.)
memory = ConversationSummaryMemory(llm=llm, memory_key="chat_history", return_messages=True)


In [17]:
from pinecone import Pinecone, ServerlessSpec

# textwrap is used in the loop below but is not imported by this cell or the
# setup cell before it, so import it here to survive a fresh kernel.
import textwrap

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create a conversational chain
# Retrieval comes from the Pinecone store; `memory` carries the chat history.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(),
    memory=memory
)

# Chatbot loop
# Runs until the user types exit / quit / bye (case-insensitive).
print("Chatbot: Hello! How can I assist you today?")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit', 'bye']:
        print("Chatbot: Goodbye!")
        break
    
    response = qa({"question": user_input})
    # NOTE(review): width=20 yields very narrow lines (another cell wraps at
    # 100) - confirm this is intended.
    wrapped_response = textwrap.fill(response['answer'], width=20)
    print("Chatbot:", wrapped_response)

Chatbot: Hello! How can I assist you today?
Chatbot: One-hot encoding is a method used to convert categorical data into numerical data, which can be fed into machine learning algorithms. It works by creating a binary vector for each category in the data, where one element in the vector is set to 1 and all others are set to 0.

For example, if you have a dataset with text data and you want to convert it into numerical features using one-hot encoding, you would create a separate feature for each word in the vocabulary. If a document contains the word "hello", its corresponding feature vector would be [1, 0, ..., 0] where the first element is 1 (indicating that the word "hello" exists in the document) and all other elements are 0.

However, as mentioned in the text you provided, one-hot encoding has some limitations, such as introducing multicollinearity which can be an issue for certain models.
Chatbot: A syntactic parser is a program that decomposes sentences into a parse tree, which co

# Test With Gemma 2 

In [10]:
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, pipeline
from dotenv import load_dotenv
import pinecone
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


# Load environment variables
# NOTE(review): absolute, machine-specific path to the .env file.
load_dotenv(r"D:\Online_Learning\Practical_DL\langchain_udemy\ice_breaker\.env")

# Retrieve the API key and index name from environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")
MODEL_NAME_GEMMA = r"D:\Online_Learning\Practical_DL\models\gemma_2_2b_it"

# Load the GEMMA model and tokenizer
model_gemma = AutoModelForCausalLM.from_pretrained(MODEL_NAME_GEMMA, torch_dtype=torch.float16)
tokenizer_gemma = AutoTokenizer.from_pretrained(MODEL_NAME_GEMMA)
model_gemma.eval()

# Move models to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_gemma.to(device)
print(f"Using device: {device}")


# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize Pinecone vector store
docsearch = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

# Initialize conversation memory
# Buffer memory is used here; the Ollama variant above uses summary memory.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using device: cuda


In [15]:
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from langchain_huggingface.llms import HuggingFacePipeline

# Initialize the text generation pipeline
# NOTE(review): temperature / top_p are presumably ignored unless sampling is
# enabled (do_sample=True) - confirm against the transformers docs.
generation_pipeline = pipeline(
    "text-generation",
    model=model_gemma,
    tokenizer=tokenizer_gemma,
    device=0,  # Hard-codes device index 0; this is not automatic device selection
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9
)

# Wrap the pipeline with HuggingFacePipeline
chat = HuggingFacePipeline(pipeline=generation_pipeline)

# Create a conversational chain
# Retrieval comes from the Pinecone store; `memory` carries the chat history.
qa = ConversationalRetrievalChain.from_llm(
    llm=chat,
    retriever=docsearch.as_retriever(),
    memory=memory
)

# Chatbot loop
# Runs until the user types exit / quit / bye (case-insensitive).
print("Chatbot: Hello! How can I assist you today?")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit', 'bye']:
        print("Chatbot: Goodbye!")
        break

    response = qa({"question": user_input})
    print("Chatbot:", response['answer'])


Chatbot: Hello! How can I assist you today?




KeyboardInterrupt: 