## **LLM FINE-TUNING ON A PERSONAL DATASET**

### **Part I: Web Scraper**

In [None]:
# Imports for WebScrapper
import os
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin

 - **Web scraper to extract the links to articles**

In [None]:
# Start a Chrome session controlled by Selenium; `driver` is reused by the scraping code below
driver = webdriver.Chrome()
# Open the Google Scholar profile whose publication list will be scraped
driver.get("https://scholar.google.com/citations?user=unGWVYMAAAAJ&hl=en")

# Block (up to 10 seconds) until at least one publication title link is present in the DOM
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='gsc_a_t']/a")))

# Define a function to extract PDF links from a given page
def extract_pdf_links(page_source):
    """Return the href of every <a> tag whose URL mentions "pdf".

    Args:
        page_source: HTML of a page as a string.

    Returns:
        A list of href values (in document order, duplicates kept) that
        contain the substring "pdf", compared case-insensitively.
    """
    parsed_page = BeautifulSoup(page_source, 'html.parser')
    matches = []
    for anchor in parsed_page.find_all('a'):
        target = anchor.get('href')
        # Anchors without an href (or with an empty one) cannot point at a PDF
        if not target:
            continue
        if 'pdf' in target.lower():
            matches.append(target)
    return matches

# List to store all PDF links (absolute URLs)
all_pdf_links = []

try:
    # Click "Show more" once so additional publications are loaded, then give the page time to update
    show_more_button = driver.find_element(By.XPATH, "//*[@id='gsc_bpf_more']//*[@class='gs_lbl']")
    show_more_button.click()
    time.sleep(2)

    # Find all search result links
    search_result_links = driver.find_elements(By.XPATH, "//*[@class='gsc_a_t']/a")

    # Loop through each link
    for link in search_result_links:
        # Get the href attribute of the link; skip anchors that have none
        href = link.get_attribute("href")
        if not href:
            continue

        # Open the link in a new tab. The URL is passed as a script argument
        # instead of being concatenated into the JavaScript source, so quotes
        # or other special characters in it cannot break (or inject into) the script.
        driver.execute_script("window.open(arguments[0]);", href)
        # Switch to the newly opened tab
        driver.switch_to.window(driver.window_handles[-1])
        try:
            # Wait for the page to load
            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//*")))
            # Resolve relative links against the page they were found on,
            # while that page's URL is still known
            pdf_links = [urljoin(href, pdf_link) for pdf_link in extract_pdf_links(driver.page_source)]
        finally:
            # Always close the article tab and return to the main tab,
            # even if the wait above timed out
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        # Print PDF links
        if pdf_links:
            print("PDF links on", href, ":", pdf_links)
            all_pdf_links.extend(pdf_links)
        else:
            print("No PDF links found on", href)

    # Remove duplicate links once, keeping first-seen order so runs are reproducible
    all_pdf_links = list(dict.fromkeys(all_pdf_links))
finally:
    # Close the browser even when scraping fails part-way through
    driver.quit()

- **Downloading and Renaming the Articles**

In [None]:
# Persist the scraped links so the download step can be repeated from the file
with open('pdf_links.txt', 'w', encoding='utf-8') as f:
    for pdf_link in all_pdf_links:
        f.write(pdf_link + '\n')

# Create folder for files
folder_path = "pdf_folder"
os.makedirs(folder_path, exist_ok=True)

# Read links back from file, ignoring blank lines
with open('pdf_links.txt', 'r', encoding='utf-8') as f:
    all_pdf_links = [line.strip() for line in f if line.strip()]

# Initialize counter (files are named 1.pdf, 2.pdf, ... in download order)
file_counter = 1

# Download PDF files
for pdf_link in all_pdf_links:
    if pdf_link.startswith('http'):
        pdf_url = pdf_link
    else:
        # NOTE(review): `href` is the last article URL left behind by the
        # scraping loop, so it is only a correct base for relative links
        # that came from that same site.
        pdf_url = urljoin(href, pdf_link)

    # Download PDF file; a timeout and error handling keep one bad link
    # from hanging or aborting the whole batch
    try:
        response = requests.get(pdf_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download {pdf_url}. Error: {exc}")
        continue

    # Ensure the response is successful
    if response.status_code != 200:
        print(f"Failed to download {pdf_url}. Status code: {response.status_code}")
        continue

    # Links are selected only because their URL contains "pdf", so the body
    # may be an HTML page; keep only responses carrying the PDF header marker
    if b'%PDF-' not in response.content[:1024]:
        print(f"Skipped {pdf_url}: response is not a PDF document")
        continue

    # Determine file name
    file_name = os.path.join(folder_path, f"{file_counter}.pdf")

    # Save the PDF file
    with open(file_name, 'wb') as f:
        f.write(response.content)

    print("Downloaded:", file_name)

    # Increment counter for the next file
    file_counter += 1

### **----------------------------------------------------------------------------**

### **Part II: Data Preprocessing**

In [None]:
# Imports for PDF to Text
import os
import fitz
import re

In [None]:
import logging
import os
import re

import fitz  # PyMuPDF

def clean_text(text):
    """Strip bracketed reference markers and drop caption-like lines.

    Everything between "[" and "]" on a line is removed, the text is split
    on newlines, and lines that start (case-insensitively) with a
    figure/table prefix are discarded.

    Returns:
        The remaining lines as a list of strings.
    """
    caption_prefixes = ('fig', 'fig.', 'figure', 'table')
    without_refs = re.sub(r'\[(.*?)\]', '', text)  # e.g. "[12]" -> ""
    kept_lines = []
    for raw_line in without_refs.split('\n'):
        if raw_line.lower().startswith(caption_prefixes):
            continue
        kept_lines.append(raw_line)
    return kept_lines

def join_lines(lines):
    """Merge wrapped lines into paragraphs separated by blank lines.

    A line starts a new paragraph when the previous line was empty, ended
    with a period, or when the line itself begins with a space or tab;
    otherwise it is appended (after a single space) to the current paragraph.

    Returns:
        The paragraphs joined with a double newline.
    """
    paragraphs = []
    previous = ""
    for current in lines:
        starts_paragraph = (
            not previous
            or previous.endswith('.')
            or current.startswith((' ', '\t'))
        )
        if starts_paragraph:
            paragraphs.append(current)
        else:
            paragraphs[-1] = paragraphs[-1] + ' ' + current
        previous = current
    return '\n\n'.join(paragraphs)  # Double newline marks a paragraph boundary

def extract_text_from_pdf(pdf_file, output_txt):
    """Extract the body text of a paper (abstract up to references) into a text file.

    Pages are read in order. Output starts at the first case-insensitive
    occurrence of "abstract" and stops at the first upper-case "REFERENCES"
    that follows it. Each page's text is cleaned with clean_text and
    re-flowed with join_lines before being appended.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file to write.

    Any exception raised while processing is logged and swallowed, so one
    unreadable PDF does not stop a batch; in that case no output is written.
    """
    text_between_sections = ""
    abstract_started = False
    references_started = False
    try:
        with fitz.open(pdf_file) as doc:
            for page in doc:
                # Redact every detected table so its cell text is not extracted
                for table in page.find_tables():
                    page.add_redact_annot(table.bbox)
                page.apply_redactions()
                text = page.get_text("text")

                if not abstract_started:
                    # Search the original string (not a lower-cased copy) so the
                    # match offset is always valid for slicing `text`
                    abstract_match = re.search("abstract", text, re.IGNORECASE)
                    if abstract_match is None:
                        continue  # still in front matter
                    abstract_started = True
                    text = text[abstract_match.start():]

                if "REFERENCES" in text:
                    references_started = True
                    text = text[:text.index("REFERENCES")]

                text_between_sections += join_lines(clean_text(text))

                if references_started:
                    break

        if not abstract_started:
            logging.warning(f"No abstract found in {pdf_file}; output will be empty")

        with open(output_txt, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text_between_sections)
    except Exception as e:
        logging.error(f"Failed to process {pdf_file}: {e}")

def process_pdf_folder(folder_path, output_dir=None):
    """Convert every PDF in a folder to a text file via extract_text_from_pdf.

    Args:
        folder_path: Directory containing the PDF files.
        output_dir: Directory for the generated .txt files. It is created if
            missing. When None (the default), each .txt is written next to
            its PDF.

    Raises:
        FileNotFoundError: if folder_path does not exist.
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
    target_dir = folder_path if output_dir is None else output_dir

    # Sorted so files are processed in the same order on every run
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        output_txt = os.path.join(target_dir, os.path.splitext(filename)[0] + ".txt")
        extract_text_from_pdf(pdf_path, output_txt)
        logging.info(f"Processed {filename}")

# The download step saves into "pdf_folder" (lower case), and the dataset
# step reads text files from "pdf_folder/TextFiles", so write them there.
process_pdf_folder("pdf_folder", output_dir="pdf_folder/TextFiles")

### **Part III: Preparing Dataset**

In [None]:
# Merging Data to get one file
import os

def load_and_concatenate_txt(directory):
    """Concatenate all .txt files in a directory into one corpus string.

    Files are read in sorted filename order so the corpus is identical on
    every run (os.listdir order is arbitrary). Each file's text is stripped,
    and files that are empty after stripping are skipped so they do not add
    blank paragraphs.

    Args:
        directory: Folder containing the UTF-8 text files.

    Returns:
        The texts joined with a double newline; "" if there is nothing to join.
    """
    all_texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()
        if text:
            all_texts.append(text)
    return '\n\n'.join(all_texts)  # Join all articles with double newlines

# Directory holding the extracted article text files (one .txt per article)
directory = 'pdf_folder/TextFiles'
# Single string containing every article, separated by blank lines
full_corpus = load_and_concatenate_txt(directory)

# Save the full corpus to a text file; 'output.txt' is read again when building prompts
output_file = 'output.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(full_corpus)

In [None]:
import json
from openai import OpenAI
import json
import time
from api import API_KEY
# Shared OpenAI client used by label_data; the key is read from the local `api` module
client = OpenAI(api_key=API_KEY)  

def label_data(data):
    """Ask the chat model for a "neutral" rewrite of a paragraph.

    Args:
        data: The paragraph text to be rewritten.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    user_prompt = f"""
    you will write a neutral text for the provided text. 
    Text: "
       {data}
       "
Note: Neutral text means how you will write about the same topic, discussed in paragraph. Write the text in your style which will cover all the things and I'm calling your written Text "Neutral Text ". So your task is to write a neutral text other things are for your assistant. Write like its your own text not mention words like "the paragraph discusses." you will write like you are writing for first time   

Response: response will only contain neutral text no explanation nothing else. 
    """

    # The system message is intentionally left empty; all instructions live in the user prompt
    conversation = [
        {"role": "system", "content": ""},
        {"role": "user", "content": user_prompt},
    ]
    chat_result = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    return chat_result.choices[0].message.content

def append_to_json_and_file(data, label, json_file_path, data_file_path):
    """Record one (paragraph, label) pair in a JSON list file and a plain-text log.

    Args:
        data: Original paragraph; stored under the "ans" key.
        label: Generated text for that paragraph; stored under the "prompt" key.
        json_file_path: JSON file holding a list of records. A missing or
            empty file is treated as an empty list.
        data_file_path: Text file that receives "data\\nlabel\\n\\n" appended.

    Raises:
        json.JSONDecodeError: if the JSON file has content that is not valid
            JSON (it is left untouched rather than overwritten).
    """
    import os  # local import: only needed for the atomic replace below

    try:
        # Load existing data from the JSON file
        with open(json_file_path, 'r', encoding='utf-8') as file:
            raw = file.read()
    except FileNotFoundError:
        # If the file does not exist, start with an empty list
        raw = ""
    # An empty file (e.g. created by `touch`) is a fresh start, not an error
    data_list = json.loads(raw) if raw.strip() else []

    # Append the new data and label to the list
    data_list.append({"prompt": label, "ans": data})

    # Write to a temporary file first and swap it in, so an interruption
    # mid-write cannot truncate the records accumulated so far
    tmp_path = f"{json_file_path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as file:
        json.dump(data_list, file, ensure_ascii=False, indent=4)
    os.replace(tmp_path, json_file_path)

    # Append the new data and label to the data file
    with open(data_file_path, 'a', encoding='utf-8') as file:
        file.write(f"{data}\n{label}\n\n")

def process_file(file_path, json_file_path, data_file_path):
    """Label every paragraph of a text file except the first, with pacing.

    The file is split on blank lines. Paragraph 0 is skipped; each later
    paragraph is sent to label_data and stored with append_to_json_and_file.
    After each labelled paragraph the loop sleeps until the next 20-second
    boundary measured from the start of the loop. Per-iteration timing is
    printed. Any exception is printed and ends processing of this file.
    """
    slot_seconds = 60 / 3  # one 20-second slot per request
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        chunks = content.split("\n\n")
        loop_started = time.time()
        for i, s in enumerate(chunks):
            iteration_started = time.time()
            if i > 0:
                print(i)
                append_to_json_and_file(s, label_data(s), json_file_path, data_file_path)
                # Sleep for whatever remains of the current 20-second slot
                remaining = slot_seconds - (time.time() - loop_started) % slot_seconds
                if remaining > 0:
                    time.sleep(remaining)
            processing_time = time.time() - iteration_started
            print(f"Processing time for iteration {i}: {processing_time:.2f} seconds")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

def process_files(file_paths, json_file_path, data_file_path):
    """Run process_file on each path in order, writing to the same two output files."""
    for source_path in file_paths:
        process_file(
            file_path=source_path,
            json_file_path=json_file_path,
            data_file_path=data_file_path,
        )

# Example usage
# Input text files whose paragraphs will be labelled
file_paths =   ["cleaned_14-19.txt"]
# JSON list of {"prompt", "ans"} records (created on first write if missing)
json_file_path = "14-19.json"
# Plain-text log that receives each paragraph followed by its label
data_file_path = "14-19.txt"

# Runs immediately when the cell executes: calls the OpenAI API for each paragraph
process_files(file_paths, json_file_path, data_file_path)

In [5]:
import spacy
import pandas as pd

# Load the small English spaCy pipeline; `nlp` is used by generate_prompts below
nlp = spacy.load("en_core_web_sm")

# Add the "textrank" component so each doc exposes ranked key phrases via `doc._.phrases`
# NOTE(review): the component is provided by the pytextrank package, which is not
# imported in this cell — confirm it is imported before this cell runs on a fresh kernel
nlp.add_pipe("textrank")

def generate_prompts(texts):
    """Build an (input, output) training table from a sequence of texts.

    For each text, the first three phrases from `doc._.phrases` are joined
    with ", " and appended to "Write a Paragraph " to form the prompt; the
    text itself is the expected output.

    Returns:
        A pandas DataFrame with columns 'input' and 'output', one row per text.
    """
    prompts = []
    targets = []
    for passage in texts:
        ranked = nlp(passage)._.phrases[:3]  # adjust the slice to use more phrases
        key_phrases = ', '.join(item.text for item in ranked)
        prompts.append(f"Write a Paragraph {key_phrases}")
        targets.append(passage)

    return pd.DataFrame({'input': prompts, 'output': targets})

# Read the merged corpus; every non-blank line is treated as one paragraph
with open('output.txt', 'r', encoding='utf-8') as file:
    paragraphs = [para.strip() for para in file.readlines() if para.strip()]
    
# One row per paragraph: generated prompt in 'input', paragraph text in 'output'
df = generate_prompts(paragraphs)
print(df.head())
# Save to CSV; 'training_data.csv' is the input of the CSV-to-JSON conversion below
df.to_csv('training_data.csv', index=False)


d:\FineTune\venv\Lib\site-packages
                                               input  \
0  Write a Paragraph road safety, travel comfort,...   
1  Write a Paragraph adversarial ML attacks, ML, ...   
2                      Write a Paragraph Index Terms   
3  Write a Paragraph conventional connected vehic...   
4  Write a Paragraph connected vehicles, vehicula...   

                                              output  
0  Abstract—Connected and autonomous vehicles (CA...  
1  Such a transformation—which will be fuelled by...  
2  Index Terms—Connected and autonomous vehicles,...  
3  I. INTRODUCTION In recent years, connected and...  
4  The phenomenon of connected vehicles is realiz...  


In [6]:
import csv
import json

def csv_to_json(csv_file, json_file):
    """Convert the training CSV into a JSON list of instruction records.

    Each CSV row must have 'input' and 'output' columns. It becomes
    {"instruction": <input>, "query": "", "prompt": <output>}.

    Args:
        csv_file: Path of the UTF-8 CSV file to read.
        json_file: Path of the JSON file to write (overwritten).

    Raises:
        KeyError: if a row lacks the 'input' or 'output' column.
    """
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are preserved exactly
    with open(csv_file, 'r', encoding="utf-8", newline='') as csvfile:
        records = [
            {
                'instruction': row['input'],
                'query': '',
                'prompt': row['output'],
            }
            for row in csv.DictReader(csvfile)
        ]

    # Explicit encoding so the output does not depend on the platform default
    with open(json_file, 'w', encoding="utf-8") as jsonfile:
        json.dump(records, jsonfile, indent=4)

# Convert the prompt/paragraph CSV into the JSON instruction format
csv_file = "training_data.csv"
json_file = "output.json"
csv_to_json(csv_file, json_file)
