## **LLM FINE-TUNING ON A PERSONAL DATASET**

### **Part I: Web Scraper**

In [None]:
# Imports for WebScrapper
import os
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin

 - **Web scraper to extract the links to articles**

In [None]:
# Start a Chrome browser session controlled by Selenium.
# The session stays open until driver.quit() is called later in this cell.
driver = webdriver.Chrome()
# Open the Google Scholar profile page whose article list is scraped below
driver.get("https://scholar.google.com/citations?user=unGWVYMAAAAJ&hl=en")

# Wait (up to 10 seconds) until the article title links are present in the DOM
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='gsc_a_t']/a")))

# Define a function to extract PDF links from a given page
def extract_pdf_links(page_source):
    """Return the href of every <a> tag whose URL mentions "pdf".

    Args:
        page_source: HTML markup of a page, as a string.

    Returns:
        A list of href strings in document order. An href is kept when it is
        non-empty and contains "pdf" in any letter case. Duplicates and
        relative URLs are returned exactly as they appear in the markup.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    matches = []
    for anchor in soup.find_all('a'):
        target = anchor.get('href')
        # A case-insensitive 'pdf' check also covers a literal '.pdf'.
        if target and 'pdf' in target.lower():
            matches.append(target)
    return matches

# Click "Show more" once to load additional results.
# NOTE(review): a single click may not reveal every article on a long profile - confirm.
try:
    show_more_button = driver.find_element(By.XPATH, "//*[@id='gsc_bpf_more']//*[@class='gs_lbl']")
    show_more_button.click()
    time.sleep(2)

    # Find all search result links
    search_result_links = driver.find_elements(By.XPATH, "//*[@class='gsc_a_t']/a")

    # Read every href before opening any tab, and skip anchors without one
    article_urls = [link.get_attribute("href") for link in search_result_links]
    article_urls = [url for url in article_urls if url]

    # List to store all PDF links
    all_pdf_links = []
    main_tab = driver.current_window_handle

    # Visit each article page in its own tab so the profile page stays loaded
    for href in article_urls:
        # Open a blank tab and navigate with driver.get(): the URL is never
        # spliced into a JavaScript string, and get() waits for the page to
        # load before the page source is read.
        driver.execute_script("window.open('about:blank');")
        driver.switch_to.window(driver.window_handles[-1])
        try:
            driver.get(href)
            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//*")))
            page_url = driver.current_url
            # Resolve relative links against the page they were found on so
            # every stored link can be downloaded later without extra context
            pdf_links = [urljoin(page_url, pdf_link) for pdf_link in extract_pdf_links(driver.page_source)]
        finally:
            # Always close the article tab and return to the main tab
            driver.close()
            driver.switch_to.window(main_tab)

        if pdf_links:
            print("PDF links on", href, ":", pdf_links)
            all_pdf_links.extend(pdf_links)
        else:
            print("No PDF links found on", href)

    # Remove duplicate links once, keeping first-seen order
    all_pdf_links = list(dict.fromkeys(all_pdf_links))
finally:
    # Close the browser even if scraping failed part-way through
    driver.quit()

- **Downloading and Renaming the Articles**

In [None]:
# Write links to file so the download step can be re-run without scraping again
with open('pdf_links.txt', 'w', encoding='utf-8') as f:
    for pdf_link in all_pdf_links:
        f.write(pdf_link + '\n')

# Create folder for files
folder_path = "pdf_folder"
os.makedirs(folder_path, exist_ok=True)

# Read links back from file, ignoring blank lines
with open('pdf_links.txt', 'r', encoding='utf-8') as f:
    all_pdf_links = [line for line in f.read().splitlines() if line.strip()]

# Base used to resolve any relative link. It replaces the `href` variable that
# previously leaked in from the scraping loop and is undefined on a fresh kernel.
# NOTE(review): presumably every scraped page lives under scholar.google.com,
# like the profile URL used by the scraper - confirm.
scholar_base_url = "https://scholar.google.com/citations"
# Seconds to wait for a server before giving up on one download
download_timeout = 30

# Initialize counter
file_counter = 1

# Download PDF files
for pdf_link in all_pdf_links:
    # urljoin leaves absolute URLs unchanged and resolves relative ones
    pdf_url = urljoin(scholar_base_url, pdf_link)

    try:
        response = requests.get(pdf_url, timeout=download_timeout)
    except requests.RequestException as exc:
        # One unreachable host must not abort the remaining downloads
        print(f"Failed to download {pdf_url}. Error: {exc}")
        continue

    # Ensure the response is successful
    if response.status_code != 200:
        print(f"Failed to download {pdf_url}. Status code: {response.status_code}")
        continue

    # Links are selected only because their URL contains "pdf", so check for
    # the PDF signature before saving the body with a .pdf extension
    if b"%PDF" not in response.content[:1024]:
        print(f"Skipped {pdf_url}: response is not a PDF document")
        continue

    # Determine file name
    file_name = os.path.join(folder_path, f"{file_counter}.pdf")

    # Save the PDF file
    with open(file_name, 'wb') as f:
        f.write(response.content)

    print("Downloaded:", file_name)

    # Increment counter for the next file
    file_counter += 1

### **----------------------------------------------------------------------------**

### **Part II: Data Preprocessing**

In [None]:
# Imports for PDF to Text
import os
import fitz
import re

In [None]:
import logging
import os
import re

import fitz  # PyMuPDF

def clean_text(text):
    """Strip bracketed citations and drop figure/table caption lines.

    Everything between a '[' and the next ']' on the same line is removed.
    The text is then split on newlines, and any line that starts
    (case-insensitively, with no leading whitespace) with 'fig' or 'table'
    is discarded.

    Returns:
        The remaining lines as a list of strings (not a joined string).
    """
    caption_prefixes = ('fig', 'fig.', 'figure', 'table')
    without_citations = re.sub(r'\[(.*?)\]', '', text)
    kept_lines = []
    for raw_line in without_citations.split('\n'):
        if raw_line.lower().startswith(caption_prefixes):
            continue
        kept_lines.append(raw_line)
    return kept_lines

def join_lines(lines):
    """Merge wrapped lines into paragraphs separated by blank lines.

    A line starts a new paragraph when the previous line was empty, when the
    previous line ended with '.', or when the line itself begins with a space
    or tab. Otherwise it is appended, after a single space, to the paragraph
    being built.

    Returns:
        The paragraphs joined with a double newline.
    """
    paragraphs = []
    previous = ""
    for current in lines:
        starts_paragraph = (
            not previous
            or previous.endswith('.')
            or current.startswith((' ', '\t'))
        )
        if starts_paragraph:
            paragraphs.append(current)
        else:
            paragraphs[-1] = paragraphs[-1] + ' ' + current
        previous = current
    return '\n\n'.join(paragraphs)  # Double newline marks a paragraph boundary

def extract_text_from_pdf(pdf_file, output_txt):
    """Write the body of a paper (abstract up to references) to a text file.

    Pages are read in order. Text before the first case-insensitive
    occurrence of "abstract" is skipped, and extraction stops at the first
    upper-case "REFERENCES" seen after the abstract has started. Detected
    tables are redacted before text extraction. Each page's text is passed
    through clean_text and join_lines, and the results are concatenated.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file to write.

    Any exception is logged and swallowed so one bad file does not stop a
    batch. A warning is logged when no "abstract" marker is found, in which
    case the output file is written empty.
    """
    text_between_sections = ""
    abstract_started = False
    references_started = False
    try:
        with fitz.open(pdf_file) as doc:
            for page in doc:
                # Redact detected tables so their text is left out of the page text
                for table in page.find_tables():
                    page.add_redact_annot(table.bbox)
                page.apply_redactions()
                text = page.get_text("text")

                if not abstract_started:
                    # Search the original string rather than text.lower(), so the
                    # offset stays valid even if lower-casing changes its length
                    abstract_match = re.search("abstract", text, re.IGNORECASE)
                    if abstract_match:
                        abstract_started = True
                        text = text[abstract_match.start():]

                if abstract_started and "REFERENCES" in text:
                    references_started = True
                    text = text[:text.index("REFERENCES")]

                if abstract_started:
                    cleaned_lines = clean_text(text)
                    text_between_sections += join_lines(cleaned_lines)

                if references_started:
                    break

        if not abstract_started:
            logging.warning(f"No 'abstract' marker found in {pdf_file}; output will be empty")

        with open(output_txt, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text_between_sections)
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue
        logging.error(f"Failed to process {pdf_file}: {str(e)}")

def process_pdf_folder(folder_path):
    """Convert every PDF in a folder to a text file stored next to it.

    Each file whose name ends with ".pdf" (in any letter case) is passed to
    extract_text_from_pdf, and the output path is the PDF path with its
    extension replaced by ".txt". Files are handled in sorted name order so
    the log output is reproducible.

    Args:
        folder_path: Directory containing the PDF files.
    """
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        output_txt = os.path.splitext(pdf_path)[0] + ".txt"
        extract_text_from_pdf(pdf_path, output_txt)
        logging.info(f"Processed {filename}")

# Show the INFO-level progress messages emitted while processing
logging.basicConfig(level=logging.INFO)
# The folder name must match the "pdf_folder" directory created by the
# download step; "Pdf_folder" only resolves on case-insensitive file systems
process_pdf_folder("pdf_folder")

### **Part III: Preparing Dataset**

In [None]:
# Merging Data to get one file
import os

def load_and_concatenate_txt(directory):
    """Merge all .txt files of a directory into a single string.

    Files are read as UTF-8 in sorted name order, so the result does not
    depend on the order in which the file system lists them. Each text is
    stripped of surrounding whitespace, and files that are empty after
    stripping are skipped so they add no stray separators.

    Args:
        directory: Folder containing the .txt files.

    Returns:
        The texts joined with a double newline ('' when there are none).
    """
    all_texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()
        if text:
            all_texts.append(text)
    return '\n\n'.join(all_texts)  # Join all articles with double newlines

# Directory holding the extracted text files.
# The PDF-to-text step writes each .txt next to its PDF in 'pdf_folder', so
# fall back to that folder when no separate 'TextFiles' folder has been made.
directory = 'pdf_folder/TextFiles'
if not os.path.isdir(directory):
    directory = 'pdf_folder'
full_corpus = load_and_concatenate_txt(directory)

# Save the full corpus to a text file
output_file = 'output.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(full_corpus)

In [5]:
import spacy
import pytextrank
import pandas as pd

# Load the small English spaCy pipeline (the model must already be installed)
nlp = spacy.load("en_core_web_sm")

# Add the "textrank" component to the pipeline by name
# (presumably registered by the `import pytextrank` above - confirm)
nlp.add_pipe("textrank")

def generate_prompts(texts):
    """Build an (input, output) training table from a list of paragraphs.

    Each paragraph is run through the module-level `nlp` pipeline. The text
    of its first three entries in `doc._.phrases` is joined with ', ' and
    appended to the fixed prefix "Write a Paragraph " to form the prompt.

    Args:
        texts: Iterable of paragraph strings.

    Returns:
        A pandas DataFrame with an 'input' column (the prompt) and an
        'output' column (the original paragraph).
    """
    def _prompt_for(paragraph):
        # Adjust the slice to change how many key phrases go into the prompt
        top_phrases = nlp(paragraph)._.phrases[:3]
        key_phrases = ', '.join([phrase.text for phrase in top_phrases])
        return f"Write a Paragraph {key_phrases}"

    outputs = list(texts)
    inputs = [_prompt_for(paragraph) for paragraph in outputs]
    return pd.DataFrame({'input': inputs, 'output': outputs})

# Treat every non-blank line of the corpus file as one paragraph
with open('output.txt', 'r', encoding='utf-8') as file:
    paragraphs = list(filter(None, map(str.strip, file)))

# Pair each paragraph with its generated prompt and preview the first rows
df = generate_prompts(paragraphs)
print(df.head())
# Persist the prompt/paragraph pairs for the JSON conversion step
df.to_csv('training_data.csv', index=False)


d:\FineTune\venv\Lib\site-packages
                                               input  \
0  Write a Paragraph road safety, travel comfort,...   
1  Write a Paragraph adversarial ML attacks, ML, ...   
2                      Write a Paragraph Index Terms   
3  Write a Paragraph conventional connected vehic...   
4  Write a Paragraph connected vehicles, vehicula...   

                                              output  
0  Abstract—Connected and autonomous vehicles (CA...  
1  Such a transformation—which will be fuelled by...  
2  Index Terms—Connected and autonomous vehicles,...  
3  I. INTRODUCTION In recent years, connected and...  
4  The phenomenon of connected vehicles is realiz...  


In [6]:
import csv
import json

def csv_to_json(csv_file, json_file):
    """Convert the prompt CSV into a JSON list of instruction records.

    Each CSV row becomes a dict with three keys: 'instruction' (the row's
    'input' column), 'query' (always an empty string) and 'prompt' (the
    row's 'output' column).
    NOTE(review): the paragraph text is stored under 'prompt' - confirm this
    matches the schema expected by the fine-tuning code.

    Args:
        csv_file: Path of a UTF-8 CSV file with 'input' and 'output' columns.
        json_file: Path of the JSON file to write.

    Raises:
        KeyError: If a row lacks the 'input' or 'output' column.
    """
    # newline='' lets the csv module handle line endings itself, so line
    # breaks embedded in quoted fields are read back unchanged
    with open(csv_file, 'r', encoding="utf-8", newline='') as csvfile:
        data = [
            {
                'instruction': row['input'],
                'query': '',
                'prompt': row['output'],
            }
            for row in csv.DictReader(csvfile)
        ]

    # Explicit encoding keeps the output independent of the platform default
    with open(json_file, 'w', encoding="utf-8") as jsonfile:
        json.dump(data, jsonfile, indent=4)

# Convert the prompt/paragraph CSV written by the previous cell into JSON
csv_file = "training_data.csv"
json_file = "output.json"
csv_to_json(csv_file, json_file)


### **--------------------------------------------------------**

In [None]:
# Imports for Text Analysis
import os
import re
import string
from collections import Counter
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk import pos_tag

# Download required resources.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older names, so both variants are requested. A name that the
# installed NLTK index does not know is reported by nltk.download, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Define a function to preprocess text
def preprocess_text(text):
    """Normalise text for word counting.

    Lower-cases the text, deletes every run of digits, deletes all characters
    listed in string.punctuation, and strips leading/trailing whitespace.

    Returns:
        The normalised string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    return without_digits.translate(punctuation_table).strip()

# Define a function to remove stopwords
def remove_stopwords(text):
    """Return the text with English stopwords removed.

    The text is tokenised with word_tokenize. Tokens whose lower-cased form
    is in NLTK's English stopword list are dropped, and the rest are joined
    with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Define a function to perform stemming
def perform_stemming(text):
    """Return the text with every token replaced by its Porter stem.

    The text is tokenised with word_tokenize, and the stems are joined with
    single spaces.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    return ' '.join(map(stemmer.stem, tokens))

# Define a function to plot word frequency
def plot_word_frequency(text, num_words=10):
    """Show a bar chart of the most frequent whitespace-separated words.

    Args:
        text: Text to analyse; it is split on whitespace.
        num_words: Maximum number of words to plot, most frequent first.
    """
    counts = FreqDist(text.split())
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    top_items = ranked[:num_words]
    words = [word for word, _ in top_items]
    frequencies = [count for _, count in top_items]

    plt.figure(figsize=(10, 5))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Word Frequency')
    plt.xticks(rotation=45)
    plt.show()
    
# Define a function to extract keywords
def extract_keywords(text):
    """Return the tokens that the POS tagger labels as nouns.

    The text is tokenised with word_tokenize and tagged with pos_tag. Tokens
    tagged 'NN', 'NNS', 'NNP' or 'NNPS' are returned in their original order.
    """
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    keywords = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag in noun_tags:
            keywords.append(token)
    return keywords

# Process text files in a folder
def process_text_folder(folder_path):
    """Print noun keywords and plot word frequencies for each .txt file.

    Every file in `folder_path` whose name ends with ".txt" is read as UTF-8,
    normalised, stripped of stopwords and stemmed. The nouns found in the
    stemmed text are printed, and a word-frequency chart is shown.

    NOTE(review): keywords are tagged on lower-cased, stemmed text, which
    presumably lowers tagging quality - confirm this order is intended.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            stemmed_text = perform_stemming(remove_stopwords(preprocess_text(text)))
            keywords = extract_keywords(stemmed_text)
            print(f"Keywords in {filename}: {keywords}")
            plot_word_frequency(stemmed_text)
                
                
# Analyse every .txt file found directly inside "pdf_folder"
process_text_folder("pdf_folder")

In [None]:
import fitz
import re

def extract_text_between_sections(pdf_file):
    """Return the text of a PDF from its abstract up to its references.

    Pages are read in order. Text before the first case-insensitive
    "abstract" is skipped, and reading stops at the first upper-case
    "REFERENCES" found once the abstract has started. Detected tables are
    redacted before extraction. Bracketed citations and lines starting with
    'fig' or 'table' (case-insensitive) are removed.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The collected text as one string ('' if no abstract marker is found).
    """
    caption_prefixes = ('fig', 'fig.', 'figure', 'table')
    collected_pages = []
    seen_abstract = False
    seen_references = False

    with fitz.open(pdf_file) as doc:
        for page in doc:
            # Wrap each detected table in a redaction so its text is removed
            for table in page.find_tables():
                page.add_redact_annot(table.bbox)
            page.apply_redactions()

            text = page.get_text("text")

            if not seen_abstract and "abstract" in text.lower():
                seen_abstract = True
                text = text[(text.lower()).index("abstract"):]

            if seen_abstract and "REFERENCES" in text:
                seen_references = True
                text = text[:text.index("REFERENCES")]

            if seen_abstract:
                # Drop reference mentions such as "[12]"
                text = re.sub(r'\[(.*?)\]', '', text)
                # Drop figure and table caption lines
                kept_lines = [
                    line for line in text.split('\n')
                    if not line.lower().startswith(caption_prefixes)
                ]
                collected_pages.append('\n'.join(kept_lines))

            if seen_abstract and seen_references:
                break

    return ''.join(collected_pages)

# Try the extraction on a single downloaded article and print the whole result
pdf_file = 'pdf_folder/1.pdf'
text_between_sections = extract_text_between_sections(pdf_file)
print(text_between_sections)

# Write the extracted text to file.
# NOTE(review): 'output.txt' is also the file the corpus-merging cell writes and
# the prompt-generation cell reads, so running this cell replaces the merged
# corpus with the text of one article - confirm this is intended.
output_txt = 'output.txt'
with open(output_txt, 'w', encoding='utf-8') as f:
    f.write(text_between_sections)

In [None]:
import fitz

def extract_sections(pdf_file):
    """Print and return the text blocks between the abstract and references.

    Pages are read in order and detected tables are redacted first. Element 4
    of each entry returned by page.get_text("blocks") is used as the block's
    text. Blocks that contain only digits are skipped. Output starts with the
    first block whose text begins with "abstract" (case-insensitive). Once
    the abstract has started, a block beginning with "references" ends the
    scan for the whole document rather than just for the current page. A
    "references" block seen before the abstract only skips the rest of its
    page.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The collected blocks, each with its internal newlines replaced by
        spaces, joined with '\\n'. Every collected block is also printed
        unchanged.
    """
    collected_blocks = []
    abstract_started = False
    references_reached = False

    with fitz.open(pdf_file) as doc:
        for page in doc:
            # Wrap each detected table in a redaction so its text is removed
            for table in page.find_tables():
                page.add_redact_annot(table.bbox)
            page.apply_redactions()

            for block in page.get_text("blocks"):
                block_text = block[4]
                if block_text.strip().isdigit():
                    # Digits-only block; presumably a page number
                    continue

                lowered = block_text.lower()
                if lowered.startswith("abstract"):
                    abstract_started = True
                elif lowered.startswith("references"):
                    references_reached = abstract_started
                    break

                if abstract_started:
                    print(block_text)
                    collected_blocks.append(block_text.replace('\n', ' ').strip())

            if references_reached:
                # Leave the page loop too, so reference pages are not scanned
                break

    return '\n'.join(collected_blocks)

# Run the block-based extraction on a single downloaded article
pdf_file = "pdf_folder/1.pdf"
sections = extract_sections(pdf_file)

# NOTE(review): the disabled loop below expects a list of dicts with 'level'
# and 'text' keys, which extract_sections does not produce.
# for section in sections:
#     print(f"Level {section['level']}: {section['text']}")