In [27]:
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import urllib.parse


In [28]:
# Recursive web scraper function
def scrape_website(url, depth=3, visited=None):
    """
    Scrapes website content up to a certain depth, excluding faculty pages.
    """
    if visited is None:
        visited = set()
    if depth == 0 or url in visited:
        return ""
    
    visited.add(url)
    try:
        link = urllib.request.urlopen(url).read()
        data = bs.BeautifulSoup(link, 'lxml')
        data_paragraphs = data.find_all('p')
        text = " ".join(para.text for para in data_paragraphs)
        
        # Recursively follow links excluding faculty pages
        for a_tag in data.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/') or 'lhr.nu.edu.pk' in href:
                # Exclude faculty-related links
                if not any(faculty_page in href for faculty_page in [
                    "/fsm/faculty/", "/fsc/faculty/", "/ee/faculty/", 
                    "/cv/faculty/", "/ss/faculty/"]):
                    full_url = urllib.parse.urljoin(url, href)
                    text += scrape_website(full_url, depth - 1, visited)
        
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""


In [29]:
# Scrape FAST-NUCES Lahore website
base_url = 'https://lhr.nu.edu.pk/'
print("Scraping FAST-NUCES Lahore website...")
website_text = scrape_website(base_url)
print("Scraping complete.")


Scraping FAST-NUCES Lahore website...
Error scraping https://lhr.nu.edu.pk/campusLife/: HTTP Error 500: Internal Server Error
Error scraping https://lhr.nu.edu.pk/fsm/programDetails/BS (Business Analytics): URL can't contain control characters. '/fsm/programDetails/BS (Business Analytics)' (found at least ' ')
Error scraping https://lhr.nu.edu.pk/fsm/programDetails/MS (Business Analytics): URL can't contain control characters. '/fsm/programDetails/MS (Business Analytics)' (found at least ' ')
Error scraping https://lhr.nu.edu.pk/ss/programDetails/MS (English Language Teaching): URL can't contain control characters. '/ss/programDetails/MS (English Language Teaching)' (found at least ' ')
Scraping complete.


In [30]:
# Function to scrape faculty pages
def scrape_faculty_pages(faculty_urls):
    """
    Scrapes faculty pages for specific <div> class data and their linked content.
    """
    faculty_data = []
    for url in faculty_urls:
        try:
            link = urllib.request.urlopen(url).read()
            data = bs.BeautifulSoup(link, 'lxml')
            divs = data.find_all('div', class_="col-lg-3 col-md-4 col-sm-6 col-12")
            for div in divs:
                div_text = div.get_text(strip=True)
                a_tag = div.find('a', href=True)
                if a_tag:
                    faculty_link = urllib.parse.urljoin(url, a_tag['href'])
                    try:
                        faculty_page = urllib.request.urlopen(faculty_link).read()
                        faculty_data_bs = bs.BeautifulSoup(faculty_page, 'lxml')
                        p_tags = faculty_data_bs.find_all('p')
                        li_tags = faculty_data_bs.find_all('li')
                        additional_text = (
                            " ".join(tag.text.strip() for tag in p_tags) + 
                            " " +
                            " ".join(tag.text.strip() for tag in li_tags)
                        )
                    except Exception as e:
                        print(f"Error scraping faculty page {faculty_link}: {e}")
                        additional_text = ""
                    
                    faculty_data.append({
                        'url': url,
                        'div_text': div_text,
                        'linked_text': additional_text
                    })
        except Exception as e:
            print(f"Error scraping faculty URL {url}: {e}")
    return faculty_data

# Faculty pages URLs
faculty_urls = [
    "https://lhr.nu.edu.pk/fsm/faculty/",
    "https://lhr.nu.edu.pk/fsc/faculty/",
    "https://lhr.nu.edu.pk/ee/faculty/",
    "https://lhr.nu.edu.pk/cv/faculty/",
    "https://lhr.nu.edu.pk/ss/faculty/"
]

print("Scraping faculty pages...")
faculty_content = scrape_faculty_pages(faculty_urls)
print("Faculty content scraping complete.")


Scraping faculty pages...
Faculty content scraping complete.


In [31]:
# Save scraped data to a text file
def save_to_text_file(data, filename):
    """
    Saves scraped data to a text file.
    
    Args:
        data (dict): A dictionary where keys are URLs and values are text content.
        filename (str): The name of the file to save the data.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for url, content in data.items():
            f.write(f"URL: {url}\n")
            f.write(f"Content:\n{content}\n")
            f.write("=" * 80 + "\n")  # Separator for readability

# Combine normal website and faculty data
combined_data = {}
combined_data['Normal Website'] = website_text
for faculty in faculty_content:
    combined_data[faculty['url']] = f"{faculty['div_text']} {faculty['linked_text']}"

# Save to file
print("Saving combined data to text file...")
save_to_text_file(combined_data, 'scraped_data.txt')
print("Data saved to scraped_data.txt.")


Saving combined data to text file...
Data saved to scraped_data.txt.


In [38]:
# Load text data from file
def load_text_file(filename):
    """
    Reads content from a text file.
    
    Args:
        filename (str): The name of the file to read.
    
    Returns:
        str: Combined content of the file.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()

# Preprocess the scraped text data
def preprocess_text(document):
    document = document.lower()
    document = re.sub(r'\[[0-9]*\]', ' ', document)
    document = re.sub(r'\s+', ' ', document)
    return document

# Load and preprocess data
context = load_text_file('scraped_data.txt')
preprocessed_context = preprocess_text(context)


In [39]:
# Load GPT model
print("Loading GPT model...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_gpt_response(user_input):
    """
    Generate a response using GPT, prioritizing relevant sentences from website content.
    """
    # Find the most relevant sentences using TF-IDF
    sen = nltk.sent_tokenize(preprocessed_context)
    word_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc.split(), stop_words='english')
    word_vectors = word_vectorizer.fit_transform(sen + [user_input])
    cosine_similarities = cosine_similarity(word_vectors[-1], word_vectors[:-1]).flatten()
    
    # Select top 3 most relevant sentences as context
    top_indices = cosine_similarities.argsort()[-3:][::-1]
    relevant_sentences = " ".join([sen[i] for i in top_indices])
    
    # Generate GPT response using relevant sentences as context
    prompt = f"Relevant information:\n{relevant_sentences}\n\nUser query: {user_input}\n\nAnswer:"
    
    # Tokenize the prompt and handle the context length issue
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)  # Set to a larger value
    outputs = model.generate(
        inputs, 
        max_new_tokens=150,  # Instead of max_length, use max_new_tokens for response length
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("Answer:")[-1].strip()  # Extract the response part
    return response



Loading GPT model...


In [None]:
# Chatbot main loop
print("Aoa, this is the FAST LHR Chatbot for answering related queries with the FAST LHR website.")
continue_flag = True
while continue_flag:
    human = input("User: ")
    print("User:", human)
    human = human.lower()
    if human != 'bye':
        if human == 'thanks' or human == 'thank you':
            continue_flag = False
            print("FAST LHR Chatbot: You're welcome.")
        else:
            print("FAST LHR Chatbot:", end="")
            try:
                response = generate_gpt_response(human)  # Use GPT with relevant content
                print(response)
            except Exception as e:
                print(e)
    else:
        continue_flag = False
        print("FAST LHR Chatbot says goodbye.")


Aoa, this is the FAST LHR Chatbot for answering related queries with the FAST LHR website.
User: was there any hackathon at FAST?
FAST LHR Chatbot:no.

User query: was there any hackathon at fast?
User: Who is Dr. Arshad Ali?
FAST LHR Chatbot:The essence of urdu poetry is the ability to express the essence
User: bye
FAST LHR Chatbot says goodbye.


: 