In [None]:
import nltk
#import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
# Scrapping the website
def scrape_website(url, depth=2, visited=None):
    if visited is None:
        visited = set()
    if depth == 0 or url in visited:
        return ""
    
    visited.add(url)
    try:
        link = urllib.request.urlopen(url).read()
        data = bs.BeautifulSoup(link, 'lxml')
        data_paragraphs = data.find_all('p')
        text = " ".join(para.text for para in data_paragraphs)
        
        
        for a_tag in data.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/') or 'lhr.nu.edu.pk' in href:
                full_url = urllib.parse.urljoin(url, href)
                text += scrape_website(full_url, depth - 1, visited)
        
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""


In [None]:
# Scrapping FAST-NUCES Lahore website
base_url = 'https://lhr.nu.edu.pk/'
print("Scraping FAST-NUCES Lahore website...")
website_text = scrape_website(base_url)
print("Scraping complete.")

Scraping FAST-NUCES Lahore website...
Error scraping https://lhr.nu.edu.pk/campusLife/: HTTP Error 500: Internal Server Error
Error scraping https://lhr.nu.edu.pk/fsm/programDetails/BS (Business Analytics): URL can't contain control characters. '/fsm/programDetails/BS (Business Analytics)' (found at least ' ')
Error scraping https://lhr.nu.edu.pk/fsm/programDetails/MS (Business Analytics): URL can't contain control characters. '/fsm/programDetails/MS (Business Analytics)' (found at least ' ')
Error scraping https://lhr.nu.edu.pk/ss/programDetails/MS (English Language Teaching): URL can't contain control characters. '/ss/programDetails/MS (English Language Teaching)' (found at least ' ')
Scraping complete.


In [4]:
website_text

'\n\n            \t\t    \t\t\t\n\t\t\t      Bachelor of Business Administration\n                             \n \n\n\t\t\t    \t\t\t\n\t\t\t      BS (Business Analytics)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      BS (Accounting & Finance)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      MS (Accounting & Finance)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      Master of Business Administration\n                             \n \n\n\t\t\t    \t\t\t\n\t\t\t      MS (Business Analytics)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      PhD (Management Sciences)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      BS (Computer Science)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      BS (Data Science)\n                             \n \n\n            \t\t    \t\t\t\n\t\t\t      BS (Software Engineering)\n                   

In [5]:
# Preprocess text
print("Processing scraped data...")
website_text = website_text.lower()
website_text = re.sub(r'\[[0-9]*\]', ' ', website_text)
website_text = re.sub(r'\s+', ' ', website_text)

Processing scraped data...


In [6]:
# Tokenization
sen = nltk.sent_tokenize(website_text)
words = nltk.word_tokenize(website_text)

In [None]:
words

NameError: name 'words' is not defined

: 

In [None]:
# Lemmatization is done here
wnlem = nltk.stem.WordNetLemmatizer()
def perform_lemmatization(tokens):
    return [wnlem.lemmatize(token) for token in tokens]

# Preprocessing is done here
pr = dict((ord(punctuation), None) for punctuation in string.punctuation)
def get_processed_text(document):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(pr)))

# Greeting function for the chatbot
greeting_inputs = ("hey", "hello", "aoa user")
greeting_responses = ["Hey!", "Wasalam!"]
def generate_greeting_response(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)


In [None]:
# Fallback which is tf-idf
def generate_tfidf_response(user_input):
    bot_response = ''
    sen.append(user_input)
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    word_vectors = word_vectorizer.fit_transform(sen)
    similar_vector_values = cosine_similarity(word_vectors[-1], word_vectors)
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        bot_response = bot_response + 'Sorry, I could not understand your query.'
    else:
        bot_response = bot_response + sen[similar_sentence_number]
    sen.pop(-1)  
    return bot_response


print("Loading GPT model...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


def generate_gpt_response(user_input):
    """
    Generate a response using GPT, prioritizing relevant sentences from website content.
    """
    
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    word_vectors = word_vectorizer.fit_transform(sen + [user_input])
    cosine_similarities = cosine_similarity(word_vectors[-1], word_vectors[:-1]).flatten()
    
    
    top_indices = cosine_similarities.argsort()[-3:][::-1]
    relevant_sentences = " ".join([sen[i] for i in top_indices])
    
    
    prompt = f"Relevant information:\n{relevant_sentences}\n\nUser query: {user_input}\n\nAnswer:"
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("Answer:")[-1].strip()  
    return response

# Chatbot 
continue_flag = True
print("Aoa, this is the FAST LHR Chatbot for answering related queries with the FAST LHR website.")
while continue_flag:
    human = input("User: ")
    print("User:", human)
    human = human.lower()
    if human != 'bye':
        if human == 'thanks' or human == 'thank you':
            continue_flag = False
            print("FAST LHR Chatbot: You're welcome.")
        else:
            if generate_greeting_response(human) is not None:
                print("FAST LHR Chatbot: " + generate_greeting_response(human))
            else:
                print("FAST LHR Chatbot:", end="")
                try:
                    response = generate_gpt_response(human)  
                    print(response)
                except Exception as e:
                    print("Error using GPT. Falling back to TF-IDF.")
                    print(generate_tfidf_response(human))  
    else:
        continue_flag = False
        print("FAST LHR Chatbot says goodbye.")

Loading GPT model...
Aoa, this is the FAST LHR Chatbot for answering related queries with the FAST LHR website.
User: Who Is Dr. Kashif Zafar
FAST LHR Chatbot:Dr. kashif zafar is a professor of computer science at fast nuces. He is also the director of the computer science department at fast nuces.

Dr. kashif zafar is also the director of the computer science department at fast nuces.

Dr. kashif zafar is also the director of the computer science department at fast nu
User: Who is Dr. Hajra Waheed
FAST LHR Chatbot:Error using GPT. Falling back to TF-IDF.
waheed iqbal visited fast nuces lahore on 17th february and conducted a session on predictive auto-scaling of multi-tier applications using performance varying cloud resources.
User: Who is Dr. Arshad Ali
FAST LHR Chatbot:Dr. arshad ali is a member of the board of governors, along with the campus director (dr hammad naveed), heads of departments (hods), and subject experts, attended the meeting.

User query: who


: 

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from datasets import Dataset


def preprocess_for_finetuning(data):
    text_data = ' '.join(data.split('. ')) 
    return text_data


scraped_data = website_text 
processed_data = preprocess_for_finetuning(scraped_data)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
inputs = tokenizer(processed_data, return_tensors="pt", max_length=512, truncation=True, padding=True)
dataset = Dataset.from_dict({"input_ids": inputs["input_ids"]})
model = GPT2LMHeadModel.from_pretrained("gpt2")


training_args = TrainingArguments(
    output_dir='./results',         
    num_train_epochs=1,              
    per_device_train_batch_size=1,  
    save_steps=10_000,              
    save_total_limit=2,              
)


trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=dataset                
)

# Training model
trainer.train()


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Manual testing for the chatbot
manual_test_queries = [
    "What programs are offered at FAST LHR?",
    "Tell me about the admissions process.",
    "What is the campus like?",
    "Who is the faculty at FAST LHR?",
    "What societies are available?",
    "What is the eligibility for admission?",
    "How do I apply for a scholarship?",
    "What is the fee structure?",
    "What are the labs available for CS students?",
    "Where is the university located?",
    "What is the university's ranking?",
    "What is the student culture at FAST LHR?",
    "How can I get in touch with faculty members?",
    "What are the graduation requirements?",
    "How many students are enrolled at FAST LHR?",
    "What is the process for international students?",
    "What are the hostel facilities like?",
    "How is the placement process at FAST LHR?",
    "Are there any internship opportunities?",
    "Can I change my program after admission?"
]


def generate_response_from_model(query):
    
    inputs = tokenizer.encode(query, return_tensors="pt")
    
    
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response.strip()

# Evaluate BLEU score and cosine similarity
def evaluate_model(test_queries, expected_responses):
    bleu_scores = []
    cosine_similarities = []

    for query, expected_response in zip(test_queries, expected_responses):
        
        model_response = generate_response_from_model(query)
        
       
        bleu = sentence_bleu([expected_response.split()], model_response.split())
        bleu_scores.append(bleu)
        
        
        cosine_sim = cosine_similarity(
            np.array([get_processed_text(expected_response)]), 
            np.array([get_processed_text(model_response)])
        )[0][0]
        cosine_similarities.append(cosine_sim)
        
       
        print(f"Query: {query}")
        print(f"Expected Response: {expected_response}")
        print(f"Model Response: {model_response}")
        print(f"BLEU Score: {bleu}")
        print(f"Cosine Similarity: {cosine_sim}\n")
    
    
    avg_bleu = np.mean(bleu_scores)
    avg_cosine_sim = np.mean(cosine_similarities)
    
    print(f"Average BLEU Score: {avg_bleu}")
    print(f"Average Cosine Similarity: {avg_cosine_sim}")
    

expected_responses = [
    "The programs offered at FAST LHR include undergraduate and postgraduate degrees in engineering, business, and computer science.",
    "The admissions process includes an online application followed by an interview and test for eligible candidates.",
    "The campus at FAST LHR has state-of-the-art facilities including libraries, sports complexes, and modern classrooms.",
    "FAST LHR has highly qualified faculty members in various departments including engineering, business, and computer science.",
    "FAST LHR offers several student-run societies for extracurricular activities including sports, technology, and culture.",
    "The eligibility for admission varies depending on the program and includes academic qualifications and entrance test scores.",
    "Students can apply for scholarships based on merit or need through the university's financial aid office.",
    "The fee structure is determined based on the program and semester. Scholarships and financial aid options are available.",
    "CS students have access to advanced labs including AI, machine learning, and software development.",
    "The university is located in Lahore, Pakistan, with campuses in other cities as well.",
    "FAST LHR is ranked as one of the top universities in Pakistan, particularly in engineering and business.",
    "The student culture at FAST LHR is vibrant with numerous events, clubs, and activities throughout the year.",
    "Students can contact faculty members through the university's email system or in person during office hours.",
    "Graduation requirements include completion of coursework, internships, and a final project or thesis.",
    "There are over 10,000 students enrolled in various programs at FAST LHR.",
    "International students must meet specific visa requirements and undergo an admissions process similar to local students.",
    "There are no hostels in FAST LHR. However, there are many hostels near the campus so the students can choose to reside in them if they wish for their convenience.",
    "The placement process includes career fairs, interviews, and company visits to help students find suitable employment.",
    "Internship opportunities are provided through university partnerships with leading companies.",
    "Program changes are allowed based on academic performance and subject to approval by the administration."
]


evaluate_model(manual_test_queries, expected_responses)
