# Group Members 
#### IM/2021/007 - S.D.S.H. SAMARAKKODI
#### IM/2021/035 - R.L.L.T. SAMPATH
#### IM/2021/053 - N.W.I.M. PRASAN
#### IM/2021/086 - T.M.S.P. DHANAPALA


# Task: 
Extract the data from the given URL and save it in a CSV file. 

The data to be extracted includes the following fields:
1. Name
2. Designation
3. Room
4. Fax
5. Phone
6. Email
7. Specialization(s)
    
The URL to scrape is: https://dim.kln.ac.lk/index.php/staff/academic-staff
The data should be saved in a CSV file.

# Approach:

1. Prompt the user to enter a filename to save the data.

2. Check if a file with the same name already exists in the directory.

3. If the file exists, ask the user if they want to overwrite the existing file.

4. If the user chooses to overwrite, break the loop and proceed.

5. If the user chooses not to overwrite, prompt the user for a new filename.

6. If the file does not exist, proceed with scraping the data.

7. Define the URL to scrape and send a GET request to the URL.

8. Parse the HTML content using BeautifulSoup.

9. Find all elements with class "sppb-column-addons" which contain the lecturer information.

10. Extract the required fields (Name, Designation, Room, Fax, Phone, Email, Specialization(s)).

11. Extract the specialization(s) from the individual lecturer pages.

12. Save the extracted data in a CSV file with the provided filename.

13. Handle any exceptions that occur during the scraping process and display an appropriate error message.


# Code:


In [1]:
# Import necessary libraries for web scraping and working with CSV files
import requests as rs
from bs4 import BeautifulSoup as Bs
import csv
import os


# Function to check if a specialization text is valid
def is_valid_specialization(txt):
    """
    Report whether a piece of text is free of every blacklisted marker.

    Parameters:
    txt: The value to check. It is converted with str() and stripped of
        surrounding whitespace before being examined, so non-string
        values (including None) are accepted.

    Returns:
    bool: False if the stripped text contains any of the marker
        substrings listed below, True otherwise (empty text is
        therefore considered valid).
    """
    # Normalise the input to a stripped string
    txt = str(txt).strip()

    # Substrings whose presence makes the text invalid
    invalid_strings = ('BSc', 'IEEE', ',', 'PhD', 'B.Sc',
                       'Member', '.', ':', 'Ph D', '?', '2', ')', ' - ')

    # Valid only when none of the markers occurs anywhere in the text
    return not any(marker in txt for marker in invalid_strings)


# Function to format designation string
def format_designation(designation):
    """
    Strip every '(' and ')' character out of a designation.

    Parameters:
    designation (str): The designation text, possibly wrapped in or
        containing parentheses.

    Returns:
    str: The same text with all opening and closing parentheses deleted;
        every other character is left untouched.
    """
    # A translation table that maps both parenthesis characters to "deleted"
    drop_parentheses = str.maketrans('', '', '()')
    without_parentheses = designation.translate(drop_parentheses)
    return str(without_parentheses)


# Ask for the base name (no extension) of the CSV file to create
filename = input("Enter the filename to save the data (without extension): ")

# Keep asking for as long as the chosen name collides with an existing file
while os.path.exists(f"{filename}.csv"):
    overwrite = input(
        f"A file named '{filename}.csv' already exists. Do you want to overwrite it? (y/n): ")
    answer = overwrite.lower()

    if answer == 'y':
        # The user accepted overwriting the existing file; keep this name
        break

    if answer == 'n':
        # Try again with a different name; the loop condition re-checks it
        filename = input(
            "Enter a new filename to save the data: ")
        continue

    # Anything other than y/n: repeat the same question
    print("Invalid input. Please enter 'y' or 'n'.")

# Define global variables
page_links = []           # distinct profile-page URLs, in the order they appear
all_specializations = []  # exactly one list of specializations per page link
lecturers_info_list = []  # one [name, designation, room, fax, phone, emails] row per lecturer

# Define the URL to scrape
url = "https://dim.kln.ac.lk/index.php/staff/academic-staff"

# Seconds to wait for each HTTP response; without a timeout a stalled
# server would block the script forever
REQUEST_TIMEOUT = 30


def _value_after_colon(tag):
    """Return the stripped text between the first and second ':' of a tag.

    Returns '' when the tag is missing or its text contains no colon, so a
    malformed "Room"/"Fax"/"Phone" paragraph cannot raise IndexError.
    """
    if not tag:
        return ''
    parts = tag.text.strip().split(":")
    return parts[1].strip() if len(parts) > 1 else ''


def _extract_specializations(profile_soup):
    """Collect the <li> texts found under the "Area" panels of one profile page.

    Always returns a single (possibly empty) list, so every profile page
    contributes exactly one entry to all_specializations.
    """
    specializations = []
    for span in profile_soup.find_all('span', class_="sppb-panel-title"):
        if "Area" not in span.text:
            continue

        panel = span.find_next('div', class_="sppb-panel-collapse")
        if panel is None:
            # Title without a following panel body: nothing to read
            continue

        for content in panel:
            # Text nodes (BeautifulSoup strings are str subclasses) have no
            # find_all(); only element children can hold <li> items
            if isinstance(content, str):
                continue
            if is_valid_specialization(content.text):
                specializations.extend(
                    item.text.strip() for item in content.find_all('li'))
    return specializations


try:
    # Send a GET request to the URL
    response = rs.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

    # Parse the HTML content
    soup = Bs(response.content, 'html.parser')

    # Find all staff divisions
    staff_divs = soup.find_all('div', class_="clearfix")

    # Extract page links from staff divisions. The same anchor can be reached
    # more than once (e.g. through nested "clearfix" divs), so each URL is
    # kept only the first time it is seen.
    seen_links = set()
    for div in staff_divs:
        for link in div.find_all('a', href=True):
            href = link['href'].strip()
            if "https://science.kln.ac.lk/depts/im/index.php/" in href and href not in seen_links:
                seen_links.add(href)
                page_links.append(href)

    # Extract specializations from each page link (one list per link)
    for link in page_links:
        response2 = rs.get(link, timeout=REQUEST_TIMEOUT)
        response2.raise_for_status()

        soup2 = Bs(response2.content, 'html.parser')
        all_specializations.append(_extract_specializations(soup2))

    # Extract lecturer information
    # NOTE(review): rows are later paired with all_specializations by position,
    # which presumes the profile links appear in the same order as the lecturer
    # cards, one distinct link per card - confirm against the live page.
    lecturers = soup.find_all(class_="sppb-column-addons")

    # Iterate over each lecturer element and extract the required information
    for lecturer in lecturers:

        # Extract lecturer's name
        name_tag = lecturer.find('h3')
        name = name_tag.text.strip() if name_tag else ''

        # Extract lecturer's designation
        designation_tag = lecturer.find('strong')
        designation = format_designation(
            designation_tag.text.strip()) if designation_tag else ''

        # Extract lecturer's room, fax number and phone number; each is the
        # text after the colon of the <p> whose string mentions the label
        room = _value_after_colon(
            lecturer.find('p', string=lambda x: x and 'Room' in x))
        fax = _value_after_colon(
            lecturer.find('p', string=lambda x: x and 'Fax' in x))
        phone = _value_after_colon(
            lecturer.find('p', string=lambda x: x and 'Phone' in x))

        # Extract lecturer's email(s)
        email_tags = lecturer.find_all(
            'a', href=lambda x: x and 'mailto:' in x)
        emails = ', '.join(email.text.strip() for email in email_tags)

        # Keep the row only if at least one field was found
        if any([name, designation, room, fax, phone, emails]):
            lecturers_info_list.append(
                [name, designation, room, fax, phone, emails])

except rs.ConnectionError:
    # The request could not reach the server at all
    print("\n\tAn error occurred while sending the request.\n\tPlease check your internet connection!!\n")
except rs.RequestException:
    # Any other request failure (bad status code, timeout, ...)
    print("\n\tUnable to send the request to the URL. Please try again later.")


# Check if the data was extracted successfully
if not lecturers_info_list:
    print("\tNo data was extracted!\n\tExiting the program.\n")
    exit()
else:
    # Write the data to a CSV file
    try:
        # The with-statement closes the file on success and on error, so no
        # explicit close is needed. The encoding is given explicitly so the
        # output does not depend on the platform's default locale encoding.
        with open(f"{filename}.csv", 'w', newline='', encoding='utf-8') as csv_file:

            # Create a CSV writer object
            csv_writer = csv.writer(csv_file)

            # Write the headers to the CSV file
            csv_writer.writerow(
                ['Name', 'Designation', 'Room', 'Fax', 'Phone', 'Email', 'Specialization(s)'])

            # Write lecturer information along with their specializations.
            # Specializations are matched to lecturers by position; a lecturer
            # with no corresponding entry gets an empty column instead of
            # aborting the export with an IndexError.
            for index, lecturer_info in enumerate(lecturers_info_list):
                if index < len(all_specializations):
                    specializations = all_specializations[index]
                else:
                    specializations = []
                csv_writer.writerow(
                    lecturer_info + [', '.join(specializations)])

        print(f"Data has been successfully written to '{filename}.csv'.")

    except PermissionError:
        print("\n\tThe file is already open in another program. \n\tPlease close it and try again!\n")

    except Exception:
        # Kept on one line: a line break inside the braces of a single-quoted
        # f-string is a syntax error on Python versions before 3.12
        print(f"\n\tAn error occurred when writing the data to '{filename}.csv'!\n")


	An error occurred while sending the request.
	Please check your internet connection!!

	No data was extracted!
	Exiting the program.



: 