In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re

def scrape_to_csv(url, output_csv_path):
    # Fetch the page
    resp = requests.get(url)
    if resp.status_code != 200:
        raise Exception(f"Failed to fetch page: status code {resp.status_code}")
    
    html = resp.text
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find all <center> tags
    centers = soup.find_all('center')
    print(f"Found {len(centers)} <center> tags.")
    
    csv_file = open(output_csv_path, mode='w', newline='', encoding="utf-8-sig")
    writer = None  # initialized when we see first table
    
    def clean(s):
        return re.sub(r'\s+', ' ', s.strip())

    #check if row contains text
    def row_contains(row_data, text):
        return any(text in str(cell) for cell in row_data)
    
    stop_scraping = False
    row_headers= False
    for center in centers:
        if stop_scraping:
            break
        tables = center.find_all('table')
        for tbl_index, table in enumerate(tables):
            if stop_scraping:
                break
            rows = table.find_all('tr')
            for row_index, tr in enumerate(rows):
                cols = tr.find_all(['td', 'th'])
                row_data = [clean(col.get_text()) for col in cols]
                
                if not row_data:
                    continue
                
                # Stop condition
                if row_contains(row_data, "طباعة"):
                    print("Encountered 'طباعة' – stopping scrape.")
                    stop_scraping = True
                    break
              
                # Skip rows that contain these phrases
                if row_contains(row_data, "الجدول الدراسي : الفصل الأول 1447(طلاب-الأحساء)") or row_contains(row_data, "صفحة") or row_contains(row_data, "القسم") or (row_contains(row_data, "الوقت") and row_headers):
                        continue

                
                # Initialize writer with header row
                if writer is None:
                    headers = row_data
                    writer = csv.writer(csv_file)
                    writer.writerow(headers)
                    continue
                
                # Write data row
                print(f"row #{row_index} is: {row_data}")
                row_headers=True
                writer.writerow(row_data)
    
    csv_file.close()
    print(f"Data written to {output_csv_path}")

if __name__ == "__main__":
    url = "https://ssb-ar.kfu.edu.sa/PROD_ar/ws?p_trm_code=144710&p_col_code=09&p_sex_code=11" #main kfu courses page for CS
    output_csv = "kfu_schedule.csv"
    scrape_to_csv(url, output_csv)


Found 12 <center> tags.
row #0 is: ['0901-204', '68726', '006', 'غير متاحه', 'البرمجة الهندسية', '3', 'ن', 'نظري', '1330 - 1445', '', '0817-144', '22', '']
row #0 is: ['0901-204', '68726', '006', 'غير متاحه', 'البرمجة الهندسية', '3', 'خ', 'نظري', '1330 - 1445', '', '0817-144', '22', '']
row #0 is: ['0901-204', '56920', '01', 'متاحه', 'البرمجة الهندسية', '3', 'ح', 'نظري', '0900 - 1015', 'مروان محمد امين الحاج', '0817-144', '22', '']
row #0 is: ['0901-204', '56920', '01', 'متاحه', 'البرمجة الهندسية', '3', 'ر', 'نظري', '0900 - 1015', 'مروان محمد امين الحاج', '0817-144', '22', '']
row #0 is: ['0901-204', '28331', '02', 'ممتلئة', 'البرمجة الهندسية', '3', 'ر', 'نظري', '1030 - 1145', 'نعيم حمد', '0817-144', '22', '']
row #0 is: ['0901-204', '28331', '02', 'ممتلئة', 'البرمجة الهندسية', '3', 'ح', 'نظري', '1030 - 1145', 'نعيم حمد', '0817-144', '22', '']
row #0 is: ['0901-204', '64108', '03', 'ممتلئة', 'البرمجة الهندسية', '3', 'ح', 'نظري', '1200 - 1315', 'مروان محمد امين الحاج', '0817-144', '22',

In [1]:
!pip install pdfplumber
import pdfplumber
print(pdfplumber.__version__)

0.11.7


In [105]:
import pdfplumber
import csv
import re

input_pdf = "black_board_1447_10.pdf"
output_csv = "bb_tables_fixed.csv"

# Patterns to detect times, course numbers, and CRNs
time_pattern = re.compile(r"^\d{1,2}\.\d{2}\-\d{1,2}\.\d{2}$")
course_pattern = re.compile(r"^\d{3,4}\-\d{3,4}$")
crn_pattern = re.compile(r"^\d+$")

# Day mapping for Arabic days
day_mapping = {
    "االحد": "ح",
    "االثنين": "ن",
    "الثالثاء": "ث",
    "األربعاء": "ر",
    "الخميس": "خ",
}

with pdfplumber.open(input_pdf) as pdf:
    all_rows = []
    first_row_skipped = False
    tempAppend = ''
    
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            for row in table:
                # Skip empty rows
                if not any(row):
                    continue
                
                # Skip the first row
                if not first_row_skipped:
                    first_row_skipped = True
                    continue
                
                # Check if any cell (after index 4) has None in the next position
                has_none_next = False
                for i in range(4, len(row) - 1):
                    if row[i] is not None and row[i + 1] is None:
                        has_none_next = True
                        break
                
                # If row has None in next cell, accumulate tempAppend and skip processing
                if has_none_next:
                    for cell in row[4:]:
                        if cell is not None and cell.strip():
                            tempAppend = tempAppend + " " + cell.strip()
                    # Clear tempAppend if it contains "كليات" (reversed check)
                    if "كليات" in tempAppend[::-1]:
                        tempAppend = ""
                    print(f"Skipping row, accumulated: {tempAppend[::-1]}")
                    continue
                
                # Otherwise, process full rows
                processed_row = []
                
                for currentRowNum, cell in enumerate(row):
                    print(f"current row is {currentRowNum}")
                    
                    if cell is not None:
                        text = cell.strip()
                        
                        # Skip reversal for CRN, course number, time
                        if (
                            time_pattern.match(text)
                            or course_pattern.match(text)
                            or crn_pattern.match(text)
                        ):
                            if time_pattern.match(text):
                                text = text.replace(".", "")
                            processed_row.append(text)
                        else:
                            # For column 4, append accumulated text if available
                            if currentRowNum == 4 and tempAppend:
                                text = text + " " + tempAppend
                                tempAppend = ''  # Clear after using
                                print(f"Added tempAppend to text: {text[::-1]}")
                            
                            # Replace day names
                            reversed_text = text[::-1]
                            if reversed_text in day_mapping:
                                processed_row.append(day_mapping[reversed_text])
                            else:
                                processed_row.append(reversed_text)  # Reverse everything else
                    else:
                        processed_row.append("")
                
                # Add processed row to output (columns 4 to second-to-last)
                row_to_add = processed_row[4:-1]
                # Only add if row is not empty
                if any(cell for cell in row_to_add):
                    all_rows.append(row_to_add)
                print(f"temp append after row: {tempAppend[::-1]}")
                print("-------------------")

# Save CSV with UTF-8 BOM for proper Arabic display in Excel
with open(output_csv, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerows(all_rows)

print(f"Extracted {len(all_rows)} rows (Arabic reversed except for times, course numbers, and CRNs) and saved to '{output_csv}'.")

current row is 0
current row is 1
current row is 2
current row is 3
current row is 4
current row is 5
current row is 6
current row is 7
current row is 8
current row is 9
current row is 10
current row is 11
current row is 12
current row is 13
temp append after row: 
-------------------
current row is 0
current row is 1
current row is 2
current row is 3
current row is 4
current row is 5
current row is 6
current row is 7
current row is 8
current row is 9
current row is 10
current row is 11
current row is 12
current row is 13
temp append after row: 
-------------------
current row is 0
current row is 1
current row is 2
current row is 3
current row is 4
current row is 5
current row is 6
current row is 7
current row is 8
current row is 9
current row is 10
current row is 11
current row is 12
current row is 13
temp append after row: 
-------------------
current row is 0
current row is 1
current row is 2
current row is 3
current row is 4
current row is 5
current row is 6
current row is 7
curren

In [5]:
#!pip install openai
#Please install OpenAI SDK first: `pip3 install openai`
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "this is a test message"},
    ],
    stream=False
)

print(response.choices[0].message.content)

Hello! I'm here and ready to help. If you have any questions or need assistance with anything, feel free to ask! 😊
