In [1]:
import random
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import pickle

In [2]:
import os
import openai
import sys
import langchain

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

langchain.debug=True

### Loading URLS

In [3]:
# Load dictionary from the file
with open('../data/end_page_urls.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)

# url_dict
url_dict

{'https://enm.yildiz.edu.tr/': True,
 'https://enm.yildiz.edu.tr/#main-content': True,
 'https://enm.yildiz.edu.tr/hakkimizda': True,
 'https://enm.yildiz.edu.tr/hakkimizda/aday-ogrenciler': True,
 'https://enm.yildiz.edu.tr/hakkimizda/bolum-baskanligimiz': True,
 'https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/images/files/2-PÇ Ders İlişki Matrisi (GENEL) 02_03_2022.pdf': 'PDF',
 'https://enm.yildiz.edu.tr/hakkimizda/ders-program-ciktilari-matrisi': True,
 'https://enm.yildiz.edu.tr/hakkimizda/ders-plan-ve-icerikleri': True,
 'https://enm.yildiz.edu.tr/hakkimizda/egitim-amaclarimiz': True,
 'https://enm.yildiz.edu.tr/hakkimizda/misyon-ve-vizyonumuz': True,
 'https://enm.yildiz.edu.tr/hakkimizda/program-ciktilarimiz': True,
 'https://enm.yildiz.edu.tr/kisiler': True,
 'https://enm.yildiz.edu.tr/kisiler/tam-zamanli-ogretim-uyeleri': True,
 'https://enm.yildiz.edu.tr/kisiler/arastirma-gorevlileri': True,
 'https://enm.yildiz.edu.tr/kisiler/muhendisler': True,
 'https://enm.yildiz.e

### Scraping

In [5]:
def find_longest_common_subtexts(*texts, min_length=100) -> list:
    """
    Finds the largest common substring in a list of strings.
    
    Parameters:
    - texts (tuple): Variable number of text arguments.
    - min_length (int): Minimum length of the common substring to consider it significant.
    
    Returns:
    - str: The longest common substrings found across all input texts that meets or exceeds min_length.
    """
    
    def lcs(str1, str2):
        """
        Helper function to find the longest common substring between two strings using dynamic programming.
        """
        m, n = len(str1), len(str2)
        dp = [[0] * (n+1) for _ in range(2)]
        max_length, end_pos = 0, 0
        
        for i in range(1, m+1):
            for j in range(1, n+1):
                if str1[i-1] == str2[j-1]:
                    dp[i % 2][j] = dp[(i-1) % 2][j-1] + 1
                    if dp[i % 2][j] > max_length:
                        max_length = dp[i % 2][j]
                        end_pos = i
                else:
                    dp[i % 2][j] = 0
                    
        return str1[end_pos - max_length: end_pos]
    
    def find_largest_common_subtext(texts, min_length):
        if len(texts) < 2:
            return ""
        
        common_subtext = lcs(texts[0], texts[1])
        
        for text in texts[2:]:
            if len(common_subtext) < min_length:
                return ""
            common_subtext = lcs(common_subtext, text)
            if len(common_subtext) < min_length:
                return ""
        
        return common_subtext if len(common_subtext) >= min_length else ""
    
    common_subtexts = []
    modified_texts = list(texts)
    while True:
        common_subtext = find_largest_common_subtext(modified_texts, min_length)
        if not common_subtext:
            break
        common_subtexts.append(common_subtext)
        
        modified_texts = [text.replace(common_subtext, "") for text in modified_texts]
    
    return common_subtexts



def process_urls(url_dict, random_common_docs_n=3):
    web_docs = []
    unprocessed_urls = []
    
    # Choose random pages to find the common subtext and remove them.
    random_docs = []
    while random_common_docs_n:
        url, url_type = random.choice(list(url_dict.items()))
        if url_type != 'PDF':
            try:
                random_docs.append(WebBaseLoader(url).load()[0].page_content)
                random_common_docs_n -= 1
            except Exception as e:
                print(f'Error while choosing random doc: {str(e)}')
    
    common_subtexts = find_longest_common_subtexts(*random_docs)
    
    
    # Scraping and cleaning web pages 
    for i, url in enumerate(url_dict):
        try:
            print(f'Scraping url {i+1}')
            if url_dict[url] != 'PDF':
                web_loader = WebBaseLoader(url)
                web_doc = web_loader.load()
                edited_content = web_doc[0].page_content
                
                for common_subtext in common_subtexts:
                    edited_content = edited_content.replace(common_subtext, "")
                while "\n\n\n" in edited_content:
                    edited_content = edited_content.replace("\n\n\n", "")

                    web_doc[0].page_content = edited_content
                web_docs.append(web_doc[0])
            else:
                loader = PyPDFLoader(url)
                pages = loader.load()
                edited_content = pages[0].page_content
                while "\n\n\n" in edited_content:
                    edited_content = edited_content.replace("\n\n\n", "")
                web_docs.append(pages[0])
                
        except Exception as e:
            print(f'Error scraping url {i+1}: {str(e)}')
            unprocessed_urls.append(url)

    print('Scraping Completed')
    return web_docs, unprocessed_urls



def text_splitter(web_docs):
    print('\nSplitting text ...')
    
    r_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, 
        chunk_overlap=256,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    splitted_docs = r_splitter.split_documents(web_docs)
    splitted_docs = [split for split in splitted_docs if len(split.page_content) > 150]
    
    print(f'Splitted chunks number: {len(splitted_docs)}\n')
    return splitted_docs



def extract_text_from_url(url):
    """
    Extract text from the end of the URL, correctly handling /text and /text/number cases.
    """
    # Extracting the path from the URL and splitting it
    path_segments = urlparse(url).path.split('/')
    # Filtering out empty segments and digits-only segments
    text_segments = [segment for segment in path_segments if segment and not segment.isdigit()]

    # The last segments in text_segments should be the text we want to extract
    if len(text_segments) >= 2:
        # If there are at least 2 elements, return the last two joined by ' / '
        return text_segments[-2] + ' / ' + text_segments[-1]
    elif len(text_segments) == 1:
        # If there's only one element, return it
        return text_segments[-1]
    else:
        # If the list is empty, return None
        return None


def add_metadata(splitted_docs):
    """
    Adds the metadata at the beginning of each document, accounting for missing title or page number.
    """
    for doc in tqdm(splitted_docs, desc="Adding metadata to documents"):
        # Initialize the metadata string
        metadata_str = f"Source: {doc.metadata['source']}\n"
        
        # Conditionally add title if it exists
        if 'title' in doc.metadata and doc.metadata['title']:
            metadata_str += f"Page Title: {doc.metadata['title']}\n"
        
        # Add subject extracted from source URL
        metadata_str += f"Subject: {extract_text_from_url(doc.metadata['source'])}\n"
        
        # Conditionally add PDF page number if it's a PDF and the page number exists
        if doc.metadata['source'].endswith('.pdf') and 'page' in doc.metadata:
            metadata_str += f"Pdf Page Number: {doc.metadata['page']}\n"
        
        # Append the updated metadata to the page content
        doc.page_content = metadata_str + "\n" + doc.page_content
    
    return splitted_docs



def enhance_meta_data(processed_docs):
    '''
    Check if the document is an anouncement/duyuru/haber document and adds
    this information to the metada. 
    '''
    for doc in tqdm(processed_docs, desc="Enhancing documents metadata"):
        if 'haberler' in doc.metadata['source']:
            doc.metadata['Type'] = 'haber'
        elif 'duyuru' in doc.metadata['source']:
            doc.metadata['Type'] = 'duyuru'
        elif 'etkinlik' in doc.metadata['source']:
            doc.metadata['Type'] = 'etkinlik'
        else:
            doc.metadata['Type'] = 'general'
            
    return processed_docs
            


def Preprocess(url_dict):
    
    web_docs, unprocessed_urls = process_urls(url_dict)
    splitted_docs = text_splitter(web_docs)
    processed_docs = add_metadata(splitted_docs)
    preprocessed_docs = enhance_meta_data(processed_docs)
    print('\nYou are done!')
    
    return preprocessed_docs


preprocessed_docs = Preprocess(url_dict)

Scraping url 1
Scraping url 2
Scraping url 3
Scraping url 4
Scraping url 5
Scraping url 6
Scraping url 7
Scraping url 8
Scraping url 9
Scraping url 10
Scraping url 11
Scraping url 12
Scraping url 13
Scraping url 14
Scraping url 15
Scraping url 16
Scraping url 17
Scraping url 18
Scraping url 19
Scraping url 20
Scraping url 21
Scraping url 22
Scraping url 23
Scraping url 24
Scraping url 25
Scraping url 26
Scraping url 27
Scraping url 28
Scraping url 29
Scraping url 30
Scraping url 31
Scraping url 32
Scraping url 33
Scraping url 34
Scraping url 35
Scraping url 36
Scraping url 37
Scraping url 38
Scraping url 39
Scraping url 40
Scraping url 41
Scraping url 42
Scraping url 43
Scraping url 44
Scraping url 45
Scraping url 46
Scraping url 47
Scraping url 48
Scraping url 49
Scraping url 50
Scraping url 51
Scraping url 52
Scraping url 53
Scraping url 54
Scraping url 55
Scraping url 56
Scraping url 57
Scraping url 58
Scraping url 59
Scraping url 60
Scraping url 61
Scraping url 62
Scraping url 63
S

Adding metadata to documents: 100%|███████| 647/647 [00:00<00:00, 219343.25it/s]
Enhancing documents metadata: 100%|██████| 647/647 [00:00<00:00, 2351572.52it/s]


You are done!





In [7]:
print(preprocessed_docs[10].page_content)
print('\n----------------------\n')
print(preprocessed_docs[10].metadata)

Source: https://enm.yildiz.edu.tr/#main-content
Page Title: Anasayfa | YTÜ Endüstri Mühendisliği Bölümü
Subject: None

08Mar
                   Sabancı Üniversitesi Endüstri Mühendisliği Optimization Challenge Hk. 
          
 

01Mar
                   Kabul Edilen Ders Çakışmaları Hk. 
          TÜMÜ

 

Etkinlikler

YIL2024202320222021AYOcakŞubatMartNisanMayısHaziranTemmuzAğustosEylülEkimKasımAralıkTümü 18Mar
                   Case in Point Vaka Analizi Yarışması YTÜ Mentorluk Kulübü 
         
           28Şub
                   WHEN STARS ALIGN_YTU Mentorluk Kulübü 
         
           09Şub
                   19th International Supply Chain Camp Etkinliği 
         
           11Mar
                   StarTech (Part-Time Mühendis Öğrenci Programı) 
         
           04Mar
                   Yaz Staj Okulu 
         
          

TÜMÜ

Hızlı Erişim
AdaylarEğitim Yönetim SistemiÖğrenci Bilgi SistemleriAkademik TakvimDers ProgramlarıSınav ProgramlarıAkreditasyonYönetmeliklerMezu

## Vectorstores

In [8]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=preprocessed_docs,
    embedding=embedding,
    persist_directory='../db/end_website'
)
vectordb.persist()

# vectordb = Chroma(persist_directory='./db/...', embedding_function=embedding)

In [136]:
# vectordb.persist()
