# Import libraries

In [50]:
import requests 
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import json
import random 

# Website configuration

In [51]:
# Browser-like User-Agent string sent with every request made in this notebook.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'

# HTTP request headers shared by the `requests` sanity check and the crawler.
headers = {'User-Agent': USER_AGENT}

In [52]:
# Root of the site; every path below is relative to it.
baseURL = 'https://www.fit.hcmus.edu.vn'

# Site-relative paths used as crawl seeds (presumably the news category
# listing pages -- confirm against the site).
CATEGORY_PATHS = (
    '/tin-tuc/c/thong-bao-chung',
    '/tin-tuc/c/thong-tin-hoc-bong',
    '/tin-tuc/c/thong-bao-he-chinh-quy',
    '/tin-tuc/c/thong-bao-he-hoan-chinh',
    '/tin-tuc/c/thong-bao-sau-dai-hoc',
    '/tin-tuc/c/thong-tin-tuyen-dung',
    '/tin-tuc/c/thong-tin-lien-thong-dh-ths',
    '/tin-tuc/c/goc-se-chia',
)
links = list(CATEGORY_PATHS)

In [53]:
# Sanity check: request the home page and display the final URL that was
# fetched (requests follows redirects by default).
# A timeout is set because requests otherwise waits indefinitely on a stalled
# connection, which would hang the notebook.
# NOTE: the name `re` shadows the stdlib regex module name; it is kept
# unchanged here because it is module-level notebook state.
re = requests.get(baseURL, headers=headers, timeout=10)
re.url

'https://www.fit.hcmus.edu.vn/'

# Create Crawler class

In [80]:
class Crawler:
    """Asynchronous crawler for a fixed list of pages under one base URL.

    Attributes:
        baseurl: Prefix prepended to every relative link.
        headers: HTTP headers sent with every request.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken before each request.
        links: Absolute URLs built from the seed paths.
        crawl_logic: Optional default post-processing callable used by
            ``_crawl`` when none is passed explicitly.
    """

    def __init__(self, baseURL, headers, seeds: list, delay_range=(2, 7)) -> None:
        """Store the configuration and expand ``seeds`` into absolute URLs.

        Args:
            baseURL: Site root, e.g. ``'https://example.org'``.
            headers: Mapping of HTTP headers for every request.
            seeds: Site-relative paths to crawl.
            delay_range: ``(low, high)`` seconds for the random pre-request
                delay. Defaults to the original hard-coded ``(2, 7)``.
        """
        self.baseurl = baseURL
        self.headers = headers
        self.delay_range = delay_range
        self.links = self.make_urls(seeds)  # Construct complete links
        self.crawl_logic = None

    async def fetch(self, session, url, params=None):
        """Fetch one URL and return its body wrapped in a JSON string.

        Args:
            session: Object whose ``get(url, headers=..., params=...)`` returns
                an async context manager yielding a response with an async
                ``text()`` method (an ``aiohttp.ClientSession`` in ``_crawl``).
            url: Absolute URL to request.
            params: Optional query parameters.

        Returns:
            A JSON string encoding ``{'url': url, 'text': <response body>}``.
        """
        # Random pause so the requests are not all sent at the same instant,
        # to reduce the risk of being banned for too many requests.
        low, high = self.delay_range
        await asyncio.sleep(random.uniform(low, high))
        async with session.get(url, headers=self.headers, params=params) as response:
            print("Crawled:", url)
            text = await response.text()
            return json.dumps({
                'url': url,
                'text': text
            })

    async def _crawl(self, depth=1, crawl_logic=None):
        """Fetch every link for pages ``1..depth`` and post-process the results.

        Each link is requested once per page number, with the page passed as
        the ``p`` query parameter.

        Args:
            depth: Number of pages to request per link.
            crawl_logic: Optional callable applied to each raw ``fetch``
                result. When omitted, ``self.crawl_logic`` is used if set.

        Returns:
            The list of raw ``fetch`` results, or the list of
            ``crawl_logic(result)`` values when a callable is available.
        """
        if crawl_logic is None:
            # Honour a default configured on the instance; previously this
            # attribute was stored but never consulted.
            crawl_logic = self.crawl_logic

        async with aiohttp.ClientSession() as session:
            tasks = [
                self.fetch(session, url, params={'p': page})
                for page in range(1, depth + 1)
                for url in self.links
            ]
            responses = await asyncio.gather(*tasks)

        if crawl_logic is None:
            return responses
        # The caller supplied an extra extraction/filtering function.
        return [crawl_logic(response) for response in responses]

    def make_urls(self, links):
        """Return the absolute URL for each site-relative path in ``links``."""
        return [self.make_url(link) for link in links]

    def make_url(self, link):
        """Return the absolute URL for one site-relative path."""
        return self.baseurl + link


In [81]:
def extract_web_links(response):
    """Collect the unique article links found on one fetched page.

    Args:
        response: JSON string with a ``'text'`` key holding the page HTML
            (the format returned by ``Crawler.fetch``).

    Returns:
        The distinct ``href`` values starting with ``'/tin-tuc/d'``, in the
        order they first appear in the document.
    """
    payload = json.loads(response)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(payload['text'], 'html.parser')

    # Only anchors that actually carry an href attribute.
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    # Keep links whose href starts with '/tin-tuc/d'. dict.fromkeys removes
    # duplicates while keeping first-seen order, so repeated runs give the
    # same ordering (a set does not guarantee that).
    return list(dict.fromkeys(href for href in hrefs if href.startswith('/tin-tuc/d')))

def extract_news_content(response):
    """Extract the title, body text and attachment links from one article page.

    Args:
        response: JSON string with ``'url'`` and ``'text'`` keys (the format
            returned by ``Crawler.fetch``).

    Returns:
        A dict with keys ``'title'`` (text of the first ``<h1>``),
        ``'article'`` (text of the first ``<article>``), ``'file_links'``
        (``href`` of every ``<a>`` that has a ``download`` attribute) and
        ``'url'``. ``'title'`` / ``'article'`` are ``None`` when the page has
        no such element, so one malformed or error page no longer raises and
        discards the whole crawl.
    """
    payload = json.loads(response)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(payload['text'], 'html.parser')

    article_tag = soup.find('article')
    title_tag = soup.find('h1')
    # Require href as well, so a download anchor without one is skipped
    # instead of raising KeyError.
    download_anchors = soup.find_all('a', attrs={'download': True, 'href': True})

    return {
        'title': title_tag.get_text() if title_tag is not None else None,
        'article': article_tag.get_text() if article_tag is not None else None,
        'file_links': [anchor['href'] for anchor in download_anchors],
        'url': payload['url']
    }

# Initialize crawler

In [82]:
# Load the article paths from the JSON file in the working directory.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding. `json` is imported in the first cell.
with open('./news_links.json', 'r', encoding='utf-8') as rstream:
    news_links = json.load(rstream)
news_links[:5]

['/tin-tuc/d/cq-nghi-hoc-lop-ly-thuyet-so-22_6-ngay-29-3-2024',
 '/tin-tuc/d/chuong-trinh-hoc-bong-chinh-phu-australia-aas-vong-tuyen-chon-2023',
 '/tin-tuc/d/lich-thi-lop-btkt-nganh-khoa-hoc-may-tinh-he-thong-thong-tin-tri-tue-nhan-tao-khoa-thang-4-2024',
 '/tin-tuc/d/thi-dau-code-tour-co-hoi-nang-cap-toan-dien-ky-nang-lap-trinh-danh-cho-ban',
 '/tin-tuc/d/dang-ky-hoc-bong-hanh-trang-huong-nghiep-hoc-ky-2-nam-hoc-2022-2023']

In [83]:
# Crawl the individual article pages listed in news_links.
crawler = Crawler(baseURL=baseURL, headers=headers, seeds=news_links)
# Alternative: seed the crawler with the `links` list instead.
# crawler = Crawler(baseURL, headers, links)

In [84]:
import nest_asyncio

# Patch asyncio so run_until_complete() may be called on a loop that is
# already running.
nest_asyncio.apply()
loop = asyncio.get_event_loop()

# Alternative run: collect article links over 3 pages per seed.
# results = loop.run_until_complete(crawler._crawl(3, extract_web_links))

# Fetch every seed once and parse each page with extract_news_content.
crawl_job = crawler._crawl(crawl_logic=extract_news_content)
results = loop.run_until_complete(crawl_job)

Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/dang-ky-hoc-bong-hanh-trang-huong-nghiep-hoc-ky-1-nam-hoc-2022-2023
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/danh-sach-du-kien-cap-xet-hoc-bong-dot-xuat-hk1-nam-hoc-2023-2024
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/thong-bao-v-v-chuan-bi-ho-so-dang-ky-thuc-hien-de-tai-tot-nghiep-16208
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/summer-school-2024-alphageometry-solving-imo-geometry-without-human-demonstrations
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/thong-bao-ve-viec-dang-ky-de-tai-luan-van-thac-si-khoa-32-2022-dot-2
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/q2q3-golden-owl-solutions-tuyen-dung-nhieu-vi-tri-it-non-it
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/thong-bao-mo-lop-bo-tuc-kien-thuc-nganh-khoa-hoc-may-tinh-he-thong-thong-tin-tri-tue-nhan-tao-khoa-thang-9-2024
Crawled: https://www.fit.hcmus.edu.vn/tin-tuc/d/thong-bao-dang-ky-phuong-thuc-dao-tao-danh-cho-hoc-vien-cao-hoc-khoa-33-2023
Crawled: h

  for attr in list(attrs.keys()):


# Data profiling and null check

In [86]:
import pandas as pd

# Build one row per crawl result; columns come from the keys of each result dict.
df = pd.DataFrame(data=results)
df.head(n=5)

Unnamed: 0,title,article,file_links,url
0,[CQ] Nghỉ học lớp Lý thuyết số 22_6 ngày 29/3/...,"\nTHÔNG BÁOLớp Lý thuyết số 22_6, lịch học T6 ...",[],https://www.fit.hcmus.edu.vn/tin-tuc/d/cq-nghi...
1,Chương trình Học bổng Chính phủ Australia (AAS...,\nPhòng Quan hệ Đối ngoại xin thông tin về Chư...,[https://www.fit.hcmus.edu.vn/vn/UserFiles\711...,https://www.fit.hcmus.edu.vn/tin-tuc/d/chuong-...
2,"Lịch thi lớp BTKT ngành Khoa học máy tính, Hệ ...",\nTHÔNG BÁOLịch thi lớp BTKT ngành Khoa học má...,[],https://www.fit.hcmus.edu.vn/tin-tuc/d/lich-th...
3,THI ĐẤU CODE TOUR - CƠ HỘI “NÂNG CẤP” TOÀN DIỆ...,\nĐăng ký ngay: https://bit.ly/4bWtGGL Đếm ngư...,[https://www.fit.hcmus.edu.vn/vn/UserFiles\772...,https://www.fit.hcmus.edu.vn/tin-tuc/d/thi-dau...
4,"Đăng ký học bổng ""Hành trang hướng nghiệp"" học...",\nNhằm khen thưởng cho những sinh viên có thàn...,[],https://www.fit.hcmus.edu.vn/tin-tuc/d/dang-ky...


In [87]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       160 non-null    object
 1   article     160 non-null    object
 2   file_links  160 non-null    object
 3   url         160 non-null    object
dtypes: object(4)
memory usage: 5.1+ KB


In [88]:
# Write the crawled articles to a CSV file, leaving out the row index.
output_path = './Sample_FIT_News.csv'
df.to_csv(output_path, index=False)