# Import libraries

In [2]:
import requests 
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import json
import random 

# Website information (request headers, base URL, and category paths)

In [3]:
# Headers sent with every request; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    ),
}

In [4]:
# Root of the site; every relative path below is appended to it.
baseURL = 'https://www.fit.hcmus.edu.vn'

# Relative paths of the news category listing pages used as crawl seeds.
links = [
    '/tin-tuc/c/' + slug
    for slug in (
        'thong-bao-chung',
        'thong-tin-hoc-bong',
        'thong-bao-he-chinh-quy',
        'thong-bao-he-hoan-chinh',
        'thong-bao-sau-dai-hoc',
        'thong-tin-tuyen-dung',
        'thong-tin-lien-thong-dh-ths',
        'goc-se-chia',
    )
]

In [None]:
# Request the site root once and show the URL the response ended up at.
# A timeout is passed because `requests` otherwise waits indefinitely on an
# unresponsive server, which would hang the cell.
# NOTE(review): the variable name `re` is the same as the stdlib regex module's
# name; it is kept unchanged in case other cells refer to it.
re = requests.get(baseURL, headers=headers, timeout=30)
re.url

# Create Crawler class

In [6]:
class Crawler:
    """Asynchronous fetcher for a fixed list of URLs under one base URL.

    Attributes:
        baseurl: Prefix prepended to every relative path.
        headers: HTTP headers sent with each request.
        links: Absolute URLs built from ``baseURL`` and the ``seeds`` paths.
        crawl_logic: Always ``None``; not read by any method of this class and
            kept only so existing attribute access keeps working.
    """

    def __init__(self, baseURL, headers, seeds: list) -> None:
        """Store the request settings and expand ``seeds`` into absolute URLs.

        Args:
            baseURL: Site root that each seed path is appended to.
            headers: Mapping of HTTP headers used for every request.
            seeds: Relative paths (each is concatenated directly to ``baseURL``).
        """
        self.baseurl = baseURL
        self.headers = headers
        self.links = self.make_urls(seeds)  # construct complete links
        self.crawl_logic = None

    async def fetch(self, session, url, params=None):
        """Download ``url`` after a random pause and return it as a JSON string.

        Args:
            session: Object whose ``get(url, headers=..., params=...)`` returns
                an async context manager yielding a response with an awaitable
                ``text()`` (``_crawl`` passes an ``aiohttp.ClientSession``).
            url: Absolute URL to request.
            params: Optional query-string parameters.

        Returns:
            A JSON-encoded string with the keys ``'url'`` and ``'text'``.
        """
        # Random 2-7 s pause before the request so calls are not all issued at
        # the same instant (intended to reduce the chance of being banned).
        await asyncio.sleep(random.uniform(2, 7))
        async with session.get(url, headers=self.headers, params=params) as response:
            print("Crawled:", url)
            text = await response.text()
        return json.dumps({'url': url, 'text': text})

    async def _crawl(self, depth=1, crawl_logic=None, max_concurrency=None):
        """Fetch pages ``p=1..depth`` of every link and optionally post-process them.

        Args:
            depth: Number of pages requested per link; page ``n`` is requested
                with the query parameter ``p=n``.
            crawl_logic: Optional callable applied to each JSON string returned
                by ``fetch``; when given, its results are returned instead.
            max_concurrency: Optional upper bound on how many ``fetch`` calls
                may be in progress at once. ``None`` (the default) applies no
                bound, so every request is scheduled immediately and only the
                random delay in ``fetch`` spreads them out.

        Returns:
            A list ordered by page, then by link: the raw JSON strings, or the
            values returned by ``crawl_logic``.

        Raises:
            ValueError: If ``max_concurrency`` is given but smaller than 1.
        """
        if max_concurrency is not None and max_concurrency < 1:
            raise ValueError("max_concurrency must be a positive integer or None")
        limiter = None if max_concurrency is None else asyncio.Semaphore(max_concurrency)

        async def fetch_page(session, url, page):
            # The delay inside fetch() is taken while holding the slot, so a
            # bounded crawl is paced by both the delay and the limit.
            if limiter is None:
                return await self.fetch(session, url, params={'p': page})
            async with limiter:
                return await self.fetch(session, url, params={'p': page})

        async with aiohttp.ClientSession() as session:
            tasks = [
                fetch_page(session, url, page)
                for page in range(1, depth + 1)
                for url in self.links
            ]
            responses = await asyncio.gather(*tasks)

        if crawl_logic is None:
            return responses
        # The caller supplied an extra extraction/filtering function.
        return [crawl_logic(response) for response in responses]

    def make_urls(self, links):
        """Return the absolute URL for each relative path in ``links``."""
        return [self.make_url(link) for link in links]

    def make_url(self, link):
        """Return ``link`` prefixed with the crawler's base URL."""
        return self.baseurl + link


In [7]:
def extract_web_links(response):
    """Return the unique article paths linked from one crawled page.

    Args:
        response: JSON string with a ``'text'`` key holding the page HTML
            (the format returned by ``Crawler.fetch``).

    Returns:
        List of distinct ``href`` values that start with ``'/tin-tuc/d'``, in
        the order they first appear in the document.
    """
    page = json.loads(response)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results can differ between machines.
    soup = BeautifulSoup(page['text'], 'html.parser')

    # Only <a> tags that actually carry an href attribute.
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    # Keep article-detail links only; dict.fromkeys de-duplicates while
    # preserving first-seen order, so repeated runs yield the same list.
    return list(dict.fromkeys(href for href in hrefs if href.startswith('/tin-tuc/d')))

def _text_or_none(tag):
    """Return ``tag.get_text()``, or ``None`` when the element was not found."""
    return None if tag is None else tag.get_text()


def extract_news_content(response):
    """Extract title, body text, download links and date from a crawled page.

    Args:
        response: JSON string with the keys ``'url'`` and ``'text'`` (the
            format returned by ``Crawler.fetch``).

    Returns:
        Dict with the keys ``'title'``, ``'article'``, ``'file_links'``,
        ``'created_at'`` and ``'url'``. A text field is ``None`` when the page
        has no matching element, so one malformed page yields a row with nulls
        instead of raising and discarding the results of the whole crawl.
    """
    page = json.loads(response)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results can differ between machines.
    soup = BeautifulSoup(page['text'], 'html.parser')

    # Anchors flagged for download; requiring href avoids a KeyError on
    # anchors that have the download attribute but no target.
    download_anchors = soup.find_all('a', attrs={'download': True, 'href': True})

    return {
        'title': _text_or_none(soup.find('h1')),
        'article': _text_or_none(soup.find('article')),
        'file_links': [anchor['href'] for anchor in download_anchors],
        'created_at': _text_or_none(soup.find('li', attrs={'class': 'post-date'})),
        'url': page['url'],
    }

# Initialize crawler

In [9]:
# Disabled: load a previously saved list of news links from ./news_links.json.
# import json
# with open('./news_links.json', 'r') as rstream:
#     news_links = json.load(rstream)
# news_links[:5]

## Extract news links

In [16]:
# Alternative seed list (disabled): Crawler(baseURL, headers, news_links)
# Seed the crawler with the category listing paths.
crawler = Crawler(baseURL=baseURL, headers=headers, seeds=links)

In [None]:
import nest_asyncio

# nest_asyncio patches asyncio so run_until_complete() can be called while an
# event loop is already running (as is the case inside a notebook kernel).
nest_asyncio.apply()
loop = asyncio.get_event_loop()

# depth=15: request listing pages p=1..15 of every category and collect the
# article links found on each page (one list per fetched page).
news_links = loop.run_until_complete(
    crawler._crawl(depth=15, crawl_logic=extract_web_links)
)


In [18]:
# Flatten the per-page link lists into one list of unique links.
# - Entries that are already plain strings are kept whole, so re-running this
#   cell does not split each link into single characters.
# - dict.fromkeys drops links that appear on several pages/categories while
#   keeping first-seen order, so no article is fetched twice later on.
news_links = list(dict.fromkeys(
    link
    for group in news_links
    for link in ([group] if isinstance(group, str) else group)
))

## Extract web content

In [19]:
# Re-create the crawler, this time seeded with the collected article paths.
crawler = Crawler(baseURL=baseURL, headers=headers, seeds=news_links)

In [None]:
import nest_asyncio

# Allow run_until_complete() to be used while an event loop is already running.
nest_asyncio.apply()
loop = asyncio.get_event_loop()

# Fetch every article once (depth=1 is the default) and parse it into a dict.
results = loop.run_until_complete(
    crawler._crawl(depth=1, crawl_logic=extract_news_content)
)

# Data profiling and null removal

In [None]:
import pandas as pd

# One row per crawled article; columns come from the keys of each result dict.
df = pd.DataFrame(data=results)
df.head(n=5)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

In [88]:
# Persist the crawled articles as CSV, without the DataFrame index column.
df.to_csv(path_or_buf='./Sample_FIT_News.csv', index=False)

***
# Playground

In [21]:
import pandas as pd

# Load a dataset from disk and print its summary.
# NOTE(review): this path differs from the './Sample_FIT_News.csv' written
# earlier in the notebook; confirm which file is intended.
df = pd.read_csv(filepath_or_buffer='./data/FIT_news.csv')
df.info()

ModuleNotFoundError: No module named 'pandas'