**Objective:**
Crawl user, post, and comment data from 'https://voz.vn/f/tien-%C4%91ien-tu.94/'.

## CRAWLING USERS DATA

In [None]:
# import modules
import os
import re
import time
import urllib.request
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 500)

In [None]:
# print loading bar
def print_loading_bar(iteration, total, bar_length=40):
    """Redraw a one-line text progress bar on stdout.

    Args:
        iteration: Number of items finished so far.
        total: Number of items overall. A value of 0 or less is shown as a
            completed bar instead of raising ZeroDivisionError.
        bar_length: Width of the bar in characters.
    """
    if total <= 0:
        percent = 1.0
    else:
        # Clamp so an out-of-range counter cannot overflow the bar or go negative.
        percent = min(max(iteration / total, 0.0), 1.0)
    filled_length = int(bar_length * percent)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    # '\r' with end='' rewrites the same console line on every call.
    print(f'\r|{bar}| {percent:.2%}', end='')

# download html
def download_html(url, timeout=30):
    """Fetch `url` and return the response body decoded as UTF-8 text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        The decoded body as a string.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError is a subclass).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    req = urllib.request.Request(url, headers=headers)
    # The with-block closes the response; no explicit close() is needed.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')

# get topic links
def get_topic_links(url, max_page=200):
    """Collect the topic hrefs listed on forum index pages 1..max_page.

    Args:
        url: Forum index URL; 'page-<n>' is appended directly, so it should
            end with '/'.
        max_page: Last index page to crawl (inclusive).

    Returns:
        A list of href strings exactly as they appear in the page.
    """
    topic_links = []
    for page_number in range(1, max_page + 1):
        page_link = url + 'page-' + str(page_number)
        soup = BeautifulSoup(download_html(page_link), 'html.parser')
        for title_block in soup.find_all('div', {'class': 'structItem-title'}):
            anchors = title_block.find_all('a')
            if not anchors:
                continue  # nothing to link to in this title block
            # Take the last anchor, as get_details() does for the same selector;
            # a fixed index of 1 raised IndexError on titles with a single anchor.
            topic_links.append(anchors[-1]['href'])
    return topic_links

def get_user_details(soup):
    """Extract per-message author fields from a parsed topic page.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        ``[ids, names, levels, profile_links]``. ``ids``, ``names`` and
        ``profile_links`` hold one entry per ``h4.message-name`` element, with
        ``None`` where that header has no matching ``<a>`` or attribute.
        ``levels`` holds the text of every ``h5.userTitle.message-userTitle``.
    """
    ids, names, profile_links = [], [], []
    for header in soup.find_all('h4', {'class': "message-name"}):
        anchor = header.find('a')
        named_anchor = header.find('a', {"class": "username"})
        # Record None for just this message instead of discarding the whole
        # column, so the three lists stay the same length and callers can
        # keep concatenating them.
        ids.append(anchor.get('data-user-id') if anchor is not None else None)
        names.append(named_anchor.text if named_anchor is not None else None)
        profile_links.append(anchor.get('href') if anchor is not None else None)

    # NOTE(review): levels are matched page-wide, not per header; this assumes
    # every message carries exactly one user-title element - confirm.
    levels = [title.text for title in soup.find_all('h5', {'class': "userTitle message-userTitle"})]

    return [ids, names, levels, profile_links]

# get user_details of users commenting on the topic
def get_user(topic_link, max_pages=1000):
    """Gather author fields from every page of one topic.

    Starts at `topic_link` and follows the "next page" link until none is
    found or `max_pages` pages have been read.

    Args:
        topic_link: Absolute URL of the topic's first page.
        max_pages: Upper bound on the number of pages to read.

    Returns:
        ``[ids, names, levels, profile_links]`` accumulated over all pages
        (same layout as ``get_user_details``).
    """
    soup = BeautifulSoup(download_html(url=topic_link), 'html.parser')
    # Copy into fresh lists; a column reported as None is treated as empty.
    users = [list(column or []) for column in get_user_details(soup)]

    n_page = 1
    while n_page < max_pages:
        next_page = soup.find('a', {"class": "pageNav-jump pageNav-jump--next"})
        if next_page is None:
            break  # last page reached

        n_page += 1
        # urljoin avoids the 'https://voz.vn//t/...' double slash that plain
        # concatenation produced for hrefs starting with '/'.
        next_url = urljoin('https://voz.vn/', next_page['href'])
        soup = BeautifulSoup(download_html(url=next_url), 'html.parser')

        for column, extra in zip(users, get_user_details(soup)):
            column.extend(extra or [])

    return users

def _extract_or_none(getter):
    """Return ``getter()``, or None when the element or attribute it looks up is missing."""
    try:
        return getter()
    except (AttributeError, TypeError, KeyError, IndexError):
        return None


def get_user_details2(soup):
    """Extract user fields from a parsed profile page.

    Each field is looked up independently; a field whose element or attribute
    is missing comes back as None. Only lookup failures are suppressed, so
    interrupts and unrelated errors propagate.

    Args:
        soup: Parsed profile page exposing ``find`` / ``find_all``.

    Returns:
        ``[id, name, level, joined_time, num_message, react_score, point]``.
    """
    stat_block = {'class': "pairs pairs--rows pairs--rows--centered fauxBlockLink"}
    stat_link = {'class': 'fauxBlockLink-linkRow u-concealed'}

    def username_span():
        return soup.find('h1', {"class": "memberHeader-name"}).find('span', {"class": "username"})

    user_id = _extract_or_none(lambda: username_span()['data-user-id'])
    name = _extract_or_none(lambda: username_span().text)
    level = _extract_or_none(lambda: soup.find('span', {"class": "userTitle"}).text)
    joined_time = _extract_or_none(
        lambda: soup.find('dl', {"class": "pairs pairs--inline"}).find('time')['data-timestamp']
    )
    # The points figure is read from the last stat block, the message count from the first.
    point = _extract_or_none(
        lambda: soup.find_all('dl', stat_block)[-1].find('a', stat_link).text.strip()
    )
    react_score = _extract_or_none(
        lambda: soup.find('dl', {'class': "pairs pairs--rows pairs--rows--centered"}).find('dd').text.strip()
    )
    num_message = _extract_or_none(
        lambda: soup.find('dl', stat_block).find('a', stat_link).text.strip()
    )

    return [user_id, name, level, joined_time, num_message, react_score, point]
        

Get topic links from 'https://voz.vn/f/tien-%C4%91ien-tu.94/'

In [7]:
# get all topic links from index pages 1 to 200 of 'https://voz.vn/f/tien-%C4%91ien-tu.94/'
url = 'https://voz.vn/f/tien-%C4%91ien-tu.94/'
topic_links = get_topic_links(url)
# hrefs are site-relative; prefix the host to make them absolute
topic_links = ['https://voz.vn'+item for item in topic_links]
df_topic_links = pd.DataFrame(topic_links, columns = ['topic_links'])
# create the output folder first so to_csv does not fail when it is missing
os.makedirs('dataset', exist_ok=True)
df_topic_links.to_csv("dataset/topic_links.csv")

|████████████████████████████████████████| 100.00%

Get user information: id, name, level, profile_link

In [None]:
# collect id, name, level and profile_link of every user who posted in the saved topics
topic_links = pd.read_csv('dataset/topic_links.csv')['topic_links'].to_list()

users = [[], [], [], []]  # one list per field: id, name, level, profile_link
for i, link in enumerate(topic_links):
    for column, values in zip(users, get_user(link)):
        # a field may come back as None when it could not be parsed
        column.extend(values or [])
    # report i + 1 finished topics so the bar reaches 100% on the last one
    print_loading_bar(i + 1, len(topic_links))

|████████████████████████████████████████| 100.00%

In [None]:
# build the user table (one row per author occurrence), drop repeated rows,
# renumber the index and write the result to data_users_1.csv
df_users_1 = pd.DataFrame(users).T
df_users_1.columns = ['id', 'name', 'level', 'profile_link']
df_users_1 = df_users_1.drop_duplicates().reset_index(drop=True)
df_users_1.to_csv('dataset/data_users_1.csv')

In [None]:
# display the user table
df_users_1

Unnamed: 0,id,name,level,profile_link
0,1736979,Quantum,Senior Member,/u/quantum.1736979/
1,1428951,HuyRongDen,Senior Member,/u/huyrongden.1428951/
2,1722728,thuongbui060,Senior Member,/u/thuongbui060.1722728/
3,1698406,Fujifilm XT3,Senior Member,/u/fujifilm-xt3.1698406/
4,1476978,Mr_X_f33,Senior Member,/u/mr_x_f33.1476978/
...,...,...,...,...
9324,1745679,chowchoww,Junior Member,/u/chowchoww.1745679/
9325,1452373,em_la_teo2,Junior Member,/u/em_la_teo2.1452373/
9326,1185670,Mike1618,Junior Member,/u/mike1618.1185670/
9327,1760267,dynamic programming,Senior Member,/u/dynamic-programming.1760267/


Get user information: id, name, level, joined_time, num_message, reaction_score, point

In [None]:
# get user_details by going directly to profile_link (only applies to profiles with full profile view permission)

base_url = 'https://voz.vn'
# rows without a profile link are read back as NaN; drop them before building URLs
user_links = (
    pd.read_csv('dataset/data_users_1.csv')['profile_link']
    .dropna()
    .to_list()
)

# make every link absolute; links that already carry a scheme are kept unchanged
user_links = [
    url if url.startswith(('http://', 'https://')) else base_url + url
    for url in user_links
]

# get user information by profile link
users2 = []

for i, link in enumerate(user_links):
    try:
        response = download_html(url=link)
    except Exception:
        # best effort: a profile that cannot be downloaded is skipped
        response = None
    if response:
        soup = BeautifulSoup(response, 'html.parser')
        user_info = get_user_details2(soup)
        # keep the row only if at least one field was found
        if user_info != [None]*7:
            users2.append(user_info)
    # report i + 1 finished profiles so the bar reaches 100% on the last one
    print_loading_bar(i + 1, len(user_links))


|████████████████████████████████████████| 100.00%

In [None]:
# remove duplicates and save to data_users_2.csv
# passing the column names to the constructor also works when users2 is empty
df_users_2 = pd.DataFrame(
    users2,
    columns=['id', 'name', 'level', 'joined_time', 'num_message', 'reaction_score', 'point'],
)
df_users_2 = df_users_2.drop_duplicates().reset_index(drop=True)
# written next to data_users_1.csv instead of a machine-specific absolute path
df_users_2.to_csv('dataset/data_users_2.csv')

In [None]:
# display the profile-details table
df_users_2

Unnamed: 0,id,name,level,joined_time,num_message,reaction_score,point
0,1476978,Mr_X_f33,Senior Member,1422082104,2165,2711,113
1,1483896,CuChanh33,Senior Member,1424744870,1062,1000,113
2,321528,haitrang307,Đã tốn tiền,1260005242,1390,424,83
3,1536345,beerdz92,Senior Member,1460824659,4198,9470,113
4,1425082,Tu Anh the Pig,Senior Member,1403934525,1963,1443,113
...,...,...,...,...,...,...,...
7776,1714706,22phuphu,Junior Member,1612018326,16,3,3
7777,1745679,chowchoww,Junior Member,1624614558,65,13,8
7778,1185670,Mike1618,Junior Member,1357891021,14,0,1
7779,1452373,em_la_teo2,Junior Member,1416919773,16,1,3


## CRAWLING POST & COMMENT DATA

In [None]:
def download_html(url, timeout=30):
    """Fetch `url` and return the response body decoded as UTF-8 text.

    NOTE(review): this repeats the `download_html` defined in the users
    section of this notebook; consider keeping a single copy.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        The decoded body as a string.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError is a subclass).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    req = urllib.request.Request(url, headers=headers)
    # The with-block closes the response; no explicit close() is needed.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')

In [None]:
def get_comments(url, post, comments):
    """Append one row per message found on a topic page to `comments`.

    Each appended row is ``[comment_href, user_id, post, text, datetime]``.

    Args:
        url: Topic page to download and parse.
        post: Value stored in every row's post column (callers pass the
            topic's first-page URL).
        comments: List extended in place; nothing is returned.
    """
    soup = BeautifulSoup(download_html(url), 'lxml')

    user_ids = []
    for header in soup.find_all('h4', {'class': 'message-name'}):
        anchor = header.find('a')
        # None keeps this list aligned when a header has no <a>
        user_ids.append(anchor.get('data-user-id') if anchor is not None else None)

    comment_ids, comment_times = [], []
    for item in soup.find_all('li', {'class': 'u-concealed'}):
        time_tag = item.find('time')
        anchor = item.find('a')
        comment_times.append(time_tag.get('datetime') if time_tag is not None else None)
        comment_ids.append(anchor.get('href') if anchor is not None else None)

    bodies = soup.find_all('div', {'class': 'bbWrapper'})

    # NOTE(review): the selectors are matched page-wide and paired by position;
    # confirm they stay aligned (e.g. if other 'bbWrapper' blocks can appear).
    # zip stops at the shortest list rather than raising IndexError mid-crawl.
    for comment_id, user_id, body, comment_time in zip(comment_ids, user_ids, bodies, comment_times):
        comments.append([comment_id, user_id, post, body.text, comment_time])

In [None]:
def get_details(url, posts, comments):
    """Crawl every topic listed on one forum index page and collect its comments.

    For each topic link on the page, the comments of page 1 are collected,
    then the topic is re-read to find its last page number and pages
    2..last are collected as well.

    Args:
        url: Forum index page to read topic links from.
        posts: Accepted for interface compatibility.
            NOTE(review): nothing is appended to it here, so a frame built
            from `posts` stays empty - confirm whether post-level fields
            should be collected.
        comments: List extended in place through ``get_comments``.
    """
    soup = BeautifulSoup(download_html(url), 'lxml')
    post_links = [
        'https://voz.vn' + title.find_all('a')[-1]['href']
        for title in soup.find_all('div', {'class': 'structItem-title'})
    ]

    for post_link in post_links:
        get_comments(post_link, post_link, comments)
        try:
            topic_soup = BeautifulSoup(download_html(post_link), 'lxml')
            last_page = int(topic_soup.find_all('li', {'class': 'pageNav-page'})[-1].text)
        except Exception:
            # Best effort: no page navigation (or an unreadable page) means a
            # single-page topic. Interrupts are no longer swallowed.
            last_page = 0
        for page_number in range(2, last_page + 1):
            get_comments(post_link + 'page-' + str(page_number), post_link, comments)

In [None]:
# crawl forum index pages 0-339 (page 7 excluded) and gather the comments of every topic
# NOTE(review): the reason page 7 is left out is not recorded in this notebook
url = 'https://voz.vn/f/tien-%C4%91ien-tu.94/'
posts, comments = [], []
for page_no in (p for p in range(340) if p != 7):
    link = url + 'page-' + str(page_no)
    get_details(link, posts, comments)

In [1]:
# tabulate the collected post rows and display the table
posts_data = pd.DataFrame(
    data=posts,
    columns=['Post_id', 'User_id', 'Title', 'Post_time', 'Views', 'Replies'],
)
posts_data

Unnamed: 0,Post_id,User_id,Title,Post_time,Replies,Views
0,926433,1736979,"Bitcoin vượt mốc 60.000 USD, tiến gần kỷ lục",2024-02-28T23:36:06+0700,623,78000
1,986421,2020115,[Cảnh báo] Tuyệt đối không liên hệ - giao dịch...,2024-07-17T11:55:20+0700,0,2000
2,780221,268,Nội quy box Tiền điện tử - Đọc kỹ trước khi ho...,2023-05-26T13:43:15+0700,0,6000
3,792751,1275125,Lúc này không múc Bitcoin còn đợi lúc nào???,2023-06-15T20:27:27+0700,88000,5000000
4,934397,1630609,Tổng hợp kèo đào coin trên Telegram uy tín nhấ...,2024-03-17T16:20:34+0700,195,19000
...,...,...,...,...,...,...
6757,329827,1742855,[KÈO SIÊU NGON] Đào coin MNC kiếm 10$/ngày bán...,2021-06-22T12:13:26+0700,7,1000
6758,330257,835165,"MoonEdge - Kèo vừa Airdrop và IDO, còn 2 ngày ...",2021-06-22T21:54:03+0700,5,286
6759,330541,835165,BDVXP - Kèo mạng XTZ nhé! Đã check UY TÍN!,2021-06-23T11:14:24+0700,6,227
6760,328741,1744614,BOUNTY CONTEST SIÊU KHỦNG (CƠ HỘI NHẬN TỚI 10 ...,2021-06-20T20:05:09+0700,20,896


In [None]:
# tabulate the collected comment rows and display the table
comments_data = pd.DataFrame(
    data=comments,
    columns=['comment_id', 'User_id', 'Post_id', 'Comment', 'Comment_time'],
)
comments_data

Unnamed: 0,comment_id,User_id,Post_id,Comment,Comment_time
0,/t/bitcoin-vuot-moc-60-000-usd-tien-gan-ky-luc...,1736979,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,\n\n\nGiá Bitcoin đã vượt mốc 60.000 USD trong...,2024-02-28T23:36:06+0700
1,/t/bitcoin-vuot-moc-60-000-usd-tien-gan-ky-luc...,1736979,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,"Các tài sản đua nhau phá ATH, tiền thành giấy ...",2024-02-28T23:36:46+0700
2,/t/bitcoin-vuot-moc-60-000-usd-tien-gan-ky-luc...,1428951,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,Ai hold giờ hưởng trái ngọt xả thôi .\n\nvia t...,2024-02-28T23:37:17+0700
3,/t/bitcoin-vuot-moc-60-000-usd-tien-gan-ky-luc...,1722728,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,Vãi thật lên khiếp,2024-02-28T23:37:21+0700
4,/t/bitcoin-vuot-moc-60-000-usd-tien-gan-ky-luc...,1698406,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,"khả năng đợt này lên 100k thật, ae voz all in ...",2024-02-28T23:38:23+0700
...,...,...,...,...,...
37943,/t/keo-khong-rui-ro-an-chac-cap-von-500k-lai-%...,1178425,https://voz.vn/t/keo-khong-rui-ro-an-chac-cap-...,ngon nha ae,2024-11-25T09:24:02+0700
37944,/t/keo-khong-rui-ro-an-chac-cap-von-500k-lai-%...,1178425,https://voz.vn/t/keo-khong-rui-ro-an-chac-cap-...,kèo ngon làm sớm nha ae,2024-11-26T09:41:41+0700
37945,/t/keo-khong-rui-ro-an-chac-cap-von-500k-lai-%...,1178427,https://voz.vn/t/keo-khong-rui-ro-an-chac-cap-...,Kèo ngon làm sớm nha ae ! Cứ đăng kí xong nhắn...,2024-11-26T14:13:07+0700
37946,/t/keo-khong-rui-ro-an-chac-cap-von-500k-lai-%...,1178425,https://voz.vn/t/keo-khong-rui-ro-an-chac-cap-...,Kèo ngon làm sớm ae,2024-11-27T10:10:02+0700


In [None]:
# write both tables as UTF-8 CSV files without the index column
posts_data.to_csv(path_or_buf='Posts.csv', index=False, encoding='utf-8')
comments_data.to_csv(path_or_buf='Comments.csv', index=False, encoding='utf-8')