# PREPROCESSING

In [2]:
import pandas as pd  
import numpy as np 
import re

## 1. USERS DATA

In [5]:
# df_user1 and df_user_2
df_user1 = pd.read_csv('dataset/data_users_1.csv')
df_user2 = pd.read_csv('dataset/data_users_2.csv')

# df_user1
# drop 1st column
df_user1.drop(df_user1.columns[0], axis=1, inplace=True)
# profile_link
profile_links = df_user1['profile_link'].astype(str)
for i, link in enumerate(profile_links):
    profile_links[i] = 'https://voz.vn' + link
df_user1['profile_link'] = profile_links

# df_user2
# # drop 1st column
df_user2.drop(df_user2.columns[0], axis=1, inplace=True)

# num_message
num_message = df_user2['num_message'].astype(str)
for i, item in enumerate(num_message):
    digits = re.findall(r'\d+', item) 
    num_message[i] = ''.join(digits) if digits else None
df_user2['num_message'] = pd.to_numeric(num_message)

# reaction_score
reaction_score = df_user2['reaction_score'].astype(str)
for i, item in enumerate(reaction_score):
    digits = re.findall(r'\d+', item) 
    reaction_score[i] = ''.join(digits) if digits else None
df_user2['reaction_score'] = pd.to_numeric(reaction_score)

# joined_time
df_user2['joined_time'] = pd.to_datetime(df_user2['joined_time'], utc=True, unit='s')

In [6]:
# merge 
pd.set_option('display.max_rows', 500)
df_user = df_user1.set_index(['id', 'name', 'level']).join(df_user2.set_index(['id', 'name', 'level'])).reset_index()


In [10]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9329 entries, 0 to 9328
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              9329 non-null   int64              
 1   name            9328 non-null   object             
 2   level           9329 non-null   object             
 3   profile_link    9329 non-null   object             
 4   joined_time     6782 non-null   datetime64[ns, UTC]
 5   num_message     6782 non-null   float64            
 6   reaction_score  6782 non-null   float64            
 7   point           6782 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(3), int64(1), object(3)
memory usage: 583.2+ KB


In [None]:
df_user.head()

Unnamed: 0,id,name,level,profile_link,joined_time,num_message,reaction_score,point
0,1736979,Quantum,Senior Member,https://voz.vn/u/quantum.1736979/,,,,
1,1428951,HuyRongDen,Senior Member,https://voz.vn/u/huyrongden.1428951/,,,,
2,1722728,thuongbui060,Senior Member,https://voz.vn/u/thuongbui060.1722728/,,,,
3,1698406,Fujifilm XT3,Senior Member,https://voz.vn/u/fujifilm-xt3.1698406/,,,,
4,1476978,Mr_X_f33,Senior Member,https://voz.vn/u/mr_x_f33.1476978/,2015-01-24 06:48:24+00:00,2165.0,2711.0,113.0


impute missing values  
- only 1 row doesn't have a name -> drop  
- almost admin private their account -> joined_time: oldest; num_message, reaction_score, point: higher (MICE)  

In [None]:
# Function to fill NaN with a random value from top_values
def fill_na_with_random(value, top_values):
    if pd.isna(value):
        return top_values.sample(1).values[0]  # Randomly select one value from top_values
    return value

# get the top 10 most frequent values in the 'joined_time' column
top_values = df_user['joined_time'].sort_values().head(50)
# fill missing values with one of the top 50 values
df_user['joined_time'] = df_user['joined_time'].apply(lambda x: fill_na_with_random(x, top_values))

# num_message, reaction_score, point
for name in ['num_message', 'reaction_score', 'point']:
    # get the top 10 most frequent values in the 'name' column
    top_values = df_user.loc[df_user[name].notnull()][name].sort_values(ascending=False).head(100)
    # fill missing values with one of the top 50 values
    df_user[name] = df_user[name].apply(lambda x: fill_na_with_random(x, top_values))

df_user.dropna(axis=0, inplace=True)


In [80]:
df_user.head()

Unnamed: 0,id,name,level,profile_link,joined_time,num_message,reaction_score,point
0,1736979,Quantum,Senior Member,https://voz.vn/u/quantum.1736979/,2006-09-19 01:07:04+00:00,19932.0,22105.0,113.0
1,1428951,HuyRongDen,Senior Member,https://voz.vn/u/huyrongden.1428951/,2006-12-13 13:20:52+00:00,13216.0,22105.0,113.0
2,1722728,thuongbui060,Senior Member,https://voz.vn/u/thuongbui060.1722728/,2006-11-14 04:30:16+00:00,6363.0,7484.0,113.0
3,1698406,Fujifilm XT3,Senior Member,https://voz.vn/u/fujifilm-xt3.1698406/,2006-11-14 04:30:16+00:00,5880.0,5147.0,113.0
4,1476978,Mr_X_f33,Senior Member,https://voz.vn/u/mr_x_f33.1476978/,2015-01-24 06:48:24+00:00,2165.0,2711.0,113.0


In [None]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9328 entries, 0 to 9328
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              9328 non-null   int64  
 1   name            9328 non-null   object 
 2   level           9328 non-null   object 
 3   profile_link    9328 non-null   object 
 4   joined_time     9328 non-null   object 
 5   num_message     9328 non-null   float64
 6   reaction_score  9328 non-null   float64
 7   point           9328 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 655.9+ KB


In [81]:
df_user.to_csv('dataset/users.csv', index=False)

In [155]:
# joined_time
df_user['joined_time'] = df_user['joined_time'].apply(lambda x: int(pd.to_datetime(x).timestamp()))

# name
df_user['name'] = df_user['name'].apply(lambda x: x.lower())

# level
df_user['level'] = df_user['level'].apply(lambda x: x.lower())


In [158]:
df_user.head()

Unnamed: 0,id,name,level,profile_link,joined_time,num_message,reaction_score,point
0,1736979,quantum,senior member,https://voz.vn/u/quantum.1736979/,1158628024,19932.0,22105.0,113.0
1,1428951,huyrongden,senior member,https://voz.vn/u/huyrongden.1428951/,1166016052,13216.0,22105.0,113.0
2,1722728,thuongbui060,senior member,https://voz.vn/u/thuongbui060.1722728/,1163478616,6363.0,7484.0,113.0
3,1698406,fujifilm xt3,senior member,https://voz.vn/u/fujifilm-xt3.1698406/,1163478616,5880.0,5147.0,113.0
4,1476978,mr_x_f33,senior member,https://voz.vn/u/mr_x_f33.1476978/,1422082104,2165.0,2711.0,113.0


In [159]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9328 entries, 0 to 9327
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              9328 non-null   int64  
 1   name            9328 non-null   object 
 2   level           9328 non-null   object 
 3   profile_link    9328 non-null   object 
 4   joined_time     9328 non-null   int64  
 5   num_message     9328 non-null   float64
 6   reaction_score  9328 non-null   float64
 7   point           9328 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 583.1+ KB


## 2. POSTS DATA

In [141]:
df_posts = pd.read_csv('./dataset/posts.csv')
df_posts.head()

Unnamed: 0,Post_id,User_id,Title,Post_time,Replies,Views
0,https://voz.vn/t/bitcoin-vuot-moc-60-000-usd-t...,1736979,"\n\t\t\t\t\tBitcoin vượt mốc 60.000 USD, tiến ...",2024-02-28T23:36:06+0700,623,78K
1,https://voz.vn/t/canh-bao-tuyet-%C4%91oi-khong...,2020115,\n\t\t\t\t\t[Cảnh báo] Tuyệt đối không liên hệ...,2024-07-17T11:55:20+0700,0,2K
2,https://voz.vn/t/noi-quy-box-tien-%C4%91ien-tu...,268,\n\t\t\t\t\tNội quy box Tiền điện tử - Đọc kỹ ...,2023-05-26T13:43:15+0700,0,6K
3,https://voz.vn/t/luc-nay-khong-muc-bitcoin-con...,1275125,\n\t\t\t\t\tLúc này không múc Bitcoin còn đợi ...,2023-06-15T20:27:27+0700,88K,5M
4,https://voz.vn/t/tong-hop-keo-%C4%91ao-coin-tr...,1630609,\n\t\t\t\t\tTổng hợp kèo đào coin trên Telegra...,2024-03-17T16:20:34+0700,195,19K


In [144]:
# views
views = df_posts['Views'].astype(str)
for i, view in enumerate(views):
    if re.findall('K', view):
        views[i] = view[:-1] + str('000')
    elif re.findall('M', view):
        views[i] = view[:-1] + str('000000')
    elif not view.isdigit():
        digits = re.findall(r'\d+', view) 
        views[i] = ''.join(digits) if digits else None # Tìm tất cả các nhóm số
df_posts['Views'] = pd.to_numeric(views, downcast="integer")


# replies
replies = df_posts['Replies'].astype(str)
for i, reply in enumerate(replies):
    if re.findall('K', reply):
        replies[i] = reply[:-1] + str('000')
    elif re.findall('M', reply):
        replies[i] = reply[:-1] + str('000000')
    elif not reply.isdigit():
        digits = re.findall(r'\d+', reply) 
        replies[i] = ''.join(digits) if digits else None # Tìm tất cả các nhóm số
df_posts['Replies'] = pd.to_numeric(replies, downcast="integer")

# title
titles = df_posts['Title'].astype(str)
for i, title in enumerate(titles):
    titles[i] = title.strip()
df_posts['Title'] = titles

# id 
post_id = df_posts['Post_id'].astype(str)
for i, id in enumerate(post_id):
    post_id[i] = id.split('.')[-1][:-1]
df_posts['Post_id'] = pd.to_numeric(post_id, downcast="integer")

# post_time
list_time = df_posts['Post_time'].astype(str)
for i, time in enumerate(list_time):
    time = pd.to_datetime(time).timestamp()
    list_time[i] = time
list_time=list_time.astype(int)
df_posts['Post_time'] = list_time

# rename
df_posts.columns = [name.lower() for name in df_posts.columns]

In [146]:
from underthesea import word_tokenize
import re

def load_abbreviations_from_file(file_path):
    """Read abbreviations from a file and return a dictionary."""
    abbreviations = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            abbr, full = line.strip().split(': ')
            abbreviations[abbr] = full
    return abbreviations

def expand_abbreviations(text):
    """Replace abbreviations in the text with their full forms."""
    abbreviations = load_abbreviations_from_file('vietnamese_abbreviations.txt')
    for abbr, full in abbreviations.items():
        text = text.replace(f" {abbr} ", f" {full} ")  # Replace
    return text.strip()  # Remove extra whitespace

def preprocessing_text(text):
    # Step 1: Normalize the text
    text = re.sub(r'https://\S+', '', text)  # Remove HTML links
    text = re.sub(r'Click to expand\.\.\.', '', text)  # Remove the phrase 'Click to expand...' from the text.
    text = re.sub(r'via theNEXTvoz for iPhone', '', text, flags=re.IGNORECASE)  # Remove the phrase 'via theNEXTvoz for iPhone' from the text.
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = expand_abbreviations(text)  # Replace abbreviations with full words

    # Step 2: Tokenize the text
    words = word_tokenize(text, format="text").split()  # Tokenize words

    # Step 3: Remove stopwords
    stopwords = []
    with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as file:
        stopwords = set(file.read().splitlines())  # Read each line and store in a set
    words = [word for word in words if word not in stopwords]  # Remove stopwords

    return words

# title
list_title = df_posts['title'].astype(str).to_list()
for i, title in enumerate(list_title):
    title = preprocessing_text(title)
    list_title[i] = title
df_posts['title'] = list_title

# drop missing value
df_posts.dropna(axis=0)

In [150]:
df_posts.head()

Unnamed: 0,post_id,user_id,title,post_time,replies,views
0,926433,1736979,"[bitcoin, mốc, 60000, usd, tiến, kỷ_lục]",1709138166,623.0,78000.0
1,986421,2020115,"[cảnh_báo, tuyệt_đối, liên_hệ, giao_dịch, tran...",1721192120,0.0,2000.0
2,780221,268,"[nội_quy_box, tiền, điện_tử, đọc, kỹ, hoạt_độn...",1685083395,0.0,6000.0
3,792751,1275125,"[múc, bitcoin, đợi, lúc_nào]",1686835647,88000.0,5000000.0
4,934397,1630609,"[tổng_hợp, kèo, đào, coin, telegram, uy_tín, h...",1710667234,195.0,19000.0


In [160]:
df_posts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6762 entries, 0 to 6762
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   post_id    6762 non-null   int32  
 1   user_id    6762 non-null   int64  
 2   title      6762 non-null   object 
 3   post_time  6762 non-null   int64  
 4   replies    6762 non-null   float64
 5   views      6762 non-null   float64
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 343.4+ KB


## COMMENTS DATA

In [96]:
df_comments = pd.read_csv('dataset/comments.csv')

# rename
df_comments.columns = [name.lower() for name in df_comments.columns]
df_comments.head()

Unnamed: 0,comment_id,user_id,post_id,comment,comment_time
0,30528120,1736979,926433,Giá Bitcoin đã vượt mốc 60.000 USD trong ngày ...,2024-02-28T23:36:06+0700
1,30528126,1736979,926433,"Các tài sản đua nhau phá ATH, tiền thành giấy ...",2024-02-28T23:36:46+0700
2,30528130,1428951,926433,Ai hold giờ hưởng trái ngọt xả thôi .\n\nvia t...,2024-02-28T23:37:17+0700
3,30528131,1722728,926433,Vãi thật lên khiếp,2024-02-28T23:37:21+0700
4,30528142,1698406,926433,"khả năng đợt này lên 100k thật, ae voz all in ...",2024-02-28T23:38:23+0700


In [97]:
df_comments

Unnamed: 0,comment_id,user_id,post_id,comment,comment_time
0,30528120,1736979,926433,Giá Bitcoin đã vượt mốc 60.000 USD trong ngày ...,2024-02-28T23:36:06+0700
1,30528126,1736979,926433,"Các tài sản đua nhau phá ATH, tiền thành giấy ...",2024-02-28T23:36:46+0700
2,30528130,1428951,926433,Ai hold giờ hưởng trái ngọt xả thôi .\n\nvia t...,2024-02-28T23:37:17+0700
3,30528131,1722728,926433,Vãi thật lên khiếp,2024-02-28T23:37:21+0700
4,30528142,1698406,926433,"khả năng đợt này lên 100k thật, ae voz all in ...",2024-02-28T23:38:23+0700
...,...,...,...,...,...
222679,4329959,1378654,144100,"Bỏ đi, càng làm càng giảm point, bữa ráng làm ...",2020-09-29T09:43:26+0700
222680,4281295,1474252,142650,https://decrypt.co/43015/cryptocurrency-exchan...,2020-09-27T00:30:06+0700
222681,4281981,1396527,142650,Ăn xin 4.0 à\n\n\n\n\n\nvia theNEXTvoz for iPhone,2020-09-27T01:48:22+0700
222682,4287251,1474252,142650,DK.iceiceice said:\n\n\n\n\t\t\tĂn xin 4.0 à\n...,2020-09-27T11:47:40+0700


In [None]:
# comment time
list_time = df_comments['comment_time'].astype(str)
for i, time in enumerate(list_time):
    time = pd.to_datetime(time).timestamp()
    list_time[i] = time
list_time=list_time.astype(int)
df_comments['comment_time'] = list_time

In [101]:
comments = df_comments['comment'].astype(str).to_list()
for i, comment in enumerate(comments):
    comment = preprocessing_text(comment)
    comments[i] = comment
df_comments['comment'] = comments

In [None]:
df_comments

Unnamed: 0,comment_id,user_id,post_id,comment,comment_time
0,30528120,1736979,926433,"[giá, bitcoin, mốc, 60000, usd, 282, tiến, thờ...",1709138166
1,30528126,1736979,926433,"[tài_sản, đua, phá, ath, tiền, thành, giấy_lộn]",1709138206
2,30528130,1428951,926433,"[hold, hưởng, trái, xả]",1709138237
3,30528131,1722728,926433,"[vãi, khiếp]",1709138241
4,30528142,1698406,926433,"[khả_năng, đợt, 100, k, anh_em, voz, all, in, ...",1709138303
...,...,...,...,...,...
222679,4329959,1378654,144100,"[đi, point, bữa, ráng, thẻ, 50, k, đổi, card, ...",1601347406
222680,4281295,1474252,142650,"[hold, long, term, coin, sàn, coin, đợt, havin...",1601141406
222681,4281981,1396527,142650,"[ăn_xin, 40]",1601146102
222682,4287251,1474252,142650,"[dkiceiceice, said, ăn_xin, 40, buồn_phiền]",1601182060


In [161]:
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222684 entries, 0 to 222683
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   comment_id    222684 non-null  int64 
 1   user_id       222684 non-null  int64 
 2   post_id       222684 non-null  int64 
 3   comment       222684 non-null  object
 4   comment_time  222684 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 8.5+ MB


## Save DataFrame to CSV

In [163]:
df_user.to_csv("dataset/users.csv", index=False, encoding='utf-8')
df_posts.to_csv("dataset/posts.csv", index=False, encoding='utf-8')
df_comments.to_csv("dataset/comments.csv", index=False, encoding='utf-8')