In [1]:
import pandas as pd
import os
from tqdm import tqdm
from src.dtprocess import cleandt

In [28]:
# from google.colab import drive
# drive.mount('/content/drive')

In [29]:
# %cd /content/drive/My Drive/Colab Notebooks/Text_Generator

In [30]:
# %ls -la

## Transform the raw data to usable text

* Load raw data from source

In [2]:
CRAWL_FOLDER = './data/vnexpress/raw_news'
vnexpress = []

for filename in os.listdir(CRAWL_FOLDER):

    with open('./data/vnexpress/raw_news/{}'.format(filename), 'r') as file:
        news = file.readlines()
        vnexpress += cleandt.convert_dict(news, 'content')

In [3]:
news = pd.DataFrame(columns=['content','url','topic', 'sub-topic', 'image', 'title','description'])

for new in vnexpress:
    news.loc[len(news)] = pd.Series(new)
news = news.reset_index().rename(columns={'index':'article_id'})

In [4]:
processed_news = news[['article_id','content','topic','sub-topic','title','description']]
print(processed_news.shape)
processed_news.head()

raw_news = processed_news.copy()

(1347, 6)


In [5]:
def lower_case(x):
  try:
    x = x.lower()
  except Exception as ex:
    pass
  return x

for col in processed_news.select_dtypes(include='object').columns:
  processed_news[col] = processed_news[col].apply(lambda x: lower_case(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news[col] = processed_news[col].apply(lambda x: lower_case(x))


* Find null values and deal with

In [6]:
processed_news.isna().sum()

article_id     0
content        0
topic          0
sub-topic      0
title          4
description    6
dtype: int64

In [7]:
processed_news.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news.fillna('', inplace=True)


* Merge columns into a single `tag` column

In [8]:
processed_news['tag'] = processed_news['content'] + processed_news['title'] + processed_news['description'] \
  + processed_news['topic'] + processed_news['sub-topic']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news['tag'] = processed_news['content'] + processed_news['title'] + processed_news['description'] \


In [9]:
processed_news = processed_news.drop(columns=['content','description','title'])

In [10]:
processed_news.head()

Unnamed: 0,article_id,topic,sub-topic,tag
0,0,doi-song,to-am,người mẹ 42 tuổi ở đà nẵng vội vã tra hỏi và v...
1,1,doi-song,to-am,"vợ chồng người phụ nữ ở mỹ đức, hà nội luôn tự..."
2,2,doi-song,to-am,"kati morton, nhà trị liệu hôn nhân và gia đình..."
3,3,doi-song,to-am,"anh thắng, chủ tịch hội làm vườn tỉnh nghệ an ..."
4,4,doi-song,to-am,"vừa kết thúc môn tiếng anh, phương bị sốt cao,..."


## Tokenize the Vietnamese words

In [41]:
# !pip install pyvi

In [42]:
from pyvi import ViTokenizer

In [45]:
processed_news['tag'] = processed_news['tag'].apply(lambda x: ViTokenizer.tokenize(x))
processed_news['tag'] = processed_news['tag'].apply(lambda x: cleandt.remove_stopword(x, './src/vietnamese-stopwords.txt'))

In [56]:
processed_news

Unnamed: 0,article_id,topic,sub-topic,tag
0,0,doi-song,to-am,"muốn , cha_mẹ dạy con cách_trở_nên mạnh_mẽ . n..."
1,1,doi-song,to-am,"tháng 7 / 2022 , thư thanh , quê ở giang tô , ..."
2,2,doi-song,to-am,"trần , 49 tuổi , người điều_hành "" chung_cư cầ..."
3,3,doi-song,to-am,"gong , điều_hành một công_ty tài_chính , nảy_s..."
4,4,doi-song,to-am,bé an khuê mới sinh tháng 7 / 2021 hoàn_toàn g...
...,...,...,...,...
270,270,thoi-su,giao-thong,nội_dung đại_diện công_an tỉnh nghệ_an đưa cuộ...
271,271,thoi-su,giao-thong,"khoảng 9h , điểm giao quốc_lộ 9 cao_tốc cam_lộ..."
272,272,thoi-su,giao-thong,đường 70 đoạn nối quận hà đông huyện thanh trì...
273,273,thoi-su,giao-thong,cao_tốc bắc_nam tp hcm đến nha trang nối thông...


## Dump each tag to a text file

In [47]:
PROCESSED_FOLDER = './data/vnexpress/processed_news'

In [48]:
os.listdir(PROCESSED_FOLDER)

['doi-song', 'thoi-su']

In [49]:
def get_info(topic):
  temp = processed_news[processed_news.topic == topic]

  return temp['article_id'].to_list(), temp['tag'].to_list()

In [52]:
for topic in tqdm(os.listdir(PROCESSED_FOLDER)):
  articles_ids, tags = get_info(topic)
  # print(articles_ids)
  for id, tag in zip(articles_ids, tags):
    with open(f'{PROCESSED_FOLDER}/{topic}/{id}.txt', "w", encoding="utf-8") as file:
      file.write(tag)

100%|██████████| 2/2 [00:00<00:00, 13.51it/s]


In [11]:
raw_news.to_csv('./data/vnexpress/csv/vnexpress.csv')