In [26]:
import pandas as pd
import os
from tqdm import tqdm
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
module_path
if module_path not in sys.path:
    sys.path.append(module_path+"\\src\\dtprocess")
import cleandt

In [27]:
# from google.colab import drive
# drive.mount('/content/drive')

In [28]:
# %cd /content/drive/My Drive/Colab Notebooks/Text_Generator

In [29]:
# %ls -la

## Transform the raw data to usable text

* Load raw data from source

In [30]:
CRAWL_FOLDER = '../data/vnexpress/raw_news'
vnexpress = []

for filename in os.listdir(CRAWL_FOLDER):

    with open('../data/vnexpress/raw_news/{}'.format(filename), 'r') as file:
        news = file.readlines()
        vnexpress += cleandt.convert_dict(news, 'content')

In [31]:
news = pd.DataFrame(columns=['content','url','topic', 'sub-topic', 'image', 'title','description'])

for new in vnexpress:
    news.loc[len(news)] = pd.Series(new)
news = news.reset_index().rename(columns={'index':'article_id'})

In [32]:
processed_news = news[['article_id','content','topic','sub-topic','title','description']]
print(processed_news.shape)
processed_news.head()

raw_news = processed_news.copy()

(2481, 6)


In [33]:
def lower_case(x):
  try:
    x = x.lower()
  except Exception as ex:
    pass
  return x

for col in processed_news.select_dtypes(include='object').columns:
  processed_news[col] = processed_news[col].apply(lambda x: lower_case(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news[col] = processed_news[col].apply(lambda x: lower_case(x))


* Find null values and deal with

In [34]:
processed_news.isna().sum()

article_id     0
content        0
topic          0
sub-topic      0
title          2
description    7
dtype: int64

In [35]:
processed_news.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news.fillna('', inplace=True)


* Merge columns into a single `tag` column

In [36]:
processed_news['tag'] = processed_news['content'] + processed_news['title'] + processed_news['description'] \
  + processed_news['topic'] + processed_news['sub-topic']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_news['tag'] = processed_news['content'] + processed_news['title'] + processed_news['description'] \


In [37]:
processed_news = processed_news.drop(columns=['content','description','title'])

In [38]:
processed_news.head()

Unnamed: 0,article_id,topic,sub-topic,tag
0,0,doi-song,to-am,"""xin lỗi đi"". nhiều thế hệ đã dùng câu này để ..."
1,1,doi-song,to-am,một nghiên cứu của đại học wisconsin-madison (...
2,2,doi-song,to-am,"các yếu tố như độ tuổi của trẻ, mối quan hệ gi..."
3,3,doi-song,to-am,ở tuổi đôi mươi chị trần thị liên kết hôn với ...
4,4,doi-song,to-am,"chị hoàng thị hòa, 34 tuổi, ở hà nam lấy chồng..."


## Tokenize the Vietnamese words

In [39]:
# !pip install pyvi

In [40]:
from pyvi import ViTokenizer

In [43]:
processed_news['tag'] = processed_news['tag'].apply(lambda x: ViTokenizer.tokenize(x))
processed_news['tag'] = processed_news['tag'].apply(lambda x: cleandt.remove_stopword(x, '../src/vietnamese-stopwords.txt'))

In [44]:
processed_news

Unnamed: 0,article_id,topic,sub-topic,tag
0,0,doi-song,to-am,""" xin_lỗi đi "" . thế_hệ dùng câu xử_lý trẻ chơ..."
1,1,doi-song,to-am,một nghiên_cứu đại_học wisconsin - madison ( m...
2,2,doi-song,to-am,"yếu_tố độ tuổi trẻ , mối quan_hệ giữa cha_mẹ ,..."
3,3,doi-song,to-am,ở tuổi đôi_mươi chị trần thị liên_kết_hôn anh ...
4,4,doi-song,to-am,"chị hoàng thị_hòa , 34 tuổi , ở hà nam lấy chồ..."
...,...,...,...,...
2476,2476,the-thao,tuong-thuat,"* ghi_bàn : martens 8 ' , snoeijs 11 ' , brugt..."
2477,2477,the-thao,tuong-thuat,"ghi_bàn : encarnacao 7 ' , nazareth 21 ' . trậ..."
2478,2478,the-thao,tuong-thuat,"* ghi_bàn : smith 14 , 45 , horan 77 đối_thủ s..."
2479,2479,the-thao,tuong-thuat,"cầu_thủ hai đội khởi_động xong , ban tổ_chức t..."


## Dump each tag to a text file

In [45]:
PROCESSED_FOLDER = '../data/vnexpress/processed_news'

In [46]:
os.listdir(PROCESSED_FOLDER)

['doi-song', 'du-lich', 'giai-tri', 'giao-duc', 'khoa-hoc', 'the-thao']

In [47]:
def get_info(topic):
  temp = processed_news[processed_news.topic == topic]

  return temp['article_id'].to_list(), temp['tag'].to_list()

In [48]:
for topic in tqdm(os.listdir(PROCESSED_FOLDER)):
  articles_ids, tags = get_info(topic)
  # print(articles_ids)
  for id, tag in zip(articles_ids, tags):
    with open(f'{PROCESSED_FOLDER}/{topic}/{id}.txt', "w", encoding="utf-8") as file:
      file.write(tag)

100%|██████████| 6/6 [00:01<00:00,  4.42it/s]


In [49]:
raw_news.to_csv('../data/vnexpress/csv/vnexpress.csv')