In [18]:
from tqdm import tqdm
import json
import os
import numpy as np
import pandas as pd

# Section 1: Method 1 Split Text

In [18]:
import ray
ray.init()

2025-02-01 15:01:30,956	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8267 [39m[22m


0,1
Python version:,3.11.7
Ray version:,2.37.0
Dashboard:,http://127.0.0.1:8267


In [None]:
if os.getcwd() == '/root':
    new_path = "/root/0_Thesis/0_final/"
    os.chdir(new_path)
else:
    os.chdir("..") 
print(os.getcwd())

In [None]:
PATH = "data/url/eng"
owilix_ds = []
for root, _, files in os.walk(PATH):
    for file in files:
        if file.endswith(".json"):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    owilix_ds.extend(data)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON file {file_path}: {e}")
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

In [25]:
owilix_df = pd.DataFrame(owilix_ds, columns=['url', 'plain_text'])
print(owilix_df.head(2))
print(owilix_df.shape)

                                                 url  \
0  https://www.panicposters.com/products/james-bo...   
1             https://chha-bc.org/forums/reply/40113   

                                          plain_text  
0  Skip to content\n  • Home\n  • About Us\n  • S...  
1  Hearing accessibility is our passion and we ar...  
(2630644, 2)


In [6]:
def process_text(value):
    text = value['plain_text']
    url = value['url']
    texts = text.split("\n\n")
    unique_texts = np.unique(texts)
    unique_texts = [comment.strip() for comment in unique_texts]
    return_texts = []
    for text in unique_texts:
        if "/" not in text:
            if len(text) >= 50 and len(text) <= 200 and text.count("\n") <= 4:
                return_texts.append(text)
    
    return [return_texts, url]

In [7]:
@ray.remote
def process_owilix_df(df):
    cmts  = df.apply(process_text, axis=1)
    cmt_url = []
    for cmtl in cmts:
        for cmt in cmtl[0]:
            cmt_url.append([cmt, cmtl[1]])
    return cmt_url

In [8]:
batch_size_big = 100000
batch_size_small = 5000
futures = []
cmt_url = []
for index in range(0, int(len(owilix_df)/batch_size_big) + 1): 
    owilix_small_1 =  owilix_df.iloc[index * batch_size_big: (index+1)*batch_size_big] 
    for index2 in range(0, int(len(owilix_small_1)/batch_size_small) + 1):
        owilix_small = owilix_small_1.iloc[index2 * batch_size_small: (index2+1)*batch_size_small] 
        futures.append(process_owilix_df.remote(owilix_small))
    while futures:
        done, futures = ray.wait(futures, num_returns=1, timeout=1)
        for future in done:
            results = ray.get(future)
            cmt_url.extend(results)

In [9]:
cmts_df = pd.DataFrame(cmt_url, columns=['text', 'url'])
cmts_df.shape

(20426869, 2)

In [10]:
cmts_df = cmts_df.drop_duplicates('text')
cmts_df.shape

(8191439, 2)

In [11]:
cmts_df.iloc[0]

text    Informationen über den Umgang mit deinen persö...
url     https://ziviforum.com/phpbb/ucp.php?mode=regis...
Name: 0, dtype: object

In [None]:
# cmts_df.to_csv("eng_raw_owilix.csv")
# cmts_df.to_csv("vie_raw_owilix.csv")
# cmts_df.to_csv("deu_raw_owilix.csv")


# Section 2: Method 2: scheme.org

In [2]:
df = pd.read_csv("vie_raw_owilix.csv")

In [31]:
df.shape

(1177527, 3)

In [3]:
df = df.drop_duplicates('url')
df.shape

(122761, 3)

In [4]:
urls = df["url"].tolist()

In [8]:
import requests
from bs4 import BeautifulSoup
from extruct.jsonld import JsonLdExtractor
import signal

types = [
    "comment",
    "blogposting",
    "discussionforumposting",
    "socialmediaposting",
    "review",
    "question",
    "answer",
    "newsarticle",
    "userfeedback",
    "reply",
    "post",
    "tweet",
    "statusupdate",
    "creativework",
    "webpage"
]
def extract_text_fields(data):
    text_fields = []
    def traverse_json(obj):
        if isinstance(obj, dict):
            obj_type = obj.get('@type', '').lower()
            if any(t in obj_type for t in types):
                text = obj.get('text', None)
                if text:
                    # print(text)
                    text_fields.append(text)
            for key, value in obj.items():
                traverse_json(value)
        elif isinstance(obj, list):
            for item in obj:
                traverse_json(item)
    traverse_json(data)
    return text_fields

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# @ray.remote
def get_microdata_from_url(url):
    try:
        # print(url)
        # signal.alarm(10000)
        
        response = requests.get(url, timeout = 10)
        soup = BeautifulSoup(response.content, 'html.parser')
        extractor = JsonLdExtractor()
        jsonld_data = extractor.extract(soup.prettify())
        # with requests.get(url[0], headers=headers, timeout=5, stream=True) as response:
        #     response.raise_for_status()  # Ensure we catch HTTP errors
        #     soup = BeautifulSoup(response.content, 'html.parser')  # Parse HTML
        #     extractor = JsonLdExtractor()
        #     jsonld_data = extractor.extract(str(soup))

        checked = True
        for entry in jsonld_data:
            # print(jsonld_data)
            if ('schema.org' in entry.get('context', '') or 
                'schema.org' in entry.get('@context', '')):
                checked = True
        all_text = []
        if checked:
            for entry in jsonld_data:
                all_text.extend(extract_text_fields(entry))
        return [all_text, url]
    
    except Exception as e:
        return [[], url]
#  /
    return []


In [9]:
from concurrent.futures import ThreadPoolExecutor, as_completed
cmts = []
save_index = 1
with ThreadPoolExecutor(max_workers=30) as executor:
    # Submit tasks
    future_to_url = {executor.submit(get_microdata_from_url, url): url for url in urls}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            result = future.result()
            if len(result[0]) != 0:
                print(len(cmts), end="\r", flush=True)
                cmts += result[0]
                if int(len(cmts) / 2000) == save_index:
                    save_index += 1 
                    with open("raw_cmts_vie8.json", 'w') as file:
                        json.dump(cmts, file)
        except Exception as e:
            a = 3

  k = self.parse_starttag(i)


21

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


57

  soup = BeautifulSoup(response.content, 'html.parser')
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


82

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


106

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


122

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


300

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


309

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


312

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


708

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


885

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


886

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


947

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1021

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1071

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1076

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1079

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1098

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1107

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1135

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1146

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1241

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1320

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1322

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1330

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1420

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1443

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1490

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1669

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1695

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1764

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1917

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1980

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2018

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2073

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2172

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2192

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2206

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2226

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2230

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2355

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2370

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2429

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2436

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2504

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2529

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2560

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2616

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2635

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2821

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2838

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2871

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2952

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2976

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3046

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3110

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3231

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3498

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3510

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3544

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3562

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3653

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3862

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4099

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4114

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4120

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4156

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4169

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4248

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4440

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4637

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4716

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4817

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4975

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4992

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4996

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5122

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5167

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5277

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5296

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5320

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5415

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5510

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5521

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5651

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5665

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5827

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5875

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5974

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6014

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6039

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6378

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6397

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6833

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6887

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7008

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7076

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7153

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7164

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7186

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7197

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7215

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7296

In [13]:
cmts[1000]

'Xin hỏi cao nhân chút.\nCác phần mềm tool toy trước giờ được viết trên kiến trúc X86-64 của Intel. Việc chuyển qua PC/Laptop dùng AMD có gặp vấn đề gì không nhỉ?'

In [15]:
df = pd.DataFrame(cmts, columns=['text'])

In [17]:
df.to_csv("VIE_M2_RAW.csv", index=False)

In [13]:
with open("raw_cmts_eng8.json", 'w') as file:
    json.dump(cmts, file)

In [13]:
ray.shutdown()