In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# automatically reload edited modules
%load_ext autoreload
%autoreload 2

# Show the current working directory; the relative "data/..." paths used by the
# loading cells are resolved against it
%pwd

In [2]:
import csv
import re

# A row is dropped by the loaders when its domain CONTAINS any of these strings
# (substring match, so sub-domains of the listed sites are dropped as well).
skip_domains = ["siteindices.com", "cutestat.com", "clearwebstats.com"]  # List of domains to skip
# Raise the csv module's per-field size limit; presumably needed because the
# html/text columns hold whole pages -- TODO confirm against the data files.
csv.field_size_limit(10 * 1024 * 1024)  # Set the field size limit to 10 MB

def data_to_text_label(data):
    """Split processed records into model inputs and labels.

    Args:
        data: Sequence of record dicts, each holding the string fields
            "domain", "title", "keyword", "url" and "text" plus a "target".

    Returns:
        A ``(texts, labels)`` tuple of two lists in the same order as ``data``.
        Each text is the five string fields joined by single spaces, in the
        order domain, title, keyword, url, text; each label is the record's
        "target" value.
    """
    text_fields = ("domain", "title", "keyword", "url", "text")

    # First pass: build one space-joined string per record.
    texts = []
    for record in data:
        texts.append(" ".join(record[field] for field in text_fields))

    # Second pass: collect the matching targets.
    labels = []
    for record in data:
        labels.append(record["target"])

    return texts, labels


# Collapses every run of two or more spaces into a single space.
_MULTI_SPACE_RE = re.compile(r" {2,}")
# Matches a whole line (including its trailing newline) made of one to four
# space-separated tokens of word characters, "/" or "-", followed by a colon and
# the rest of the line. Presumably these are "Key: value" metadata lines in the
# extracted page text -- TODO confirm against the crawler output.
_METADATA_LINE_RE = re.compile(
    r"^ {0,}[\w/-]+(?: {1,}[\w/-]+)?(?: {1,}[\w/-]+)?(?: {1,}[\w/-]+)?:(.*)\n",
    flags=re.MULTILINE,
)
# Column layout: ,domain,title,keyword,url,crawledDate,aliveDate,isPMS,html,text
_EXPECTED_COLUMNS = 10


def process_data(csv_reader, domains_to_skip=None):
    """Turn labeled CSV rows into cleaned record dicts, one per domain.

    Each row is expected to have the columns
    ``index, domain, title, keyword, url, crawledDate, aliveDate, isPMS, html, text``.

    A row is skipped when:
      * it has fewer than ten columns (e.g. the empty list ``csv.reader``
        yields for a blank line) -- previously this raised ``IndexError``;
      * it is a repeated header row (``domain``/``title`` in columns 1 and 2);
      * its text column is empty;
      * its domain contains one of the skip strings;
      * its ``isPMS`` label is empty;
      * a row for the same domain has already been kept (first one wins).

    Args:
        csv_reader: Iterable of rows (lists of strings), header already consumed.
        domains_to_skip: Optional iterable of domain substrings to drop.
            Defaults to the module-level ``skip_domains`` list.

    Returns:
        List of dicts with keys ``index, domain, title, keyword, url,
        crawledDate, aliveDate, isPMS, html, text, target`` where ``text`` is the
        cleaned text and ``target`` is ``int(isPMS)``.

    Raises:
        ValueError: If a non-empty ``isPMS`` value is not an integer literal.
    """
    # Only fall back to the notebook-level list when the caller gave none.
    blocked = skip_domains if domains_to_skip is None else domains_to_skip

    data = []
    seen_domains = set()

    for row in csv_reader:
        if len(row) < _EXPECTED_COLUMNS:
            continue
        # Header line repeated inside the file (e.g. concatenated exports).
        if row[1] == "domain" and row[2] == "title":
            continue

        (index, domain, title, keyword, url,
         crawled_date, alive_date, isPMS, html, raw_text) = row[:_EXPECTED_COLUMNS]

        if raw_text == "":
            continue
        if any(skip_domain in domain for skip_domain in blocked):
            continue
        if isPMS == "":
            continue
        target = int(isPMS)

        # Deduplicate before the regex work so repeated domains cost nothing.
        if domain in seen_domains:
            continue
        seen_domains.add(domain)

        text = _MULTI_SPACE_RE.sub(" ", raw_text)
        text = _METADATA_LINE_RE.sub("", text)

        data.append({
            "index": index,
            "domain": domain,
            "title": title,
            "keyword": keyword,
            "url": url,
            "crawledDate": crawled_date,
            "aliveDate": alive_date,
            "isPMS": isPMS,
            "html": html,
            "text": text,
            "target": target,
        })

    return data
    



# Labeled data (manually labeled)

In [3]:
# Manually labeled corpus: load it, build texts/labels, and report the class balance.
data_dict = {}

CORPUS_PATH = "data/manual_labeled.csv"

with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # discard the header row
    data_manual = process_data(csv_reader)

texts, labels = data_to_text_label(data_manual)
data_dict.update(CH={"texts": texts, "labels": labels})

# Tally each class; labels are the integer targets produced by process_data.
count_0, count_1 = (labels.count(value) for value in (0, 1))
print("Count of 0:", count_0)
print("Count of 1:", count_1)
print("Total manual labeled data:", count_0 + count_1)

Count of 0: 313
Count of 1: 254
Total manual labeled data: 567


# Labeled data (manually and model-assisted labeled)

In [4]:
# Manual + self-training corpus: load it, build texts/labels, and report the class balance.
data_dict = {}

CORPUS_PATH = "data/manual_and_self_train.csv"

with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # discard the header row
    data = process_data(csv_reader)

texts, labels = data_to_text_label(data)
data_dict.update(CH={"texts": texts, "labels": labels})

# Tally each class; labels are the integer targets produced by process_data.
count_0, count_1 = (labels.count(value) for value in (0, 1))
print("Count of 0:", count_0)
print("Count of 1:", count_1)
print("Total labeled data:", count_0 + count_1)


Count of 0: 2965
Count of 1: 2794
Total labeled data: 5759


# All data

In [5]:
# Load every crawled page, labeled or not, keeping the first usable row per domain.
# NOTE(review): `data_dict` is reset here and never refilled in this cell, so the
# texts/labels stored by the labeled-data cell are discarded -- confirm this is intended.
data_dict = {}

CORPUS_PATH = "data/all_data.csv"

data_all = []
seen_domains = set()

with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip the header line; the default avoids StopIteration on an empty file
    for row in csv_reader:
        # Columns: ,domain,title,keyword,url,crawledDate,aliveDate,isPMS,html,text
        if len(row) < 10:
            continue
        # Skip header lines repeated inside the file. The row is already parsed
        # into fields, so compare the fields themselves: a membership test for
        # the raw string ",domain,title" can never match a parsed header row.
        if row[1] == "domain" and row[2] == "title":
            continue

        index = row[0]
        raw_text = row[9]
        if raw_text == "":
            continue

        domain = row[1]
        if any(skip_domain in domain for skip_domain in skip_domains):
            continue

        # Keep only the first usable row per domain; checking before the regex
        # work avoids cleaning text that would be thrown away.
        if domain in seen_domains:
            continue
        seen_domains.add(domain)

        # Unlike the labeled loaders, rows without a label are kept here, so the
        # target stays the raw isPMS string (possibly "") instead of an int.
        isPMS = row[7]
        target = isPMS

        # Collapse runs of spaces, then drop lines of the form "<1-4 tokens>: ...".
        text = re.sub(r" {2,}", " ", raw_text)
        text = re.sub(r"^ {0,}[\w/-]+(?: {1,}[\w/-]+)?(?: {1,}[\w/-]+)?(?: {1,}[\w/-]+)?:(.*)\n", "", text,
                      flags=re.MULTILINE)

        data_all.append({
            "index": index,
            "domain": domain,
            "title": row[2],
            "keyword": row[3],
            "url": row[4],
            "crawledDate": row[5],
            "aliveDate": row[6],
            "isPMS": isPMS,
            "html": row[8],
            "text": text,
            "target": target
        })


print("Total data:", len(data_all))

Total data: 14254


**Disclaimer**: The data is collected from the internet and may contain inappropriate content. The data is used for research purposes only. The data is not suitable for commercial use. Please use the data at your own discretion.