In [1]:
import json

from ast import literal_eval
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import pyarrow as pa

from tqdm import tqdm

tqdm.pandas()

In [2]:
DATASET_DIR = "../dataset"
DATASET_NAME = "medium_articles.csv"
DATASET_PATH = Path(DATASET_DIR) / DATASET_NAME

assert Path(DATASET_PATH).exists(), "Dataset not found."

df = pd.read_csv(DATASET_PATH)

In [3]:
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [4]:
df.count()

title        192363
text         192368
url          192368
authors      192368
timestamp    192366
tags         192368
dtype: int64

In [5]:
df.columns

Index(['title', 'text', 'url', 'authors', 'timestamp', 'tags'], dtype='object')

In [6]:
tags_df = df.tags.progress_apply(literal_eval)
tags_df.head()

100%|████████████████████████████████| 192368/192368 [00:02<00:00, 83174.52it/s]


0    [Mental Health, Health, Psychology, Science, N...
1    [Mental Health, Coronavirus, Science, Psycholo...
2    [Biotechnology, Neuroscience, Brain, Wellness,...
3    [Health, Neuroscience, Mental Health, Psycholo...
4    [Brain, Health, Development, Psychology, Science]
Name: tags, dtype: object

In [7]:
labels = set()

tags_df.progress_apply(
    lambda x: labels.update(x)
)

len(labels)

100%|███████████████████████████████| 192368/192368 [00:00<00:00, 985983.32it/s]


78638

In [8]:
label_vals = list(filter(bool, labels))
label_df = pd.DataFrame(
    {
        "index": range(1, len(label_vals) + 1),
        "label": label_vals
    },
)
label_df = label_df[~label_df["label"].str.lower().duplicated()]
label_df = pd.DataFrame(
    {
        "index": range(1, label_df["label"].count() + 1),
        "label": label_df["label"].tolist()
    },
)

In [9]:
label_dataset_path = Path(DATASET_DIR) / "labels.parquet"
label_df.to_parquet(label_dataset_path)

In [10]:
label_df = pd.read_parquet(label_dataset_path)
label_df[label_df["index"] == 1].label

0    Apache Httpd
Name: label, dtype: object

In [11]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [12]:
label_vals[-100:]

['Eytech',
 'Used Cars For Sale',
 'Connector User',
 'Deep Dives',
 'Uncomfortable Joy',
 '100dayproject',
 'Impotence',
 'Pearson',
 'Discount Vouchers',
 'Facebook Notes',
 'Fluttering Up',
 'Management Tips',
 'Unhcr',
 'Discipleship',
 'Cancer Survivor',
 'Apprenticeship',
 'Voices',
 'Eventfiringwebdriver',
 'Value Time Matrix',
 'Wound Care Biologics',
 'Grades',
 'MVP',
 'Things I Wish I Knew',
 'Watertech',
 'Preterm Birth',
 'Timothy Patrick Lloyd',
 '盆栽',
 'New Hampshire',
 'Lazy',
 'Perceptron Algorithm',
 'License Compliance',
 'Screen Recording',
 'Onlineshoppinginpakistan',
 'Creative Economy',
 'B2b Lead Generation Usa',
 'Murder For Hire',
 'Ugly Christmas',
 'Ideasworthsharing',
 'Accordions',
 'Markets In Action',
 'True Wealth',
 'Angular Ui Design',
 'Crypto Wallet',
 'New Tech Startup',
 'Underserved Communities',
 'Oversensitivity',
 'Mandy Patinkin',
 'Best Security System',
 'Vacations',
 'Pwa',
 'Tech Awards',
 'Package Manager',
 'Third Way',
 'Borderline Per

In [13]:
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")

model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

model.to(device)


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

In [14]:
inputs = tokenizer("Trade S", return_tensors="pt")
for k, v in inputs.items():
    inputs[k] = v.to(device)
outputs = model(**inputs)

In [15]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0156,  0.6120, -1.2223, -0.5868, -0.3044,  0.3479,  1.4802,  0.4988,
         -1.1956,  0.7597, -0.3993,  2.8324, -0.3252,  1.8733, -1.2812, -1.2442,
         -0.1646, -0.3289,  0.6719, -0.9912]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [16]:
from transformers import TextClassificationPipeline

# mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
# device = torch.device("cpu")

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

In [17]:
pipe("Mental Health")

[{'label': 'en', 'score': 0.8340853452682495}]

In [18]:
# text_list = list(map(lambda s: s[:514], df["text"].iloc[-100:].tolist()))
# pipe(text_list)

In [19]:
from typing import List
from torch.utils.data import Dataset
from tqdm.auto import tqdm

en_labels = []

class EnLabelDataset(Dataset):
    def __init__(self, data: List[str]) -> None:
        self.data = data
        
    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> str:
        return self.data[idx]

dataset = EnLabelDataset(label_vals)

for out in tqdm(pipe(dataset, batch_size=1000), total=len(dataset)):
    en_labels.append(out)

  0%|          | 0/78637 [00:00<?, ?it/s]

In [20]:
label_df = pd.DataFrame(en_labels).rename(
    columns={"label": "lang_label", "score": "lang_score"},
).assign(
    label=label_vals,
    index=range(1, len(label_vals) + 1),
)

In [21]:
label_dataset_path = Path(DATASET_DIR) / "en_labels.parquet"
label_df.to_parquet(label_dataset_path)

In [22]:
label_df

Unnamed: 0,lang_label,lang_score,label,index
0,hi,0.718276,Apache Httpd,1
1,en,0.929937,Over 50,2
2,en,0.925368,Air Fryer,3
3,pt,0.526477,Adar Poonawalla,4
4,en,0.621760,Macabre,5
...,...,...,...,...
78632,en,0.964156,Coffee Freshness,78633
78633,en,0.992965,Conscious Consumption,78634
78634,tr,0.372506,Enterprise Erp,78635
78635,en,0.990927,Wearables,78636


In [23]:
result = []

In [24]:
import re
from bs4 import BeautifulSoup

def clean_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = re.sub(r'https?://\S+', ' ', text) # 去除網址
    text = re.sub(r'[^a-zA-Z0-9 .,:;\'\"\(\)\[\]\{\}]', ' ', text) # 去除非字母、非數字和非空白字符
    return text

df["text"] = df["text"].progress_apply(clean_text)

100%|████████████████████████████████| 192368/192368 [00:18<00:00, 10405.53it/s]


In [25]:
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash Merry Christ...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus A guide to the curi...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose How smell training can change ...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You ve heard of him, haven t you Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [26]:
from typing import List
from torch.utils.data import Dataset
from tqdm.auto import tqdm


class SentenceDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame) -> None:
        self.data = dataframe["text"]
        
    def __len__(self) -> int:
        return self.data.count()

    def __getitem__(self, idx: int) -> str:
        return self.data.iloc[idx][:514]

dataset = SentenceDataset(df)

for out in tqdm(pipe(dataset, batch_size=100), total=len(dataset)):
    result.append(out)


  0%|          | 0/192368 [00:00<?, ?it/s]

In [32]:
result_df = pd.DataFrame(result)

In [33]:
result_df

Unnamed: 0,label,score
0,en,0.964118
1,en,0.962439
2,en,0.957879
3,en,0.915634
4,en,0.914334
...,...,...
192363,en,0.955198
192364,en,0.993724
192365,en,0.984959
192366,en,0.978811


In [34]:
concat_df = pd.concat([df, result_df], axis=1)

In [35]:
new_dataset_path = Path(DATASET_DIR) / "medium_articles.parquet"
concat_df.to_parquet(new_dataset_path)

In [36]:
concat_df.iloc[1000]["text"]

'Every teacher and professor had their share of strain brought by the ongoing COVID 19 pandemic.  Through listening, adaptation, and planning, we made it to the finish line with quite a bit of energy with my students this semester.  There was one key decision that helped me have a very smooth remote teaching experience: preparing a front loaded syllabus.  I made that choice consciously, but of course without a prediction about the implications of the then developing pandemic. I aimed to empower students with a broad spectrum of tools early in the semester and use the rest of the time for individual project development.  This strategy ended up working well for remote teaching.  Shaping a classroom through Agency, Adaptation, and Tooling  Over the years, I observed that front loaded classes and workshops fared much better in many terms. Combined with a longer exploration process, in the end, they turned into a teaching formula for achieving high yield, high quality results.  I prepared t