In [64]:
import csv
import nltk
# Fetch the NLTK stop-word lists; the stanza section reads the English one.
nltk.download("stopwords")

import pandas as pd
import spacy
# Download the small English spaCy model that is loaded later with spacy.load.
spacy.cli.download("en_core_web_sm")

import stanza
# Only the tokenizer and lemmatizer are downloaded; no other stanza processor
# is requested by this notebook.
stanza.download('en', processors='tokenize,lemma')

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jean/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 42.0MB/s]                    
2020-11-22 21:10:23 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| lemma     | ewt     |

2020-11-22 21:10:23 INFO: File exists: /home/jean/stanza_resources/en/tokenize/ewt.pt.
2020-11-22 21:10:23 INFO: File exists: /home/jean/stanza_resources/en/lemma/ewt.pt.
2020-11-22 21:10:23 INFO: Finished downloading models and saved to /home/jean/stanza_resources.


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


## Load dataset

In [59]:
dataset_file_path = "../../datasets/cbc-news-coronavirus/data.csv"

# Names of the columns kept from the CSV and the field index each one is read from.
columns = ["title", "description", "text"]
column_indices = [2, 4, 5]  # title=2, description=4, text=5

n_docs = 50

# newline="" is what the csv module expects: it lets the reader handle line
# endings itself so quoted fields that contain line breaks stay in one field.
with open(dataset_file_path, "r", newline="") as dataset_file:
    reader = csv.reader(dataset_file)

    # NOTE(review): the first CSV row is the header and is NOT skipped, so
    # raw_corpus[0] is ['title', 'description', 'text'] and is later processed
    # like a document. n_docs + 1 rows are read (header + n_docs articles),
    # which is exactly what the original counter-based loop produced; later
    # cells index rows 0..n_docs, so the row count is kept unchanged here.
    raw_corpus = [
        [row[field] for field in column_indices]
        for _, row in zip(range(n_docs + 1), reader)
    ]

print(raw_corpus[0])

corpus = pd.DataFrame(raw_corpus, columns=columns)

['title', 'description', 'text']


## Stop Words Removal and Lemmatization (spacy)

In [60]:
spacy_nlp_pipeline = spacy.load("en_core_web_sm")

# Process one column at a time: every document is lemmatized with spaCy
# (dropping stop words and digit tokens), then normalised and tokenized by
# gensim's simple_preprocess with accents removed.
tokens_by_column = []
for column in columns:
    column_tokens = []
    for doc in spacy_nlp_pipeline.pipe(corpus[column]):
        lemmas = [token.lemma_ for token in doc if not (token.is_stop or token.is_digit)]
        column_tokens.append(simple_preprocess(" ".join(lemmas), deacc=True))
    tokens_by_column.append(column_tokens)

# Transpose the per-column lists into one row per document, in `columns` order.
spacy_rows = [list(row) for row in zip(*tokens_by_column)]

spacy_corpus = pd.DataFrame(spacy_rows, columns=columns)
print(raw_corpus[0])
print(spacy_corpus.loc[0])

['title', 'description', 'text']
title                [title]
description    [description]
text                  [text]
Name: 0, dtype: object


## Stop Words Removal and Lemmatization (stanza)

In [63]:
stanza_nlp_pipeline = stanza.Pipeline("en", processors="tokenize,lemma")

# Keep the word list under its own name. Rebinding `stopwords` would replace
# the imported nltk corpus reader with a list, so running this cell a second
# time would fail on `.words(...)`. A set also makes the membership tests
# below constant-time instead of a linear scan per word.
english_stopwords = set(stopwords.words("english"))

# Process one column at a time: each document is first normalised by gensim's
# simple_preprocess (tokens shorter than 3 characters are dropped), then
# lemmatized by stanza; a word is kept only when neither its lemma nor its
# surface form is an English stop word.
lemmas_by_column = []
for column in columns:
    column_lemmas = []
    for document in corpus[column]:
        preprocessed_text = " ".join(simple_preprocess(document, min_len=3))

        column_lemmas.append([
            word.lemma
            for sentence in stanza_nlp_pipeline(preprocessed_text).sentences
            for word in sentence.words
            if word.lemma not in english_stopwords and word.text not in english_stopwords
        ])
    lemmas_by_column.append(column_lemmas)

# Transpose the per-column lists into one row per document, in `columns` order.
stanza_rows = [list(row) for row in zip(*lemmas_by_column)]

stanza_corpus = pd.DataFrame(stanza_rows, columns=columns)
print(raw_corpus[0])
print(stanza_corpus.loc[0])

2020-11-22 21:07:29 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| lemma     | ewt     |

2020-11-22 21:07:29 INFO: Use device: cpu
2020-11-22 21:07:29 INFO: Loading: tokenize
2020-11-22 21:07:29 INFO: Loading: lemma
2020-11-22 21:07:29 INFO: Done loading processors!


['title', 'description', 'text']
title                [title]
description    [description]
text                  [text]
Name: 0, dtype: object


## Comparison

In [78]:
import random

# Number of titles to show; draws are independent, so a title can repeat.
n_samples = 10

# Print randomly chosen titles next to their spaCy and stanza versions.
for _ in range(n_samples):
    # Take the bound from the data rather than a hard-coded 50, so the lookup
    # stays in range whatever the number of loaded rows is.
    idx = random.randrange(len(raw_corpus))
    print(raw_corpus[idx][0])
    print(spacy_corpus.loc[idx, "title"])
    print(stanza_corpus.loc[idx, "title"])
    print()

Sea turtle baby boom hatches amid coronavirus lockdown
['sea', 'turtle', 'baby', 'boom', 'hatch', 'amid', 'coronavirus', 'lockdown']
['sea', 'turtle', 'baby', 'boom', 'hatches', 'amid', 'coronavirus', 'lockdow']

Wuhan health officials raise death toll linked to COVID-19 by 50%
['wuhan', 'health', 'official', 'raise', 'death', 'toll', 'link', 'covid']
['wuhan', 'health', 'official', 'raise', 'death', 'toll', 'link', 'covid']

Did the WHO mishandle the global coronavirus pandemic?
['mishandle', 'global', 'coronavirus', 'pandemic']
['mishandle', 'global', 'coronavirus', 'pandemic']

The latest on the coronavirus outbreak for April 27
['late', 'coronavirus', 'outbreak', 'april']
['latest', 'coronavirus', 'outbreak', 'april']

China angered by Australian call for international inquiry into coronavirus origin
['china', 'anger', 'australian', 'international', 'inquiry', 'coronavirus', 'origin']
['china', 'angered', 'australian', 'call', 'international', 'inquiry', 'coronavirus', 'origin']

Q