# Task 2: Preprocessing a chosen news article

In [None]:
import sparknlp 

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use so the run can be reproduced later.
for _label, _value in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(_label, _value)

Spark NLP version:  3.3.4
Apache Spark version:  3.0.3


In [None]:
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *

## 1. Remove all company names that appear in the chosen article
(Please do not hard-code the company names.)

In [None]:
# Load a pretrained Spark NLP entity-recognition pipeline; its entity output is
# what the company names are taken from further down.
pipeline = PretrainedPipeline(name='onto_recognize_entities_sm', lang='en')

onto_recognize_entities_sm download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [None]:
# load the chosen article for pre-processing
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
with open("forbes/outputs/Chinese AI Giant SenseTime Reopens $770 Million IPO One Week After U.S. Blacklist.txt", encoding="utf8") as f:
    text = f.read()

# show text of txt file
text


'Visitors check out the booth belonging to SenseTime at the 2021 World Artificial Intelligence  Conference in Shanghai on July 7, 2021.Chinese artificial intelligence giant SenseTime is pushing ahead with its initial public offering in Hong Kong despite being placed on a U.S. investment blacklist just one week ago, and the company is keeping its price range unchanged. SenseTime is still seeking to sell 1.5 billion shares at a range of HK$3.85 to HK$3.99 apiece, the company said in a filing to the Hong Kong Stock Exchange. SenseTime, which is based in both Shanghai and Hong Kong, has updated its list of cornerstone investors that now includes state-owned Shanghai Xuhui Capital, Taizhou Culture & Tourism and HKSTP Venture Fund. The cornerstone investors have agreed to subscribe for $512 million worth of SenseTime shares, which amounts to about 67% of the deal. SenseTime will set its final IPO price on Thursday, and it plans to start trading on December 30. “Demand for the company’s IPO i

In [None]:
# Run the pretrained pipeline on the raw article text.
result = pipeline.annotate(text)

# Show which annotation outputs the pipeline produced.
list(result)

['entities', 'document', 'token', 'ner', 'embeddings', 'sentence']

In [None]:
# visualize the NER results
from sparknlp_display import NerVisualizer

# Run the pipeline on a one-row DataFrame holding the article and keep the
# single resulting Row, which is what gets handed to the visualizer.
article_df = spark.createDataFrame([[text]]).toDF("text")
annotated_row = pipeline.transform(article_df).collect()[0]

NerVisualizer().display(
    result=annotated_row,
    label_col='entities',
    document_col='document',
)

In [None]:
# get company names which are needed to remove.
# fullAnnotate keeps the metadata of every entity chunk, so each chunk can be
# filtered by its own label. Matching ORG-tagged tokens against *all* entities
# by substring also collected non-company entities (e.g. 'Hong Kong', 'Chinese')
# whenever they shared a word with an organisation name.
# NOTE: this runs the pipeline on the article a second time; acceptable for a
# single document.
annotated = pipeline.fullAnnotate(text)[0]
companies = [
  chunk.result
  for chunk in annotated['entities']
  if chunk.metadata['entity'] == 'ORG'
]

In [None]:
# Display each detected name once (duplicates collapsed through a set).
[*set(companies)]

['Treasury Department',
 'the Chinese University of Hong Kong.',
 'Xuhui Capital, Taizhou Culture & Tourism',
 '2021.Chinese',
 'HKSTP Venture Fund.',
 'SenseTime’s',
 'SenseTime',
 'Chinese',
 'Hong Kong,',
 'Singapore-based DZT Research.',
 'Hong Kong']

In [None]:
# remove all company names
# Longest names are removed first so that a short name (e.g. 'SenseTime') is
# never stripped out of a longer one (e.g. 'SenseTime’s') before the longer one
# is processed, which would leave fragments behind. The secondary alphabetical
# key makes the order, and therefore the result, identical on every run.
text_without_companies = text
for company in sorted(set(companies), key=lambda name: (-len(name), name)):
  text_without_companies = text_without_companies.replace(company, "")
text_without_companies

'Visitors check out the booth belonging to  at the 2021 World Artificial Intelligence  Conference in Shanghai on July 7,  artificial intelligence giant  is pushing ahead with its initial public offering in  despite being placed on a U.S. investment blacklist just one week ago, and the company is keeping its price range unchanged.  is still seeking to sell 1.5 billion shares at a range of HK$3.85 to HK$3.99 apiece, the company said in a filing to the  Stock Exchange. , which is based in both Shanghai and  has updated its list of cornerstone investors that now includes state-owned Shanghai  and  The cornerstone investors have agreed to subscribe for $512 million worth of  shares, which amounts to about 67% of the deal.  will set its final IPO price on Thursday, and it plans to start trading on December 30. “Demand for the company’s IPO is still there, despite U.S. investors being banned from investing into it, ” says Ke Yan, head of research at  The company, however, warned in an updated

## 2. Tokenization, Normalization, Stemming, Lemmatization and Removing stop words

In [None]:
!wget -q https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

In [None]:
# Entry stage: reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Splits each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Lower-cases the tokens. No cleanup pattern is set here, so the library default
# applies (presumably strips non-letter characters; confirm against the docs).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)

# Stemming is applied to the raw tokens, not to the normalized ones.
stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# Dictionary-based lemmatization, also on the raw tokens, using the
# AntBNC lemma file downloaded into the working directory.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")
)

# Case-insensitive stop-word removal on the raw tokens. A custom list could be
# supplied with .setStopWords([...]), e.g. words read from a txt file.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("token")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Every stage after the tokenizer reads the "token" column, so the normalized,
# stem, lemma and cleanTokens columns are independent views of the same tokens.
nlp_pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        stemmer,
        lemmatizer,
        stopwords_cleaner,
    ]
)

# Fit on a one-row DataFrame holding an empty string to obtain a PipelineModel
# that can transform the article.
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlp_pipeline.fit(empty_df)

In [None]:
# Put the company-free article into a one-row DataFrame with a "text" column,
# which is the column the pipeline's first stage reads.
article_rows = [[text_without_companies]]
df = spark.createDataFrame(article_rows).toDF("text")

# Apply the fitted pipeline and preview the annotation columns it adds.
result_df = pipelineModel.transform(df)
result_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|          normalized|                stem|               lemma|         cleanTokens|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Visitors check ou...|[[document, 0, 21...|[[token, 0, 7, Vi...|[[token, 0, 7, vi...|[[token, 0, 7, vi...|[[token, 0, 7, Vi...|[[token, 0, 7, Vi...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+

