## Imports

In [1]:
!pip install plotly --upgrade



In [2]:
import nltk
import pandas as pd
import plotly.express as px
import string

In [3]:
# Fetch NLTK's "popular" data collection (a no-op for packages that are
# already up to date). Presumably this supplies the tokenizer models and the
# stopword list used in later cells -- confirm against the download log.
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [4]:
## Source article: https://www.reuters.com/world/africa/safrica-says-it-is-being-punished-early-covid-variant-detection-2021-11-27/

# The whole article is held in a single triple-quoted string. The literal
# starts with a line break immediately after the opening quotes, so the string
# itself begins with '\n'; paragraphs are separated by blank lines.
article_text = '''
JOHANNESBURG, Nov 27 (Reuters) - South Africa said on Saturday it was being punished for its advanced ability to detect new COVID-19 variants early, as travel bans and restrictions imposed because of the new Omicron variant threaten to harm tourism and other sectors of the economy.

South Africa has some of the world's top epidemiologists and scientists, who have managed to detect emerging coronavirus variants and their mutations early on in their life cycle. The Omicron variant was first discovered in South Africa and has since been detected in Belgium, Botswana, Israel and Hong Kong.

"This latest round of travel bans is akin to punishing South Africa for its advanced genomic sequencing and the ability to detect new variants quicker," the Ministry of International Relations and Cooperation said.

"Excellent science should be applauded and not punished," it said in a statement.

Many nations rushed on Friday and Saturday to announce travel curbs to South Africa and other countries in the region.

The foreign ministry noted that while the new variant was also detected in other countries, the global reaction to those countries have been "starkly different" to cases in southern Africa.

The new variant was first announced on Wednesday by a team of scientists in South Africa who said they had detected a variant that could possibly evade the body's immune response and make it more transmissible.

On Friday the World Health Organization named it Omicron and designated it as a "variant of concern" - its most serious level - saying preliminary evidence suggests an increased risk of re-infection. read more

"Our immediate concern is the damage that these restrictions are causing to families, the travel and tourism industries and business," South African Foreign Minister Naledi Pandor said in the statement.

The government was engaging with countries that have imposed travel bans to persuade them to reconsider, it added.

On Friday, the WHO cautioned countries against hastily imposing travel restrictions linked to the variant, saying they should take a "risk-based and scientific approach".'''

## Tokenization

In [5]:
# Split the raw article into a list of sentence strings and display the list.
sentence_tokens = nltk.tokenize.sent_tokenize(article_text)
sentence_tokens

['\nJOHANNESBURG, Nov 27 (Reuters) - South Africa said on Saturday it was being punished for its advanced ability to detect new COVID-19 variants early, as travel bans and restrictions imposed because of the new Omicron variant threaten to harm tourism and other sectors of the economy.',
 "South Africa has some of the world's top epidemiologists and scientists, who have managed to detect emerging coronavirus variants and their mutations early on in their life cycle.",
 'The Omicron variant was first discovered in South Africa and has since been detected in Belgium, Botswana, Israel and Hong Kong.',
 '"This latest round of travel bans is akin to punishing South Africa for its advanced genomic sequencing and the ability to detect new variants quicker," the Ministry of International Relations and Cooperation said.',
 '"Excellent science should be applauded and not punished," it said in a statement.',
 'Many nations rushed on Friday and Saturday to announce travel curbs to South Africa and

In [6]:
# Lower-case the article first so that tokens differing only in case are
# counted together, then split it into word/punctuation tokens and display them.
word_tokens = nltk.tokenize.word_tokenize(
    article_text.lower()
)
word_tokens

['johannesburg',
 ',',
 'nov',
 '27',
 '(',
 'reuters',
 ')',
 '-',
 'south',
 'africa',
 'said',
 'on',
 'saturday',
 'it',
 'was',
 'being',
 'punished',
 'for',
 'its',
 'advanced',
 'ability',
 'to',
 'detect',
 'new',
 'covid-19',
 'variants',
 'early',
 ',',
 'as',
 'travel',
 'bans',
 'and',
 'restrictions',
 'imposed',
 'because',
 'of',
 'the',
 'new',
 'omicron',
 'variant',
 'threaten',
 'to',
 'harm',
 'tourism',
 'and',
 'other',
 'sectors',
 'of',
 'the',
 'economy',
 '.',
 'south',
 'africa',
 'has',
 'some',
 'of',
 'the',
 'world',
 "'s",
 'top',
 'epidemiologists',
 'and',
 'scientists',
 ',',
 'who',
 'have',
 'managed',
 'to',
 'detect',
 'emerging',
 'coronavirus',
 'variants',
 'and',
 'their',
 'mutations',
 'early',
 'on',
 'in',
 'their',
 'life',
 'cycle',
 '.',
 'the',
 'omicron',
 'variant',
 'was',
 'first',
 'discovered',
 'in',
 'south',
 'africa',
 'and',
 'has',
 'since',
 'been',
 'detected',
 'in',
 'belgium',
 ',',
 'botswana',
 ',',
 'israel',
 'and

In [7]:
# Count how many times each token occurs (punctuation and stopwords included)
# and display the resulting frequency distribution.
freq_dist_words = nltk.probability.FreqDist(word_tokens)
freq_dist_words

FreqDist({"''": 7,
          "'s": 2,
          '(': 1,
          ')': 1,
          ',': 13,
          '-': 3,
          '.': 12,
          '27': 1,
          '``': 5,
          'a': 5,
          'ability': 2,
          'added': 1,
          'advanced': 2,
          'africa': 7,
          'african': 1,
          'against': 1,
          'akin': 1,
          'also': 1,
          'an': 1,
          'and': 16,
          'announce': 1,
          'announced': 1,
          'applauded': 1,
          'approach': 1,
          'are': 1,
          'as': 2,
          'bans': 3,
          'be': 1,
          'because': 1,
          'been': 2,
          'being': 1,
          'belgium': 1,
          'body': 1,
          'botswana': 1,
          'business': 1,
          'by': 1,
          'cases': 1,
          'causing': 1,
          'cautioned': 1,
          'concern': 2,
          'cooperation': 1,
          'coronavirus': 1,
          'could': 1,
          'countries': 5,
          'covid-19': 1,
   

In [8]:
# Tabulate the token counts: one row per distinct token, with the token in the
# 'index' column and its count in 'Frequency', ordered from most to least
# frequent. Built as a single chain instead of mutating the frame in place, so
# the cell yields the same frame every time it is run.
# kind='stable' keeps tokens with equal counts in the order they had in the
# frequency table, so ties at the 20-row cut-off are resolved predictably
# rather than by the default unstable sort.
df_freq = (
    pd.DataFrame.from_dict(freq_dist_words, orient='index', columns=['Frequency'])
    .reset_index()
    .sort_values('Frequency', ascending=False, kind='stable')
)

# The 20 most frequent tokens, with the token column given a readable name.
top_20_words = df_freq.head(20).rename(columns={'index': 'Word'})

In [15]:
# Bar chart of the 20 most frequent tokens: one bar per token, bar height and
# colour both driven by the token's count.
fig = (
    px.bar(top_20_words, x='Word', y='Frequency', color='Frequency')
    .update_layout(
        height=800,
        width=1200,
        template='plotly_dark',
        title='Distribution of Top 20 Words From Article',
    )
)
fig.show()

## Stopwords Removal

In [10]:
# Extra tokens to discard that are neither stopwords nor made purely of
# punctuation characters handled below: the tokenizer's quote markers and the
# possessive "'s".
my_punct = ["''", "``", '""', "'s"]

# Build the lookup sets once, outside the filtering pass, so the stopword
# corpus is not re-read for every token and each membership test is O(1).
english_stopwords = set(nltk.corpus.stopwords.words('english'))
punctuation_chars = set(string.punctuation)
extra_tokens = set(my_punct)

# Keep a token only if it is not a stopword, not one of the extra tokens, and
# not made up solely of punctuation characters. Checking every character
# (instead of `word in string.punctuation`, which is a substring test) also
# drops multi-character punctuation tokens such as '...' or '--'.
word_tokens_clean = [
    word
    for word in word_tokens
    if word not in english_stopwords
    and word not in extra_tokens
    and not all(char in punctuation_chars for char in word)
]

print(word_tokens_clean)

['johannesburg', 'nov', '27', 'reuters', 'south', 'africa', 'said', 'saturday', 'punished', 'advanced', 'ability', 'detect', 'new', 'covid-19', 'variants', 'early', 'travel', 'bans', 'restrictions', 'imposed', 'new', 'omicron', 'variant', 'threaten', 'harm', 'tourism', 'sectors', 'economy', 'south', 'africa', 'world', 'top', 'epidemiologists', 'scientists', 'managed', 'detect', 'emerging', 'coronavirus', 'variants', 'mutations', 'early', 'life', 'cycle', 'omicron', 'variant', 'first', 'discovered', 'south', 'africa', 'since', 'detected', 'belgium', 'botswana', 'israel', 'hong', 'kong', 'latest', 'round', 'travel', 'bans', 'akin', 'punishing', 'south', 'africa', 'advanced', 'genomic', 'sequencing', 'ability', 'detect', 'new', 'variants', 'quicker', 'ministry', 'international', 'relations', 'cooperation', 'said', 'excellent', 'science', 'applauded', 'punished', 'said', 'statement', 'many', 'nations', 'rushed', 'friday', 'saturday', 'announce', 'travel', 'curbs', 'south', 'africa', 'count

In [11]:
# Recount token occurrences, this time over the filtered token list only.
freq_dist_words_clean = nltk.probability.FreqDist(word_tokens_clean)

In [12]:
# Tabulate the filtered token counts: one row per distinct token, with the
# token in the 'index' column and its count in 'Frequency', ordered from most
# to least frequent. Built as a single chain instead of mutating the frame in
# place, so the cell yields the same frame every time it is run.
# kind='stable' keeps tokens with equal counts in the order they had in the
# frequency table, so ties at the 20-row cut-off are resolved predictably
# rather than by the default unstable sort.
df_freq_clean = (
    pd.DataFrame.from_dict(freq_dist_words_clean, orient='index', columns=['Frequency'])
    .reset_index()
    .sort_values('Frequency', ascending=False, kind='stable')
)

# The 20 most frequent remaining tokens, with the token column given a
# readable name.
top_20_words_clean = df_freq_clean.head(20).rename(columns={'index': 'Word'})

In [14]:
# Bar chart of the 20 most frequent tokens after stopword/punctuation removal:
# one bar per token, bar height and colour both driven by the token's count.
fig = px.bar(top_20_words_clean, x='Word', y='Frequency', color='Frequency')

# The title states that stopwords were removed so this figure can be told
# apart from the chart of the unfiltered counts.
fig.update_layout(
    height=800,
    width=1200,
    template='plotly_dark',
    title='Distribution of Top 20 Words From Article (Stopwords Removed)',
)
fig.show()
