In [None]:
import nltk
import string
from nltk.tokenize import word_tokenize
import pandas as pd

# Natural Language Processing

## News Category Dataset
Identify the type of news based on headlines and short descriptions

https://www.kaggle.com/rmisra/news-category-dataset

### Load the data

In [None]:
# Load the News Category dataset (one JSON record per line) and keep only the
# TECH category. Later cells convert the document-term matrix to a dense array,
# which is only practical on a subset of the articles.
# NOTE(review): assumes the file has a 'category' column containing 'TECH'
# (as in the Kaggle schema) - confirm against the local copy.
df = pd.read_json('data/News_Category_Dataset_v2.json', lines=True)
df = df[df['category'] == 'TECH'].reset_index(drop=True)
df.head()

In [None]:
# One text per article: "<headline> - <short description>"
df['headline_short_description'] = (
    df['headline'] + ' - ' + df['short_description']
)
# `corpus` keeps the raw texts; `news` is a separate list that the
# following cells clean step by step
corpus = df['headline_short_description'].tolist()
news = list(corpus)
news

In [None]:
# Most frequent tokens in the raw texts (before any cleaning)
from nltk.probability import FreqDist
import itertools
import matplotlib.pyplot as plt

nltk.download('punkt')

# tokenize a separate list so that `news` itself stays untouched
news_chart = [word_tokenize(text) for text in news]

# flatten the per-document token lists into one long list
all_news = list(itertools.chain.from_iterable(news_chart))

# count the tokens and show the 30 most common ones
fdist = FreqDist(all_news)
print(fdist.most_common(30))

# plot the same 30 tokens
fdist.plot(30, cumulative=False)
plt.show()

### Lowercase, remove punctuation, spaces

In [None]:
# Normalise every document: lowercase, remove punctuation, trim whitespace.
# The translation table is the same for every document, so it is built once
# instead of on each loop iteration.
translator = str.maketrans('', '', string.punctuation)
news = [
    # str() guards against non-string entries (e.g. missing values)
    str(text).lower().translate(translator).strip()
    for text in news
]
news

### Tokenize

In [None]:
# nltk.download('punkt')
# Split each cleaned document into word tokens. Entries that are no longer
# strings (already tokenized) are left as they are, so re-running this cell
# does not fail by passing a list to word_tokenize.
news = [word_tokenize(doc) if isinstance(doc, str) else doc for doc in news]
news

### Stopwords

In [None]:
from nltk.corpus import stopwords
# nltk.download('stopwords')

# for a more complete list of stopwords and in other languages: https://www.ranks.nl/stopwords
# print the stop words
stop_words = set(stopwords.words('english'))
print(stop_words)

# drop every token that is a stop word (the list is updated in place)
news[:] = [
    [token for token in doc if token not in stop_words]
    for doc in news
]
news

### Frequency Distribution

In [None]:
from nltk.probability import FreqDist
import itertools
import matplotlib.pyplot as plt

# Token frequencies after cleaning: flatten the per-document token lists ...
all_news = list(itertools.chain.from_iterable(news))

# ... count them and show the 30 most common tokens
fdist = FreqDist(all_news)
print(fdist.most_common(30))

# plot of the same 30 tokens
fdist.plot(30, cumulative=False)
plt.show()

### Stemming

In [None]:
# Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Show the Porter stems next to the original tokens for the first documents.
# Slicing (rather than indexing with range(5)) avoids an IndexError when
# fewer than five documents are available.
for doc_tokens in news[:5]:
    stemmed_words = [stemmer.stem(word) for word in doc_tokens]

    print('Original sentence:', doc_tokens)
    print('Stemmed sentence:', stemmed_words)
    print('')

### Lemmatization

In [None]:
# Lemmatization

from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('wordnet')

wordnet_lemmatize = WordNetLemmatizer()

# Show the WordNet lemmas next to the original tokens for the first documents.
# Slicing (rather than indexing with range(5)) avoids an IndexError when
# fewer than five documents are available.
for doc_tokens in news[:5]:
    lemmatized_words = [wordnet_lemmatize.lemmatize(word) for word in doc_tokens]

    print('Original sentence:', doc_tokens)
    print('Lemmatized sentence:', lemmatized_words)
    print('')

In [None]:
# Stemming x Lemmatization
# Run both normalisations on the same plural nouns to compare their output.

stemmer = PorterStemmer()
wordnet_lemmatize = WordNetLemmatizer()

sentence = 'cities wolves children'
tokens = nltk.word_tokenize(sentence)

stemmed_words = [stemmer.stem(word) for word in tokens]
lemmatized_words = [wordnet_lemmatize.lemmatize(word) for word in tokens]

print('Original sentence:', tokens)
print('Stemmed sentence:', stemmed_words)
print('Lemmatized sentence:', lemmatized_words)
print('')

In [None]:
# POS Tagging
# nltk.download('averaged_perceptron_tagger')

# Alphabetical list of the part-of-speech tags:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

# tag the first raw (uncleaned) text of the corpus
sentence = corpus[0]
tokens = nltk.word_tokenize(sentence)
print(tokens)

tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)


In [None]:
# Named-entity recognition

# nltk.download('maxent_ne_chunker')
# nltk.download('words')

from nltk import word_tokenize, pos_tag, ne_chunk

sentence = 'Prime Minister Justin Trudeau campaigns in Montreal riding of Outremont ahead of byelection'

# the chunker works on POS-tagged tokens: tokenize -> tag -> chunk
tagged_sentence = pos_tag(word_tokenize(sentence))
print(ne_chunk(tagged_sentence))

In [None]:
# Synonyms

from nltk.corpus import wordnet

# print name, synonyms, definition and examples of every synset of 'small'
for synset in wordnet.synsets('small'):
    fields = (
        ('Name:', synset.name()),
        ('Synonyms:', synset.lemma_names()),
        ('Definition:', synset.definition()),
        ('Examples:', synset.examples()),
    )
    for label, value in fields:
        print(label, value)
    print()

# Sklearn - Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words with default settings: one row per text, one column per term
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0)
df_counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# total count of each term over all texts, most frequent first
df_count = df_counts.sum(axis=0).sort_values(ascending=False).reset_index()
df_count


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with default settings: one row per text, one column per term
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0)
df_counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# summed TF-IDF weight of each term over all texts, highest first
# (the variable names are kept from the count cell, but hold TF-IDF values here)
df_count = df_counts.sum(axis=0).sort_values(ascending=False).reset_index()
df_count


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Counts of unigrams and bigrams: lowercased, accents stripped, English stop
# words removed, and only terms that occur in at least 10 texts
vectorizer = CountVectorizer(lowercase = True, strip_accents = 'ascii', stop_words = 'english', ngram_range = (1,2), min_df = 10)
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0)
df_counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# total count per term, most frequent first, with its rank (1 = most frequent)
df_count = (
    df_counts.sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis('word')
    .reset_index(name='count')
)
df_count['rank_count'] = df_count['count'].rank(ascending=False)
df_count


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF of unigrams and bigrams: lowercased, accents stripped, English stop
# words removed, and only terms that occur in at least 10 texts
vectorizer = TfidfVectorizer(lowercase = True, strip_accents = 'ascii', stop_words = 'english', ngram_range = (1,2), min_df = 10)
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0)
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Texts with the highest weight for 'zuckerberg'. A bare expression in the
# middle of a cell is discarded, so it has to be displayed explicitly.
# NOTE(review): assumes 'zuckerberg' is in the fitted vocabulary - a KeyError
# here means the term did not pass the min_df filter.
display(df_tfidf.sort_values('zuckerberg', ascending=False).head())
# summed TF-IDF weight per term, highest first, with its rank (1 = highest)
df_tfidf = (
    df_tfidf.sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis('word')
    .reset_index(name='tfidf')
)
df_tfidf['rank_tfidf'] = df_tfidf['tfidf'].rank(ascending=False)
df_tfidf


In [None]:
# Put the count rank and the TF-IDF rank of every term side by side
df_compare = pd.merge(df_count, df_tfidf, on='word', how='left')
# absolute difference between the two ranks
df_compare['diff'] = (df_compare['rank_tfidf'] - df_compare['rank_count']).abs()
# keep the terms ranked below 50 by both measures, largest disagreement first
df_compare = df_compare[
    (df_compare['rank_tfidf'] < 50) & (df_compare['rank_count'] < 50)
]
df_compare.sort_values('diff', ascending=False)