In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

In [26]:
# Fetch the NLTK resources the tokenizers below rely on; packages that are
# already up to date are not downloaded again (see the log output).
nltk.download('wordnet')                         # WordNet data for WordNetLemmatizer
nltk.download('punkt_tab')                       # tokenizer data for word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # English POS tagger for nltk.pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/herrakaava/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/herrakaava/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/herrakaava/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

<h3>Read in the data</h3>

In [3]:
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
DATA_PATH = '/Users/herrakaava/Desktop/Github_repos/NLP/data/bbc_text_cls.csv'
# The first CSV column holds the row index rather than a feature.
df = pd.read_csv(DATA_PATH, index_col=0)

In [4]:
# Preview the first five rows to confirm the columns were parsed as expected
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Summarize column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   labels  2225 non-null   object
dtypes: object(2)
memory usage: 52.1+ KB


In [6]:
# For each column, report whether it contains at least one missing value
df.isna().any()

text      False
labels    False
dtype: bool

In [7]:
# Show the full text of a single document: the row labelled 0
first_document = df.loc[0, 'text']
print(first_document)

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AO

<h3>Train-test split</h3>

In [8]:
# Inputs are the raw document texts; targets are the category labels
X, y = df['text'], df['labels']

In [9]:
# Class distribution of the target, as proportions (normalize=True) rather than raw counts
y.value_counts(normalize=True)

labels
sport            0.229663
business         0.229213
politics         0.187416
tech             0.180225
entertainment    0.173483
Name: proportion, dtype: float64

In [10]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:
# Report the size of each split; the empty string yields the blank separator line
print(
    f'Shape of X_train: {X_train.shape}',
    f'Shape of y_train: {y_train.shape}',
    '',
    f'Shape of X_test: {X_test.shape}',
    f'Shape of y_test: {y_test.shape}',
    sep='\n',
)

Shape of X_train: (1780,)
Shape of y_train: (1780,)

Shape of X_test: (445,)
Shape of y_test: (445,)


<h3>CountVectorizer</h3>

In [12]:
def f(Xtrain, Xtest, analyzer='word', stop_words=None, tokenizer=None):
    """Turn raw documents into bag-of-words count matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents, so both matrices share the training vocabulary.

    Parameters
    ----------
    Xtrain : iterable of str
        Training documents; used to learn the vocabulary.
    Xtest : iterable of str
        Test documents; encoded with the vocabulary learned from ``Xtrain``.
    analyzer : default 'word'
        Forwarded unchanged to ``CountVectorizer(analyzer=...)``.
    stop_words : default None
        Forwarded unchanged to ``CountVectorizer(stop_words=...)``.
    tokenizer : callable or None, default None
        Forwarded unchanged to ``CountVectorizer(tokenizer=...)``. ``None``
        keeps the vectorizer's built-in tokenization, which matches the
        behaviour of this function before the parameter existed.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as returned by ``fit_transform`` and
        ``transform`` respectively.
    """
    vectorizer = CountVectorizer(
        analyzer=analyzer,
        stop_words=stop_words,
        tokenizer=tokenizer,
    )
    # Fit on the training split only so no test vocabulary leaks into the features.
    train_counts = vectorizer.fit_transform(Xtrain)
    test_counts = vectorizer.transform(Xtest)
    return train_counts, test_counts

In [13]:
# Baseline features: f's defaults (word analyzer, no stop-word filtering)
X_train_trans, X_test_trans = f(X_train, X_test)

In [14]:
# Shapes are (documents, vocabulary terms); both splits use the vocabulary fitted on the training set
print(X_train_trans.shape)
print(X_test_trans.shape)

(1780, 26762)
(445, 26762)


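- The number of columns tells us that the *training corpus* contains $\, 26762 \,$ unique tokens (with `CountVectorizer`'s default settings, text is lowercased and single-character tokens are dropped).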
- These $\, 26762 \,$ words form the *vocabulary* of the corpus.
- Each document (row) is converted into a vector of counts: the $j$-th entry is the number of times the $j$-th vocabulary word appears in that document.

In [15]:
# Check the container type returned by the vectorizer (the output shows SciPy CSR sparse matrices)
print(type(X_train_trans))
print(type(X_test_trans))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


In [16]:
# Share of matrix cells holding a non-zero count, expressed as a percentage
nonzero_count = (X_train_trans != 0).sum()
total_cells = np.prod(X_train_trans.shape)
(nonzero_count / total_cells) * 100

np.float64(0.7536029201223603)

In [17]:
# Number of values stored in the sparse matrix (its non-zero counts)
X_train_trans.nnz

358989

<h3>Naive Bayes</h3>

In [18]:
# Baseline: multinomial Naive Bayes on the default bag-of-words counts
model = MultinomialNB().fit(X_train_trans, y_train)
train_acc = accuracy_score(y_train, model.predict(X_train_trans))
test_acc = accuracy_score(y_test, model.predict(X_test_trans))
print(f'Training accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Training accuracy: 0.9943820224719101
Test accuracy: 0.9730337078651685


<br>

In [21]:
# English stop words removed from the vocabulary (stop_words='english')
X_train_trans2, X_test_trans2 = f(X_train, X_test, stop_words='english')
model2 = MultinomialNB().fit(X_train_trans2, y_train)
train_acc2 = accuracy_score(y_train, model2.predict(X_train_trans2))
test_acc2 = accuracy_score(y_test, model2.predict(X_test_trans2))
print(f'Training accuracy: {train_acc2}')
print(f'Test accuracy: {test_acc2}')

Training accuracy: 0.9960674157303371
Test accuracy: 0.9730337078651685


<br>

In [22]:
def get_wordnet_pos(treebank_tag):
    """Map a treebank-style POS tag to the corresponding WordNet POS constant.

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb
    respectively. Every other tag, including those starting with 'N', maps
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Noun tags and any unrecognised tag share the same fallback.
    return wordnet.NOUN

In [24]:
class LemmaTokenizer:
    """Callable tokenizer that returns the POS-aware lemma of every token."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, document):
        """Tokenize `document`, POS-tag the tokens and return their lemmas as a list."""
        tagged_tokens = nltk.pos_tag(word_tokenize(document))
        lemmas = []
        for token, tag in tagged_tokens:
            # The tag is translated to a WordNet POS so the lemmatizer can use it.
            lemmas.append(self.wnl.lemmatize(token, pos=get_wordnet_pos(tag)))
        return lemmas

In [32]:
# Lemmatized tokens: LemmaTokenizer is used as the vectorizer's tokenizer
vectorizer3 = CountVectorizer(tokenizer=LemmaTokenizer())
X_train_trans3 = vectorizer3.fit_transform(X_train)
X_test_trans3 = vectorizer3.transform(X_test)
model3 = MultinomialNB().fit(X_train_trans3, y_train)
train_acc3 = accuracy_score(y_train, model3.predict(X_train_trans3))
test_acc3 = accuracy_score(y_test, model3.predict(X_test_trans3))
print(f'Training accuracy: {train_acc3}')
print(f'Test accuracy: {test_acc3}')



Training accuracy: 0.9932584269662922
Test accuracy: 0.9730337078651685


<br>

In [29]:
class StemTokenizer:
    """Callable tokenizer that returns the Porter stem of every token."""

    def __init__(self):
        self.porter = PorterStemmer()

    def __call__(self, document):
        """Tokenize `document` and return the list of stemmed tokens."""
        stem = self.porter.stem
        return list(map(stem, word_tokenize(document)))

In [33]:
# Stemmed tokens: StemTokenizer is used as the vectorizer's tokenizer
vectorizer4 = CountVectorizer(tokenizer=StemTokenizer())
X_train_trans4 = vectorizer4.fit_transform(X_train)
X_test_trans4 = vectorizer4.transform(X_test)
model4 = MultinomialNB().fit(X_train_trans4, y_train)
train_acc4 = accuracy_score(y_train, model4.predict(X_train_trans4))
test_acc4 = accuracy_score(y_test, model4.predict(X_test_trans4))
print(f'Training accuracy: {train_acc4}')
print(f'Test accuracy: {test_acc4}')



Training accuracy: 0.9921348314606742
Test accuracy: 0.9730337078651685


<br>

In [34]:
def simple_tokenizer(s):
    """Split `s` into tokens on runs of whitespace; punctuation stays attached to words."""
    tokens = s.split(sep=None)
    return tokens

In [42]:
# Whitespace-split tokens: simple_tokenizer is used as the vectorizer's tokenizer
vectorizer5 = CountVectorizer(tokenizer=simple_tokenizer)
X_train_trans5 = vectorizer5.fit_transform(X_train)
X_test_trans5 = vectorizer5.transform(X_test)
model5 = MultinomialNB()
# Fit on X_train_trans5; the previous name `X_trans_trans5` was a typo that
# raises NameError on a fresh kernel.
model5.fit(X_train_trans5, y_train)
print(f'Training accuracy: {accuracy_score(y_train, model5.predict(X_train_trans5))}')
print(f'Test accuracy: {accuracy_score(y_test, model5.predict(X_test_trans5))}')

Training accuracy: 0.9977528089887641
Test accuracy: 0.9595505617977528


