# Text classification for 20Newsgroup dataset
## Stemming VS Lemmatization  using TF-IDF

By - Gargi Mishra

M.Tech Computer Science and Technology

SCSS, JNU

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk

### 1) Importing dataset
#### Training data

In [2]:
#Loading the data set - training data.
from sklearn.datasets import fetch_20newsgroups
mydata_train = fetch_20newsgroups(subset='train', shuffle=True, remove = ('headers', 'footers', 'quotes'))

In [3]:
print('Training data size:', len(mydata_train['data']))

Training data size: 11314


In [4]:
len(mydata_train['target'])

11314

In [5]:
# Printing all the categories
mydata_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Finding frequency of each category
targets, frequency = np.unique(mydata_train.target, return_counts=True)
targets, frequency 

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([480, 584, 591, 590, 578, 593, 585, 594, 598, 597, 600, 595, 591,
        594, 593, 599, 546, 564, 465, 377], dtype=int64))

#### Test data

In [7]:
#Loading the dataset - test data
mydata_test = fetch_20newsgroups(subset='test', shuffle=True, remove = ('headers', 'footers', 'quotes'))

In [8]:
print('Testing data size:', len(mydata_test['data']))

Testing data size: 7532


In [9]:
# Finding frequency of each category
targets_test, frequency_test = np.unique(mydata_test.target, return_counts=True)
targets_test, frequency_test

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([319, 389, 394, 392, 385, 395, 390, 396, 398, 397, 399, 396, 393,
        396, 394, 398, 364, 376, 310, 251], dtype=int64))

### 2) Data preprocessing

In [10]:
#Train data
mydata_train_df = pd.DataFrame({'data': mydata_train.data, 'target': mydata_train.target})

#Test data
mydata_test_df = pd.DataFrame({'data': mydata_test.data, 'target': mydata_test.target})

mydata_train_df.head()

Unnamed: 0,data,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [11]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

mydata_train_df['data'] = mydata_train_df.data.map(alphanumeric).map(punc_lower)
mydata_test_df = pd.DataFrame({'data': mydata_test.data, 'target': mydata_test.target})

#### TF-IDF vectorizer

In [12]:
# Creating a document-term matrix using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfV = TfidfVectorizer(stop_words='english') 

X_train_tfidfV = tfidfV.fit_transform(mydata_train_df.data) 
X_test_tfidfV = tfidfV.transform(mydata_test_df.data) 

print(X_train_tfidfV.shape)

(11314, 67822)


## Model building

### I. Not using stemming and lemmatization

In [13]:
# Use a Naive Bayes model 
from sklearn.naive_bayes import MultinomialNB 

mnb = MultinomialNB() 

# Train the model 
y_train = mydata_train_df.target
mnb.fit(X_train_tfidfV, y_train) 

y_pred_mnb = mnb.predict(X_test_tfidfV) 

In [14]:
from sklearn.metrics import classification_report, accuracy_score

y_test = mydata_test_df.target
print(classification_report(y_test, y_pred_mnb))
print(accuracy_score(y_test, y_pred_mnb))

              precision    recall  f1-score   support

           0       0.78      0.19      0.31       319
           1       0.66      0.67      0.67       389
           2       0.66      0.60      0.63       394
           3       0.56      0.73      0.63       392
           4       0.75      0.64      0.69       385
           5       0.80      0.76      0.78       395
           6       0.80      0.75      0.77       390
           7       0.84      0.71      0.77       396
           8       0.85      0.73      0.79       398
           9       0.92      0.81      0.86       397
          10       0.57      0.94      0.71       399
          11       0.59      0.80      0.68       396
          12       0.69      0.52      0.59       393
          13       0.88      0.76      0.81       396
          14       0.78      0.75      0.76       394
          15       0.38      0.92      0.54       398
          16       0.57      0.72      0.64       364
          17       0.82    

### II. Using stemming

In [15]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
stemmer = SnowballStemmer('english')
    
def stem_text(text):
    return(" ".join([stemmer.stem(w) for w in w_tokenizer.tokenize(text)]))

mydata_train_df["data_stemmed"] = mydata_train_df.data.apply(stem_text)
mydata_test_df["data_stemmed"] = mydata_test_df.data.apply(stem_text)

stfidfV = TfidfVectorizer(stop_words='english') 

X_train_stfidfV = stfidfV.fit_transform(mydata_train_df.data_stemmed)
X_test_stfidfV = stfidfV.transform(mydata_test_df.data_stemmed)

In [16]:
# Train the model 

mnb.fit(X_train_stfidfV, y_train) 

y_pred_mnb_s = mnb.predict(X_test_stfidfV) 

In [17]:
print(classification_report(y_test, y_pred_mnb_s))
print(accuracy_score(y_test, y_pred_mnb_s))

              precision    recall  f1-score   support

           0       0.77      0.14      0.24       319
           1       0.64      0.65      0.64       389
           2       0.68      0.55      0.61       394
           3       0.56      0.73      0.63       392
           4       0.75      0.64      0.69       385
           5       0.80      0.77      0.78       395
           6       0.80      0.68      0.74       390
           7       0.83      0.72      0.77       396
           8       0.83      0.72      0.77       398
           9       0.91      0.78      0.84       397
          10       0.57      0.93      0.70       399
          11       0.54      0.79      0.64       396
          12       0.71      0.51      0.59       393
          13       0.87      0.74      0.80       396
          14       0.78      0.73      0.76       394
          15       0.35      0.91      0.51       398
          16       0.54      0.69      0.60       364
          17       0.83    

### III. Using lemmatization

In [18]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return(" ".join([lemmatizer.lemmatize(w,"v") for w in w_tokenizer.tokenize(text)]))

mydata_train_df["data_lemmatized"] = mydata_train_df.data.apply(lemmatize_text)
mydata_test_df["data_lemmatized"] = mydata_test_df.data.apply(lemmatize_text)

ltfidfV = TfidfVectorizer(stop_words='english') 

X_train_ltfidfV = ltfidfV.fit_transform(mydata_train_df.data_lemmatized)
X_test_ltfidfV = ltfidfV.transform(mydata_test_df.data_lemmatized)

In [19]:
mnb.fit(X_train_ltfidfV, y_train) 

y_pred_mnb_l = mnb.predict(X_test_ltfidfV) 

In [20]:
print(classification_report(y_test, y_pred_mnb_l))
print(accuracy_score(y_test, y_pred_mnb_l))

              precision    recall  f1-score   support

           0       0.78      0.18      0.29       319
           1       0.68      0.67      0.67       389
           2       0.68      0.59      0.63       394
           3       0.57      0.70      0.63       392
           4       0.76      0.66      0.71       385
           5       0.79      0.76      0.77       395
           6       0.80      0.73      0.76       390
           7       0.82      0.72      0.77       396
           8       0.83      0.71      0.77       398
           9       0.92      0.79      0.85       397
          10       0.57      0.93      0.70       399
          11       0.58      0.81      0.67       396
          12       0.69      0.53      0.60       393
          13       0.87      0.76      0.81       396
          14       0.80      0.76      0.78       394
          15       0.38      0.92      0.53       398
          16       0.54      0.71      0.62       364
          17       0.80    