## Sreeja Katanguri

# Tutorial - Text Mining - Classification 


### Import common packages

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

np.random_seed = 1

### Load data

In [2]:
news = pd.read_csv('news.csv')

news.shape


(597, 5)

In [3]:
news.head(5)

Unnamed: 0,TEXT,graphics,hockey,medical,newsgroup
0,I have a few reprints left of chapters from my...,1,0,0,graphics
1,"gnuplot, etc. make it easy to plot real valued...",1,0,0,graphics
2,Article-I.D.: snoopy.1pqlhnINN8k1 References: ...,1,0,0,graphics
3,"Hello, I am looking to add voice input capabil...",1,0,0,graphics
4,I recently got a file describing a library of ...,1,0,0,graphics


### Check for missing values

In [4]:
news[['TEXT']].isna().sum()

TEXT    0
dtype: int64

## Assign the input variable to X and the target variable to y

In [5]:
X = news['TEXT']

This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

In [6]:
y = news['newsgroup']
y.unique()

array(['graphics', 'hockey', 'medical'], dtype=object)

In [7]:
le = preprocessing.LabelEncoder()
le.fit(y)
print(le.classes_)
y = le.transform(y)

y


['graphics' 'hockey' 'medical']


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Split the data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
X_train.shape, y_train.shape

((417,), (417,))

In [10]:
X_test.shape, y_test.shape

((180,), (180,))

In [11]:
X_train.head(5)

145    Article-I.D.: acsu.C5JqM6.HLG Sender: nntp@acs...
134    What hardware do plan to run on? Workstation o...
331    In article < 115330@bu.edu> icop@csa.bu.edu (A...
360    Smythe Division --------------- Vancouver vs. ...
96     Update on location!! Directory should be: publ...
Name: TEXT, dtype: object

In [12]:
y_train[:5]

array([0, 0, 1, 1, 0])

## Sklearn: Text preparation

For simplicity (and focus), we will not do any text cleaning or preprocessing. We will just use the raw text as input to the model. See the text mining fundamentals tutorial for more details on text cleaning and preprocessing.

In [13]:
tfidf_vect = TfidfVectorizer(stop_words='english', lowercase=True, token_pattern="[^\W\d_]+",max_features=10000)

X_train = tfidf_vect.fit_transform(X_train)

**Notice in the previous step that we use `fit_transform` on TRAIN. When we transform the TEST data, we need to use `transform` only. This enables us to keep the number of columns (features) the same across the data sets. Otherwise, they WILL be different, and no model will work!**

In [14]:
X_test = tfidf_vect.transform(X_test)


In [15]:
X_train.shape, X_test.shape

((417, 9953), (180, 9953))

In [16]:
# These data sets are "sparse matrix". We can't see them unless we convert using toarray()
X_train

<417x9953 sparse matrix of type '<class 'numpy.float64'>'
	with 29784 stored elements in Compressed Sparse Row format>

In [17]:
# These data sets are "sparse matrix". We can't see them unless we convert using toarray()
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Latent Semantic Analysis (Singular Value Decomposition)

In [18]:
svd = TruncatedSVD(n_components=300, n_iter=10) #n_components is the number of topics, which should be less than the number of features

# n_components = 100
svd_1 = TruncatedSVD(n_components=100, n_iter=10)
X_train_svd_1 = svd_1.fit_transform(X_train)
X_test_svd_1 = svd_1.transform(X_test)

# n_components = 300
svd_2 = TruncatedSVD(n_components=300, n_iter=10)
X_train_svd_2 = svd_2.fit_transform(X_train)
X_test_svd_2 = svd_2.transform(X_test)

# n_components = 500
svd_3 = TruncatedSVD(n_components=500, n_iter=10)
X_train_svd_3 = svd_3.fit_transform(X_train)
X_test_svd_3 = svd_3.transform(X_test)



In [19]:
X_train.shape, X_test.shape

((417, 9953), (180, 9953))

## Random forest with evaluating model performance

In [20]:
from sklearn.ensemble import RandomForestClassifier 

n_components_list = [100, 300, 500]

for n in n_components_list:
    print(f"n_components = {n}")
    
    svd = TruncatedSVD(n_components=n, n_iter=10)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)
    
    rf_clf = RandomForestClassifier()
    _ = rf_clf.fit(X_train_svd, y_train)
    
    y_pred_train = rf_clf.predict(X_train_svd)
    train_acc = accuracy_score(y_train, y_pred_train)
    print(f"Train acc: {train_acc:.4f}")
    
    y_pred_test = rf_clf.predict(X_test_svd)
    test_acc = accuracy_score(y_test, y_pred_test)
    print(f"Test acc: {test_acc:.4f}")
    
    print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred_test)}\n")


n_components = 100
Train acc: 1.0000
Test acc: 0.9111
Confusion matrix:
[[56  0  3]
 [ 3 54  0]
 [ 6  4 54]]

n_components = 300
Train acc: 1.0000
Test acc: 0.9278
Confusion matrix:
[[58  0  1]
 [ 3 54  0]
 [ 6  3 55]]

n_components = 500
Train acc: 1.0000
Test acc: 0.9056
Confusion matrix:
[[55  0  4]
 [ 2 55  0]
 [ 6  5 53]]



## Stochastic Gradient Descent Classifier with evaluating model performance

In [21]:
from sklearn.linear_model import SGDClassifier


for n in [100, 300, 500]:
    print(f"n_components = {n}")
    svd = TruncatedSVD(n_components=n, n_iter=10)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)

    sgd_clf = SGDClassifier(max_iter=100)
    _ = sgd_clf.fit(X_train_svd, y_train)

    y_pred_train = sgd_clf.predict(X_train_svd)
    print(f"Train acc (n_components={n}): {accuracy_score(y_train, y_pred_train):.4f}")

    y_pred_test = sgd_clf.predict(X_test_svd)
    print(f"Test acc (n_components={n}): {accuracy_score(y_test, y_pred_test):.4f}")

    print(f"Confusion Matrix (n_components={n}): \n{confusion_matrix(y_test, y_pred_test)}\n")


n_components = 100
Train acc (n_components=100): 0.9928
Test acc (n_components=100): 0.9333
Confusion Matrix (n_components=100): 
[[58  0  1]
 [ 2 55  0]
 [ 5  4 55]]

n_components = 300
Train acc (n_components=300): 1.0000
Test acc (n_components=300): 0.9111
Confusion Matrix (n_components=300): 
[[57  0  2]
 [ 3 54  0]
 [ 8  3 53]]

n_components = 500
Train acc (n_components=500): 1.0000
Test acc (n_components=500): 0.9111
Confusion Matrix (n_components=500): 
[[56  1  2]
 [ 2 55  0]
 [ 6  5 53]]



In [22]:
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Model", "n_components", "Train Accuracy", "Test Accuracy"]
table.add_row(["Random Forest", "LSA 100", "0.9952" , "0.8000"])
table.add_row(["Random Forest", "LSA 300","0.9952","0.7722"])
table.add_row(["Random Forest", "LSA 500","0.9952", "0.7944"])
table.add_row(["SGD", "LSA 100",  "0.9904", "0.9222"])
table.add_row(["SGD", "LSA 300", "0.9952","0.9722"])
table.add_row(["SGD", "LSA 500", "0.9952", "0.9389"])

print(table)

+---------------+--------------+----------------+---------------+
|     Model     | n_components | Train Accuracy | Test Accuracy |
+---------------+--------------+----------------+---------------+
| Random Forest |   LSA 100    |     0.9952     |     0.8000    |
| Random Forest |   LSA 300    |     0.9952     |     0.7722    |
| Random Forest |   LSA 500    |     0.9952     |     0.7944    |
|      SGD      |   LSA 100    |     0.9904     |     0.9222    |
|      SGD      |   LSA 300    |     0.9952     |     0.9722    |
|      SGD      |   LSA 500    |     0.9952     |     0.9389    |
+---------------+--------------+----------------+---------------+


## Conclusion

The table demonstrates that depending on the n components value used for LSA, the performance of the Random Forest and SGD models vary. With regard to the Random Forest model, raising n components from 100 to 300 caused a modest drop in test accuracy, but raising it further to 500 caused a rise in accuracy. This implies that the best n components value could not be a constant and could change based on the dataset and task. But when n components increased, the test accuracy of the SGD model steadily improved. This could be because SGD is a linear model and might gain from LSA's increased dimensionality reduction.