# Text Classification Using TF-IDF, LSA, and Random Forest
**Author:** Giovanna Cardenas  
**Description:** This notebook performs binary text classification to distinguish automotive posts from electronics posts in a corpus of newsgroup messages. It applies two preprocessing strategies—one with stemming and one without—to compare their impact on model performance. The classification pipeline uses TF-IDF vectorization, Latent Semantic Analysis (LSA), and a Random Forest classifier.

In [186]:
# Load Packages
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from dmba import printTermDocumentMatrix, classificationSummary
nltk.download('punkt')
import random
random.seed(10)
np.random.seed(10)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giovannacardenas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Process without Stemming

In [189]:
# Read every file in the zip archive into a document corpus and build the
# matching label vector (1 for autos, 0 for electronics).
# The 'ns' ("no stemming") prefix marks data that will skip the stemming step.
ns_corpus, ns_label = [], []
with ZipFile('AutoAndElectronics.zip') as archive:
    for member in archive.infolist():
        if not member.is_dir():
            # Directory entries hold no text; only real files become documents.
            ns_label.append(int('rec.autos' in member.filename))
            ns_corpus.append(archive.read(member))

# Preprocessing (tokenization and stop-word removal, without stemming)
class SimpleTokenizer(object):
    """Callable tokenizer that keeps alphabetic, non-stop-word tokens as-is."""

    def __init__(self):
        # Stored as a set so each membership test is a hash lookup.
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the tokens of ``doc`` that are alphabetic and not stop words."""
        kept = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                continue
            # Stop words are matched case-insensitively; the token itself is
            # returned unchanged.
            if token.lower() in self.stopWords:
                continue
            kept.append(token)
        return kept

# Count term occurrences per document with the non-stemming tokenizer.
# The corpus holds raw bytes, which the vectorizer decodes as latin1.
ns_preprocessor = CountVectorizer(
    encoding='latin1',
    tokenizer=SimpleTokenizer(),
)
ns_preprocessedText = ns_preprocessor.fit_transform(ns_corpus)

In [190]:
# Term-document matrix: one row per term, one column per document
ns_td = pd.DataFrame(
    ns_preprocessedText.todense(),
    columns=ns_preprocessor.get_feature_names_out(),
)
ns_term_document_matrix = ns_td.T
ns_term_document_matrix.columns = [
    f'Sentence {doc_no}' for doc_no in range(1, ns_td.shape[0] + 1)
]
# Corpus-wide frequency of each term, used only for ranking
ns_term_document_matrix['total_count'] = ns_term_document_matrix.sum(axis=1)

# Keep the 25 most frequent terms
ns_term_document_matrix = ns_term_document_matrix.sort_values(
    by='total_count', ascending=False
).head(25)

# Print the first 5 rows (ranking column hidden)
print(ns_term_document_matrix.drop(columns=['total_count']).head(5))

            Sentence 1  Sentence 2  Sentence 3  Sentence 4  Sentence 5  \
subject              2           1           1           2           1   
lines                1           1           1           1           1   
apr                  1           1           1           1           1   
date                 1           1           1           1           1   
newsgroups           1           1           1           1           1   

            Sentence 6  Sentence 7  Sentence 8  Sentence 9  Sentence 10  ...  \
subject              1           1           1           1            1  ...   
lines                1           1           1           1            1  ...   
apr                  1           1           1           1            1  ...   
date                 1           1           1           1            1  ...   
newsgroups           1           1           1           1            1  ...   

            Sentence 1991  Sentence 1992  Sentence 1993  Sentence 1994  \


In [191]:
# Check shape of the count matrix built without stemming:
# (number of documents, number of distinct terms)
ns_preprocessedText.shape

(2000, 18721)

### Process with Stemming

In [193]:
# Re-read the zip archive to build a separate corpus and label vector
# (1 for autos, 0 for electronics) for the stemming workflow.
corpus, label = [], []
with ZipFile('AutoAndElectronics.zip') as archive:
    for member in archive.infolist():
        if not member.is_dir():
            # Directory entries hold no text; only real files become documents.
            label.append(int('rec.autos' in member.filename))
            corpus.append(archive.read(member))

# Preprocessing (tokenization, stop-word removal, and stemming)
class LemmaTokenizer(object):
    """Callable tokenizer that drops stop words and stems the remaining tokens.

    Despite the class name, tokens are reduced with the Snowball
    ``EnglishStemmer`` (stemming); no lemmatization is performed.
    """

    def __init__(self):
        self.stemmer = EnglishStemmer()
        # Stored as a set so each membership test is a hash lookup.
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the stems of the alphabetic, non-stop-word tokens in ``doc``."""
        # Compare in lower case (as SimpleTokenizer does) so capitalised stop
        # words such as "The" are filtered even when the text reaching this
        # tokenizer has not been lower-cased beforehand.
        return [self.stemmer.stem(t) for t in word_tokenize(doc)
                if t.isalpha() and t.lower() not in self.stopWords]

# Count term occurrences per document with the stemming tokenizer.
# The corpus holds raw bytes, which the vectorizer decodes as latin1.
preprocessor = CountVectorizer(
    encoding='latin1',
    tokenizer=LemmaTokenizer(),
)
preprocessedText = preprocessor.fit_transform(corpus)

In [197]:
# Term-document matrix: one row per stemmed term, one column per document
td = pd.DataFrame(
    preprocessedText.todense(),
    columns=preprocessor.get_feature_names_out(),
)
term_document_matrix = td.T
term_document_matrix.columns = [
    f'Sentence {doc_no}' for doc_no in range(1, td.shape[0] + 1)
]
# Corpus-wide frequency of each term, used only for ranking
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

# Keep the 25 most frequent terms
term_document_matrix = term_document_matrix.sort_values(
    by='total_count', ascending=False
).head(25)

# Print the first 5 rows (ranking column hidden)
print(term_document_matrix.drop(columns=['total_count']).head(5))

           Sentence 1  Sentence 2  Sentence 3  Sentence 4  Sentence 5  \
line                1           1           1           1           2   
subject             2           1           1           2           1   
car                12           1           0           0           5   
apr                 1           1           1           1           1   
newsgroup           1           1           1           1           1   

           Sentence 6  Sentence 7  Sentence 8  Sentence 9  Sentence 10  ...  \
line                1           1           1           1            1  ...   
subject             1           1           1           1            1  ...   
car                 0           2           3           2            3  ...   
apr                 1           1           1           1            1  ...   
newsgroup           1           1           1           1            1  ...   

           Sentence 1991  Sentence 1992  Sentence 1993  Sentence 1994  \
line         

In [198]:
# Check shape of the count matrix built with stemming:
# (number of documents, number of distinct stemmed terms)
preprocessedText.shape

(2000, 13516)

### TF-IDF + LSA without Stemming

In [200]:
# Compute TF-IDF weights from the non-stemmed term counts
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(ns_preprocessedText)

# Extract 10 concepts using LSA (truncated SVD followed by row normalization)
svd = TruncatedSVD(10, random_state=10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)

# NOTE(review): TF-IDF and LSA are fitted on all documents before the split,
# so the held-out documents influence the feature space. Labels are not used
# in that step, but fitting on the training split only would give a stricter
# estimate of test accuracy.

# Split dataset into training (60%) and testing (40%) sets.
# ns_label is the label vector built together with ns_corpus, so this cell
# depends only on the non-stemming data.
Xtrain, Xtest, ytrain, ytest = train_test_split(lsa_tfidf, ns_label, test_size=0.4, random_state=10)

# Run Random Forest Classifier model on training
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5)
rf_classifier.fit(Xtrain, ytrain)

# Print confusion matrix and accuracy for df without stemming
classificationSummary(ytest, rf_classifier.predict(Xtest))

Confusion Matrix (Accuracy 0.9575)

       Prediction
Actual   0   1
     0 393  13
     1  21 373


### TF-IDF + LSA with Stemming

In [202]:
# Compute TF-IDF weights from the stemmed term counts
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

# Extract 10 concepts using LSA (truncated SVD followed by row normalization)
svd = TruncatedSVD(10, random_state=10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)

# NOTE(review): TF-IDF and LSA are fitted on all documents before the split,
# so the held-out documents influence the feature space. Labels are not used
# in that step, but fitting on the training split only would give a stricter
# estimate of test accuracy.

# Split dataset into training (60%) and testing (40%) sets
Xtrain, Xtest, ytrain, ytest = train_test_split(lsa_tfidf, label, test_size=0.4, random_state=10)

# Run Random Forest Classifier model on training
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5)
rf_classifier.fit(Xtrain, ytrain)

# Print confusion matrix and accuracy for df with stemming
classificationSummary(ytest, rf_classifier.predict(Xtest))

Confusion Matrix (Accuracy 0.9613)

       Prediction
Actual   0   1
     0 393  13
     1  18 376


In [208]:
# Stemming reduced the vocabulary by 5,205 terms (from 18,721 to 13,516 columns in the document-term matrix), which helps reduce redundancy. Also, without stemming the accuracy of the model is 95.75%, whereas with stemming it is 96.13%.

# The concept matrix reports what tokens are present, frequent, or infrequent. The TF-IDF measures the importance of each token.