# **Text Classification using TF-IDF**

In [1]:
%load_ext watermark
%watermark -a 'Navin Kumar M 20BAI1094' -v -p sklearn,pandas

Author: Navin Kumar M 20BAI1094

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.34.0

sklearn: 1.2.0
pandas : 2.0.3



In [2]:
import os 
import pandas as pd
import numpy as np
import chardet
import warnings

from sklearn.feature_extraction.text import (TfidfVectorizer, CountVectorizer)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.naive_bayes import MultinomialNB
 
warnings.filterwarnings("ignore")
set_config(display="diagram")

In [3]:
# Classification Dataset
neg_dataset_path = 'data/rt-polarity.neg.txt'
pos_dataset_path = 'data/rt-polarity.pos.txt'

# read and return the dataset as pandas dataframe
def dataset_return(path: str) -> pd.DataFrame:
    # Check the encoding of the file
    with open(neg_dataset_path, 'rb') as f:
        result = chardet.detect(f.read(3500))
    
    return pd.read_csv(
        neg_dataset_path, encoding=result['encoding'], delimiter='\0', header=None, names=['text']
    ) 

neg_dataset = dataset_return(path=neg_dataset_path)
pos_dataset = dataset_return(path=pos_dataset_path)

In [4]:
# combine the two datasets
neg_dataset['label'] = 0
pos_dataset['label'] = 1

dataset = pd.concat(
    [neg_dataset, pos_dataset], ignore_index=True
    ).sample(frac=1).reset_index(drop=True)

dataset.head()

Unnamed: 0,text,label
0,eight legged freaks won't join the pantheon of...,0
1,a supernatural mystery that doesn't know wheth...,1
2,comes . . . uncomfortably close to coasting in...,1
3,irwin is a man with enough charisma and audaci...,1
4,just too silly and sophomoric to ensnare its t...,0


In [8]:
def tt_split(dataset):
    return train_test_split(
        dataset['text'],
        dataset['label'],
        test_size=0.2,
        stratify=dataset['label'], # Equal Splits
    ) # [X_train, X_text, y_train, y_test] 

X_train, X_test, y_train, y_test = tt_split(dataset)

In [9]:
def generate_report(clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_text)
    print(
        classification_report(y_test, y_pred)
    )

## **CountVectorizer() --- RandomForest()**

In [10]:
clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),
    ('radom_forest', RandomForestClassifier())
])

generate_report(clf)

              precision    recall  f1-score   support

           0       0.50      0.50      0.50      1067
           1       0.50      0.51      0.50      1066

    accuracy                           0.50      2133
   macro avg       0.50      0.50      0.50      2133
weighted avg       0.50      0.50      0.50      2133



## **CountVectorizer() --- NavieBayes()**

In [11]:
clf = Pipeline([
    ('vectorizer_bigrams', CountVectorizer(ngram_range = (1, 2))),
    ('Multi NB', MultinomialNB())
])

generate_report(clf)

              precision    recall  f1-score   support

           0       0.49      0.50      0.50      1067
           1       0.49      0.48      0.49      1066

    accuracy                           0.49      2133
   macro avg       0.49      0.49      0.49      2133
weighted avg       0.49      0.49      0.49      2133



## **Term Frequency – Inverse Document Frequency**

> __TF-IDF__ is the product of the __TF__ and __IDF__ scores of the term.<br><br> $$\text{TF-IDF}=\frac{\text{TF}}{\text{IDF}}$$<br>

> __Term Frequency :__ This summarizes how often a given word appears within a document.

$$\text{TF} = \frac{\text{Number of times the term appears in the doc}}{\text{Total number of words in the doc}}$$<br><br>
> __Inverse Document Frequency:__ This downscales words that appear a lot across documents. A term has a high IDF score if it appears in a few documents. Conversely, if the term is very common among documents (i.e., “the”, “a”, “is”), the term would have a low IDF score.<br>

$$\text{IDF} = \ln\left(\frac{\text{Number of docs}}{\text{Number docs the term appears in}} \right)$$<br>

> TF-IDF are word frequency scores that try to highlight words that are more interesting, e.g. frequent in a document but not across documents. The higher the TFIDF score, the rarer the term is. For instance, in a Mortgage complaint the word _mortgage_ would be mentioned fairly often. However, if we look at other complaints, _mortgage_ probably would not show up in many of them. We can infer that _mortgage_ is most probably an important word in Mortgage complaints as compared to the other products. Therefore, _mortgage_ would have a high TF-IDF score for Mortgage complaints.

TfidfVectorizer class can be initialized with the following parameters:
* __min_df__: remove the words from the vocabulary which have occurred in less than ‘min_df’ number of files.
* __max_df__: remove the words from the vocabulary which have occurred in more than _‘max_df’ * total number of files in corpus_.
* __sublinear_tf__: set to True to scale the term frequency in logarithmic scale.
* __stop_words__: remove the predefined stop words in 'english'.
* __use_idf__: weight factor must use inverse document frequency.
* __ngram_range__: (1, 2) to indicate that unigrams and bigrams will be considered.

## **TF-IDF() --- RandomForest()**

In [12]:
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer(
        sublinear_tf=True,
        ngram_range=(1, 2),
    )),
    ('Random Forest', RandomForestClassifier())
])

generate_report(clf)

              precision    recall  f1-score   support

           0       0.50      0.52      0.51      1067
           1       0.50      0.48      0.49      1066

    accuracy                           0.50      2133
   macro avg       0.50      0.50      0.50      2133
weighted avg       0.50      0.50      0.50      2133



## **TF-IDF --- RandomForest()  {Preprocessed Data}**
> Pre-processing by removing stop words, punctuations and apply lemmatization

In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return ''.join(filtered_tokens)

dataset['text'] = dataset['text'].apply(preprocess)

In [18]:
X_train, X_text, y_train, y_text = tt_split(dataset)

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer(
        sublinear_tf=True,
        ngram_range=(1, 2),
    )),
    ('Random Forest', RandomForestClassifier())
])

generate_report(clf)

              precision    recall  f1-score   support

           0       0.52      0.62      0.56      1067
           1       0.52      0.41      0.46      1066

    accuracy                           0.52      2133
   macro avg       0.52      0.52      0.51      2133
weighted avg       0.52      0.52      0.51      2133

