# Imports

In [63]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import eli5
import warnings
warnings.filterwarnings('ignore')

# Input Data & Labels (Data Selection)

In [64]:
# Example list of sentences
sentences = [
    "This is a sentence.",
    "This is another sentence.",
    "Yet another sent.",
    "One more sent for the example.",
    "This is a new example sentence.",
    "Another new sent for testing purposes."
]
labels = [1, 1, 0, 0, 1, 0]  # Example binary labels

# Pre-Processing Part (Sentence Cleaning, Stemming, Lemmatisation, etc.)

In [62]:
# Add According to your data

# Count Vectorizer

In [51]:
# Create CountVectorizer
count_vectorizer = CountVectorizer()
count_vectors = count_vectorizer.fit_transform(sentences)

# Print important details about CountVectorizer
print("CountVectorizer Vocabulary:")
print(count_vectorizer.vocabulary_)
print("CountVectorizer Feature Names:")
print(count_vectorizer.vocabulary_.keys())
print("CountVectorizer Shape:")
print(count_vectors.shape)

CountVectorizer Vocabulary:
{'this': 12, 'is': 3, 'sentence': 9, 'another': 0, 'yet': 13, 'sent': 8, 'one': 6, 'more': 4, 'for': 2, 'the': 11, 'example': 1, 'new': 5, 'testing': 10, 'purposes': 7}
CountVectorizer Feature Names:
dict_keys(['this', 'is', 'sentence', 'another', 'yet', 'sent', 'one', 'more', 'for', 'the', 'example', 'new', 'testing', 'purposes'])
CountVectorizer Shape:
(6, 14)


In [52]:
count_vectors.toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0]])

In [53]:
count_vectorizer.vocabulary_

{'this': 12,
 'is': 3,
 'sentence': 9,
 'another': 0,
 'yet': 13,
 'sent': 8,
 'one': 6,
 'more': 4,
 'for': 2,
 'the': 11,
 'example': 1,
 'new': 5,
 'testing': 10,
 'purposes': 7}

# TF-IDF Vectorizer

In [55]:
# Create TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=8,ngram_range=(2,3))
tfidf_vectors = tfidf_vectorizer.fit_transform(sentences)

# Print important details about TfidfVectorizer
print("\nTfidfVectorizer Vocabulary:")
print(tfidf_vectorizer.vocabulary_)
print("TfidfVectorizer Feature Names:")
print(tfidf_vectorizer.get_feature_names_out())
print("TfidfVectorizer Shape:")
print(tfidf_vectors.shape)


TfidfVectorizer Vocabulary:
{'this is': 4, 'this is sentence': 7, 'this is another': 5, 'sent for': 1, 'the example': 3, 'this is new': 6, 'another new': 0, 'testing purposes': 2}
TfidfVectorizer Feature Names:
['another new' 'sent for' 'testing purposes' 'the example' 'this is'
 'this is another' 'this is new' 'this is sentence']
TfidfVectorizer Shape:
(6, 8)


In [56]:
tfidf_vectors.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.56921261,
        0.        , 0.        , 0.82219037],
       [0.        , 0.        , 0.        , 0.        , 0.56921261,
        0.82219037, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.6340862 , 0.        , 0.77326237, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.56921261,
        0.        , 0.82219037, 0.        ],
       [0.61171251, 0.50161301, 0.61171251, 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

# Train-Test Split (For Best Practice, This part should be done before using vectorizer)

In [57]:
# For Best Practice, This part should be done before using vectorizer 
# Split data for training and testing
X_train_count, X_test_count, y_train, y_test = train_test_split(count_vectors, labels, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(tfidf_vectors, labels, test_size=0.2, random_state=42)

# Trying Models

In [58]:
# Fit logistic regression model
logistic_model_count = LogisticRegression()
logistic_model_count.fit(X_train_count, y_train)

logistic_model_tfidf = LogisticRegression()
logistic_model_tfidf.fit(X_train_tfidf, y_train)

# Getting Prediction on Test Data

In [59]:
# Predict using logistic regression model
y_pred_count = logistic_model_count.predict(X_test_count)
y_pred_tfidf = logistic_model_tfidf.predict(X_test_tfidf)

# Results

In [65]:
# Classification report and confusion matrix
print("\nClassification Report (CountVectorizer):")
print(classification_report(y_test, y_pred_count))
print("Confusion Matrix (CountVectorizer):")
print(confusion_matrix(y_test, y_pred_count))

print("\nClassification Report (TfidfVectorizer):")
print(classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix (TfidfVectorizer):")
print(confusion_matrix(y_test, y_pred_tfidf))


Classification Report (CountVectorizer):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Confusion Matrix (CountVectorizer):
[[0 0]
 [2 0]]

Classification Report (TfidfVectorizer):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Confusion Matrix (TfidfVectorizer):
[[0 0]
 [2 0]]


# Explanation

In [66]:
#CountVectorizer
eli5.show_weights(logistic_model_count, vec=count_vectorizer.vocabulary_,top=10)

Weight?,Feature
+0.378,x9
+0.378,x12
+0.378,x3
+0.259,x5
+0.249,x1
… 5 more negative …,… 5 more negative …
-0.130,x13
-0.248,x2
-0.249,x0
-0.378,x8


In [67]:
#TfIdfVect.
eli5.show_weights(logistic_model_tfidf, vec=tfidf_vectorizer, feature_names= tfidf_vectorizer.get_feature_names_out(),top=10)

Weight?,Feature
0.517,this is new
0.358,this is
-0.119,another new
-0.119,testing purposes
-0.151,the example
-0.222,sent for
-1.159,<BIAS>


In [69]:
# Sample Prediction
eli5.show_prediction(logistic_model_tfidf, doc=sentences[0],vec=tfidf_vectorizer, feature_names= tfidf_vectorizer.get_feature_names_out(),top=10)

Contribution?,Feature
1.159,<BIAS>
-0.204,Highlighted in text (sum)
