#### How to generate n-grams using CountVectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

for n in range(1,4):
    v = CountVectorizer(ngram_range=(1,n))
    v.fit(["Mina is a PhD student and seeking for a job."])
    print('n = ', n)
    print(v.vocabulary_)

n =  1
{'mina': 4, 'is': 2, 'phd': 5, 'student': 7, 'and': 0, 'seeking': 6, 'for': 1, 'job': 3}
n =  2
{'mina': 7, 'is': 4, 'phd': 9, 'student': 13, 'and': 0, 'seeking': 11, 'for': 2, 'job': 6, 'mina is': 8, 'is phd': 5, 'phd student': 10, 'student and': 14, 'and seeking': 1, 'seeking for': 12, 'for job': 3}
n =  3
{'mina': 9, 'is': 5, 'phd': 12, 'student': 18, 'and': 0, 'seeking': 15, 'for': 3, 'job': 8, 'mina is': 10, 'is phd': 6, 'phd student': 13, 'student and': 19, 'and seeking': 1, 'seeking for': 16, 'for job': 4, 'mina is phd': 11, 'is phd student': 7, 'phd student and': 14, 'student and seeking': 20, 'and seeking for': 2, 'seeking for job': 17}



We take a simple collection of text documents, preprocess them by removing stop words, lematize etc and then apply Bag of n-grams, having different values for n.

In [11]:
Texts = [
    "Mina Likes Tahdig",
    "Mina ate Pizza",
    "Yvan is tall"
]

In [12]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [14]:
preprocess("Mina ate tahdig, and also Pizza")

'Mina eat tahdig Pizza'

In [15]:
text_processed = [
    preprocess(text) for text in Texts
]
text_processed

['Mina like Tahdig', 'Mina eat Pizza', 'Yvan tall']

In [18]:
# Apply Bag of n gram when n = 2 , and train the model 
v = CountVectorizer(ngram_range=(1,2))
v.fit(text_processed)
v.vocabulary_

{'mina': 4,
 'like': 2,
 'tahdig': 8,
 'mina like': 6,
 'like tahdig': 3,
 'eat': 0,
 'pizza': 7,
 'mina eat': 5,
 'eat pizza': 1,
 'yvan': 10,
 'tall': 9,
 'yvan tall': 11}

In [19]:
# Generate a bag of n-grams vector for a sample 
v.transform(["Mina eat pizza"]).toarray()

array([[1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]])

In [21]:
# Let's take a document that has out of vocabulary (OOV) term and see how bag of ngram generates vector out of it
v.transform(["Mona eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

## Apply Bag of n-grams to pre-process the text and then apply different classification algorithms.

In [None]:
import pandas as pd

df = pd.read_json('news_dataset.json')
print(df.shape)

df.head()

In [None]:
df.category.value_counts()

In [None]:
min_samples = 1381 # we have these many SCIENCE articles and SCIENCE is our minority class


df_business = df[df.category=="BUSINESS"].sample(min_samples, random_state=2022)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=2022)

In [None]:
df_balanced = pd.concat([df_business,df_sports,df_crime,df_science],axis=0)
df_balanced.category.value_counts()

In [None]:
target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

df_balanced['category_num'] = df_balanced['category'].map({
    'BUSINESS': 0,
    'SPORTS': 1, 
    'CRIME': 2, 
    'SCIENCE': 3
})

In [None]:
df_balanced.head()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.text, 
    df_balanced.category_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df_balanced.category_num
)

In [None]:
print(X_train.shape)
X_train.head()

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

In [None]:
X_test[:5]

In [None]:
y_pred[:5]

In [None]:
y_test[:5]

### Use Preprocessing and see how results are changing

In [None]:
df_balanced['preprocessed_txt'] = df_balanced['text'].apply(preprocess) 

In [None]:
df_balanced.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.preprocessed_txt, 
    df_balanced.category_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df_balanced.category_num
)

In [None]:
print(X_train.shape)
X_train.head()

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range = (1, 2))),        #using the ngram_range parameter 
    ('Multi NB', MultinomialNB())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [22]:
from matplotlib import pyplot as plt
import seaborn as sn
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('Truth')

ModuleNotFoundError: No module named 'matplotlib'

## Applying the above method to fake-and-real-news-dataset

In [23]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/clmentbisaillon/fake-and-real-news-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████| 41.0M/41.0M [00:04<00:00, 10.5MB/s]

Extracting files...





Path to dataset files: C:\Users\minaa\.cache\kagglehub\datasets\clmentbisaillon\fake-and-real-news-dataset\versions\1


In [23]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/clmentbisaillon/fake-and-real-news-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████| 41.0M/41.0M [00:04<00:00, 10.5MB/s]

Extracting files...





Path to dataset files: C:\Users\minaa\.cache\kagglehub\datasets\clmentbisaillon\fake-and-real-news-dataset\versions\1


In [29]:
import pandas as pd
import os

#  Read both CSV files
fake_df = pd.read_csv(os.path.join(path, 'Fake.csv'))
true_df = pd.read_csv(os.path.join(path, 'True.csv'))

#  Add a 'label' column to distinguish them
fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'

# Merge the datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Check the shape and preview
print(df.shape)
df.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",FAKE
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",FAKE
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",FAKE
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",FAKE
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",FAKE


In [30]:
#check the distribution of labels 
df['label'].value_counts()

label
FAKE    23481
REAL    21417
Name: count, dtype: int64

In [31]:
#Add the new column which gives a unique number to each of these labels 

df['label_num'] = df['label'].map({'FAKE' : 0, 'REAL': 1})

#check the results with top 5 rows
df.head(5)

Unnamed: 0,title,text,subject,date,label,label_num
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",FAKE,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",FAKE,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",FAKE,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",FAKE,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",FAKE,0


In [34]:
# Modeling without preprocessing 
from sklearn.model_selection import train_test_split


#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df.text, 
    df.label_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label_num
)

In [35]:
#print the shapes of X_train and X_test

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (35918,)
Shape of X_test:  (8980,)


In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from  sklearn.neighbors import KNeighborsClassifier

#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1, 3))),                   #using the ngram_range parameter 
     ('KNN', (KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')))           #using the KNN classifier with 10 neighbors and euclidean distance      
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.83      0.79      4696
           1       0.79      0.70      0.74      4284

    accuracy                           0.77      8980
   macro avg       0.77      0.77      0.77      8980
weighted avg       0.77      0.77      0.77      8980



In [38]:
from sklearn.ensemble import RandomForestClassifier


#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range = (3, 3))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      4696
           1       0.97      0.97      0.97      4284

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [None]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1, 2))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB(alpha = 0.75))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

Using Preprocessing and doing the same experiments 

In [40]:
#use this utility function to get the preprocessed text data

import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [41]:
# create a new column "preprocessed_txt" and use the utility function above to get the clean data
# this will take some time, please be patient
df['preprocessed_txt'] = df['Text'].apply(preprocess) 

KeyError: 'Text'

In [None]:
#print the top 5 rows
df.head()

In [None]:
#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
#Note: Make sure to use only the "preprocessed_txt" column for splitting

X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_txt, 
    df.label_num,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label_num
)

In [None]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range = (3, 3))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

In [None]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_n_grams', CountVectorizer(ngram_range = (1, 3))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

In [None]:
#finally print the confusion matrix for the best model

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm


from matplotlib import pyplot as plt
import seaborn as sn
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('Truth')