In [1]:
from sklearn.feature_extraction.text import CountVectorizer

v= CountVectorizer(ngram_range=(1,3))
#generate 1 gram, Bigram and tri-gram
v.fit(["Mami Papi Becho is looking for a Job"])
v.vocabulary_

{'mami': 12,
 'papi': 15,
 'becho': 0,
 'is': 5,
 'looking': 9,
 'for': 3,
 'job': 8,
 'mami papi': 13,
 'papi becho': 16,
 'becho is': 1,
 'is looking': 6,
 'looking for': 10,
 'for job': 4,
 'mami papi becho': 14,
 'papi becho is': 17,
 'becho is looking': 2,
 'is looking for': 7,
 'looking for job': 11}

In [3]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [4]:
import spacy

nlp= spacy.load("en_core_web_sm")

def preprocess(text):
    doc= nlp(text)
    filtered_tokens= []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [5]:
preprocess("Mariam ate Mango and went to the market")

'Mariam eat Mango go market'

In [6]:
corpus_processed= [preprocess(text) for text in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [7]:
v= CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [8]:
#using Bag of n grams models transform text into vector
v.transform(["Thor eat Pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [9]:
v.transform(["Mariam eat Pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [10]:
import pandas as pd

df= pd.read_json("News_Category.json", lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [11]:
df.category.value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [12]:
df.category.unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [20]:

import numpy as np
from sklearn.utils import resample

def undersample_category(df, target_size):
    # Create a dictionary to store the original counts for each category
    orig_counts = dict(df['category'].value_counts())
    
    # Determine the minimum count among all categories
    min_count = min(orig_counts.values())
    
    # Calculate the number of samples to draw from each category
    samples_per_category = {cat: min(target_size, count) for cat, count in orig_counts.items()}
    
    # Create an empty DataFrame to hold the undersampled data
    undersampled_df = pd.DataFrame(columns=df.columns)
    
    # For each category, draw a random sample of size "samples_per_category" and add it to the undersampled DataFrame
    for cat, count in samples_per_category.items():
        cat_df = df[df['category'] == cat]
        cat_sample = resample(cat_df, n_samples=count, replace=False, random_state=42)
        undersampled_df = pd.concat([undersampled_df, cat_sample], axis=0)
    
    # Shuffle the undersampled DataFrame and return it
    return undersampled_df.sample(frac=1, random_state=42)


In [27]:
undersampled_df = undersample_category(df, target_size=1000)


In [28]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

def undersample_category(df, target_size):
    # Create a dictionary to store the original counts for each category
    orig_counts = dict(df['category'].value_counts())
    
    # Determine the minimum count among all categories
    min_count = min(orig_counts.values())
    
    # Calculate the number of samples to draw from each category
    samples_per_category = {cat: min(target_size, count) for cat, count in orig_counts.items()}
    
    # Create a mapping from category names to numeric labels starting from 0
    label_map = {cat: i for i, cat in enumerate(sorted(df['category'].unique()))}
    
    # Add a new column to the DataFrame with the numeric label for each category
    df['category_label'] = df['category'].map(label_map)
    
    # Create an empty DataFrame to hold the undersampled data
    undersampled_df = pd.DataFrame(columns=df.columns)
    
    # For each category, draw a random sample of size "samples_per_category" and add it to the undersampled DataFrame
    for cat, count in samples_per_category.items():
        cat_df = df[df['category'] == cat]
        cat_sample = resample(cat_df, n_samples=count, replace=False, random_state=42)
        undersampled_df = pd.concat([undersampled_df, cat_sample], axis=0)
    
    # Shuffle the undersampled DataFrame and return it, dropping the original category column
    return undersampled_df.drop('category', axis=1).sample(frac=1, random_state=42)


In [30]:
undersampled_df = undersample_category(df, target_size=1000)
undersampled_df

Unnamed: 0,link,headline,short_description,authors,date,category_label
181369,https://www.huffingtonpost.comhttp://www.cnn.c...,The Secret Life Of My Sixth Grader,My sixth grader has a secret life online. It w...,,2012-11-27,22
105872,https://www.huffingtonpost.com/entry/how-hasht...,How Hashtags Evolved And Changed The Way We Co...,We've already pretty much streamlined the comm...,"Uloop, ContributorOnline Marketplace for Colle...",2015-03-04,4
117506,https://www.huffingtonpost.com/entry/mexicos-f...,Mexico's Foreign Minister Pushes for UNSC Refo...,Addressing a public program in New Delhi today...,"Samarth Pathak, ContributorPublic advocacy spe...",2014-10-21,41
193726,https://www.huffingtonpost.com/entry/troy-dono...,"Troy Donovan, Family Reclaim Colorado Home Fro...",Having a place to live has become more difficu...,Bonnie Kavoussi,2012-07-16,21
93638,https://www.huffingtonpost.comhttp://latino.fo...,Nobel Prize Winners Demand Better Health Care ...,"Six Nobel Prize winners, led by America's Jody...",,2015-07-22,19
...,...,...,...,...,...,...
119336,https://www.huffingtonpost.com/entry/4-ways-to...,4 Ways to Support Farm-to-School Policies,,"Maria Rodale, ContributorCEO and Chairman of R...",2014-09-30,16
130090,https://www.huffingtonpost.com/entry/florida-h...,WATCH: College Baseball Team Pulls Off Hidden ...,,Michael Klopman,2014-05-31,28
66521,https://www.huffingtonpost.com/entry/lake-shas...,University Of Oregon Fraternity Suspended Afte...,"""My personal guess is they have no respect for...",Ed Mazza,2016-05-25,4
35231,https://www.huffingtonpost.com/entry/trump-ass...,Trump Assures Farmers Immigration Crackdown Wo...,“He assured us we would have plenty of access ...,"Mica Rosenberg and Kristina Cooke, Reuters",2017-05-15,24


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    undersampled_df.headline, 
    undersampled_df.category_label, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=undersampled_df.category_label
)


In [33]:
print(X_train.shape)
X_train.head()

(33600,)


129442    The Global Search for Education: Which Digital...
205618    Sea Shepherd Declares Victory: Activists' 'Sab...
138611             Creating Connections With Nature Via Art
48947     Why Car Rental Customers Are Losing It Over 'L...
122605                         Learning To Be A College Mom
Name: headline, dtype: object

In [34]:
y_train.value_counts()

9     800
18    800
0     800
40    800
29    800
22    800
20    800
6     800
41    800
23    800
33    800
11    800
10    800
36    800
17    800
13    800
1     800
35    800
26    800
34    800
24    800
38    800
14    800
27    800
7     800
3     800
12    800
8     800
37    800
31    800
4     800
25    800
21    800
2     800
15    800
32    800
19    800
28    800
39    800
5     800
30    800
16    800
Name: category_label, dtype: int64

In [35]:
y_test.value_counts()

41    200
30    200
22    200
21    200
1     200
14    200
13    200
26    200
5     200
0     200
7     200
2     200
11    200
10    200
16    200
34    200
37    200
20    200
31    200
9     200
18    200
4     200
15    200
29    200
27    200
24    200
36    200
12    200
38    200
3     200
6     200
32    200
8     200
17    200
25    200
23    200
28    200
39    200
33    200
35    200
19    200
40    200
Name: category_label, dtype: int64

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

ValueError: Unknown label type: (array([9, 11, 7, ..., 23, 26, 29], dtype=object),)