## News Category prediction:

In [6]:
# importing necessary libraries...
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# for splitting the data...
from sklearn.model_selection import train_test_split 

# for modelling...
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,BaggingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier

# for evaluate the model...
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# for ignoring warnings...
import warnings 
warnings.filterwarnings('ignore')

In [7]:
# load the data  news categories data...
news = pd.read_csv('NewsCategorizer.csv')
news.head(5)

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods


In [8]:
news.shape

(50000, 5)

In [9]:
news.category.value_counts()

WELLNESS          5000
POLITICS          5000
ENTERTAINMENT     5000
TRAVEL            5000
STYLE & BEAUTY    5000
PARENTING         5000
FOOD & DRINK      5000
WORLD NEWS        5000
BUSINESS          5000
SPORTS            5000
Name: category, dtype: int64

In [10]:
news.isna().sum()

category                0
headline                0
links                   0
short_description       0
keywords             2668
dtype: int64

In [11]:
news[news['keywords'].isna()==True]

Unnamed: 0,category,headline,links,short_description,keywords
51,WELLNESS,Hotels Let Guests Sleep In On Sundays,https://www.huffingtonpost.comhttp://www.usato...,Hotels are encouraging their guests to sleep i...,
57,WELLNESS,Auriculotherapy: Penelope Cruz Sports Acupunct...,https://www.huffingtonpost.comhttp://abcnews.g...,The swath of tiny studs in Penelope Cruz's ear...,
59,WELLNESS,Hormone Therapy 'Not Recommended' By Governmen...,https://www.huffingtonpost.comhttp://www.cnn.c...,The U.S. Preventive Services Task Force says m...,
67,WELLNESS,7 Ways To Fall Asleep Faster,https://www.huffingtonpost.comhttp://www.daily...,Can't get enough z's? Try these tips so you ca...,
101,WELLNESS,Doctors Say Changes In Wheat Do Not Explain Ri...,https://www.huffingtonpost.comhttp://www.npr.o...,It's true that about 40 years ago wheat breede...,
...,...,...,...,...,...
49971,SPORTS,Oakland Athletics Vs. Detroit Tigers: ALDS Gam...,https://www.huffingtonpost.comhttp://bleacherr...,Game 1 of the American League Divisional Serie...,
49980,SPORTS,Tiger Woods On Turning 40 And His Private Stru...,https://www.huffingtonpost.comhttp://pubx.co/k...,Tiger Woods was raised to be a champion. Groom...,
49982,SPORTS,100 Hottest NFL Cheerleaders | Bleacher Report,https://www.huffingtonpost.comhttp://bleacherr...,Bringing you the 100 Hottest Cheerleaders in t...,
49994,SPORTS,"Tiger Woods And Rory McIlroy, Honda Classic Li...",https://www.huffingtonpost.comhttp://golfweek....,Tiger Woods is looking to springboard off a 5-...,


In [12]:
news['label'] = news['category'].map({'WELLNESS':0, 'POLITICS':1, 'ENTERTAINMENT':2, 'TRAVEL':3,
       'STYLE & BEAUTY':4, 'PARENTING':5, 'FOOD & DRINK':6, 'WORLD NEWS':7,
       'BUSINESS':8, 'SPORTS':9})

In [13]:
news.head()

Unnamed: 0,category,headline,links,short_description,keywords,label
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,0
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,0
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,0
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,0
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods,0


In [14]:
import regex as re
from bs4 import BeautifulSoup
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

nlp = spacy.load("en_core_web_sm")


def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text


# Cleaning data
def normalize_corpus(corpus, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                      remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

news['text'] = news['headline'].map(str)+ '. ' +news['short_description']
news['clean text'] = normalize_corpus(news['text'],remove_digits=False)

In [15]:
# defining the x and y
x = news['clean text']
y = news.label

In [16]:
x[2]

'crenezumab trial will gauge whether alzheimer s drug can prevent or slow the disease the clock be tick for the united states to find a cure the team be work on the study with dr francisco lopera of'

In [17]:
# spliting the data for modelling purpose...
trainx , testx , trainy , testy = train_test_split(x,y,random_state=10)

In [18]:
# checking shape of the splitted data...
trainx.shape, testx.shape , trainy.shape , testy.shape

((37500,), (12500,), (37500,), (12500,))

In [19]:
# use count vectoriser technique to create the dtm for train and test data...
vect = CountVectorizer()
trainx_dtm = vect.fit_transform(trainx)
testx_dtm = vect.transform(testx)
trainx_dtm.shape,testx_dtm.shape

((37500, 36118), (12500, 36118))

In [20]:
# checking the last 50 features names...
vect.get_feature_names_out()[-50:]

array(['zocdoc', 'zoe', 'zoellick', 'zoey', 'zolpidem', 'zoltan',
       'zoltar', 'zombie', 'zomnir', 'zone', 'zong', 'zoo', 'zoodle',
       'zooey', 'zookeeper', 'zoolander', 'zoom', 'zootopia', 'zor',
       'zosia', 'zoubeir', 'zouhour', 'zoukis', 'zsa', 'zubairy',
       'zubaydah', 'zubaydeh', 'zuberi', 'zuburbia', 'zucchini',
       'zucchinis', 'zuck', 'zucker', 'zuckerberg', 'zulily', 'zulu',
       'zuma', 'zumba', 'zuravleff', 'zurich', 'zurlon', 'zusak',
       'zuzana', 'zuzu', 'zwigoff', 'zylka', 'zymurgy', 'zynga', 'zyola',
       'zywicki'], dtype=object)

In [21]:
trainx_dtm.shape

(37500, 36118)

In [22]:
# use the same technique but lets take one hyperparameter lowercase = false to not to convert to lowercase...
vect = CountVectorizer(lowercase=False)
trainx_dtm_NL = vect.fit_transform(trainx)
testx_dtm_NL = vect.transform(testx)

In [23]:
trainx_dtm_NL.shape

(37500, 36118)

In [24]:
# lets include 1-ngrams and 2 ngrams...
vect = CountVectorizer(ngram_range=(1,2))
trainx_dtm_ngm = vect.fit_transform(trainx)
testx_dtm_ngm = vect.transform(testx)

In [25]:
trainx_dtm_ngm.shape

(37500, 481824)

In [26]:
vect.get_feature_names_out()[-50:]

array(['zuckerberg post', 'zuckerberg say', 'zuckerberg speak',
       'zuckerberg to', 'zuckerberg wife', 'zulily', 'zulily qa', 'zulu',
       'zulu african', 'zuma', 'zuma from', 'zuma resign', 'zuma south',
       'zumba', 'zumba class', 'zumba meet', 'zumba your', 'zuravleff',
       'zuravleff show', 'zurich', 'zurich be', 'zurich beautiful',
       'zurich chess', 'zurich local', 'zurich look', 'zurich mosque',
       'zurich name', 'zurich on', 'zurich recommend', 'zurich the',
       'zurlon', 'zurlon tipton', 'zusak', 'zusak much', 'zuzana',
       'zuzana navelkova', 'zuzu', 'zuzu joyfully', 'zwigoff',
       'zwigoff claim', 'zylka', 'zylka um', 'zymurgy',
       'zymurgy magazine', 'zynga', 'zynga ceo', 'zyola', 'zyola mix',
       'zywicki', 'zywicki be'], dtype=object)

In [27]:
# use naive bayes to predict the news category and evaluate the models accuracy...
model = MultinomialNB()
model.fit(trainx_dtm_ngm,trainy)
pred_class = model.predict(testx_dtm_ngm)
accuracy_score(testy,pred_class)

0.77976

In [28]:
# predicted categories...
pred_class

array([5, 7, 0, ..., 4, 1, 8], dtype=int64)

In [29]:
# calculate null accuracy...
y_test_binary = np.where(testy==5, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

0.89952

In [30]:
# lets create a function which transform the data into dtm and do modelling and give the accuracy...
def tokenise_test(vect):
    trainx_dtm = vect.fit_transform(trainx)
    print('Features:',trainx_dtm.shape[1])
    testx_dtm = vect.transform(testx)
    nb = MultinomialNB()
    nb.fit(trainx_dtm,trainy)
    pred_class = nb.predict(testx_dtm)
    print('Accuracy:',accuracy_score(testy,pred_class))

In [31]:
# include 1-grams and 1-grams
tokenise_test(CountVectorizer(ngram_range=(1,1)))

Features: 36118
Accuracy: 0.79088


In [32]:
# include 1-grams and 2-grams
tokenise_test(CountVectorizer(ngram_range=(1,2)))

Features: 481824
Accuracy: 0.77976


In [33]:
# include 1-grams and 1-grams and lower = false...
tokenise_test(CountVectorizer(ngram_range=(1,1),lowercase=False))

Features: 36118
Accuracy: 0.79088


In [34]:
# remove stopwords...
vect = CountVectorizer(stop_words='english')

In [35]:
# set of stop words...
print(vect.get_stop_words())

frozenset({'neither', 'yourself', 'anywhere', 'top', 'he', 'yourselves', 'along', 'bottom', 'side', 'name', 'found', 'sometimes', 'both', 'couldnt', 'too', 'rather', 'among', 'whole', 'still', 'their', 'hasnt', 'co', 'this', 'everything', 'ourselves', 'there', 'been', 'fill', 'eight', 'next', 'moreover', 'myself', 'someone', 'move', 'thus', 'formerly', 'cant', 'however', 'six', 'onto', 'anyway', 'so', 'during', 'part', 'where', 'became', 'alone', 'these', 'elsewhere', 'due', 'must', 'well', 'behind', 'mill', 'bill', 'describe', 'everyone', 'my', 'whither', 'could', 'for', 'toward', 'everywhere', 'over', 'has', 'therefore', 'without', 'from', 'thru', 'twelve', 'because', 'eleven', 'least', 'being', 'sincere', 'beside', 'serious', 'done', 'via', 'his', 'will', 'meanwhile', 'own', 'third', 'amount', 'had', 'they', 'always', 'ours', 'down', 'am', 'perhaps', 'detail', 'thence', 'former', 'a', 'how', 'four', 'fifty', 'nine', 'some', 'interest', 'somehow', 'becoming', 'etc', 'take', 'somewher

In [36]:
# use hyperparameter stop_words='english'
tokenise_test(CountVectorizer(stop_words='english'))

Features: 35826
Accuracy: 0.8036


In [37]:
# without hyperparameter stop_words...
tokenise_test(CountVectorizer())

Features: 36118
Accuracy: 0.79088


In [38]:
# use all three hyperparameter which we did earlier...
tokenise_test(CountVectorizer(stop_words='english',ngram_range=(1,2),lowercase=False))

Features: 467531
Accuracy: 0.8224


In [39]:
# remove English stop words and only keep 100 features(max_features=100)
tokenise_test(CountVectorizer(stop_words='english',max_features=100))

Features: 100
Accuracy: 0.41624


In [40]:
tokenise_test(CountVectorizer(stop_words='english',max_features=2000))

Features: 2000
Accuracy: 0.7396


In [41]:
tokenise_test(CountVectorizer(stop_words='english',max_features=3000))

Features: 3000
Accuracy: 0.75704


In [42]:
tokenise_test(CountVectorizer(stop_words='english',max_features=6000))

Features: 6000
Accuracy: 0.78712


In [43]:
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
tokenise_test(CountVectorizer(stop_words='english',max_df=2))

Features: 19958
Accuracy: 0.226


In [44]:
tokenise_test(CountVectorizer(stop_words='english',min_df=4))

Features: 13193
Accuracy: 0.79888


In [45]:
tokenise_test(CountVectorizer(stop_words='english',min_df=5))

Features: 11442
Accuracy: 0.79784


In [46]:
tokenise_test(CountVectorizer(stop_words='english',min_df=6))

Features: 10147
Accuracy: 0.79496


In [47]:
tokenise_test(CountVectorizer(stop_words='english',min_df=7))

Features: 9165
Accuracy: 0.7932


In [48]:
### Let's use tf-idf vectoriser...

In [49]:
# use tf-idf with stop words...
tokenise_test(TfidfVectorizer(stop_words='english'))

Features: 35826
Accuracy: 0.80312


In [50]:
# lets use the updated stopwords which we created earlier...
tokenise_test(TfidfVectorizer(stop_words='english'))

Features: 35826
Accuracy: 0.80312


In [51]:
vect = CountVectorizer(stop_words='english',ngram_range=(1,2),lowercase=False)
trainx_dtm = vect.fit_transform(trainx)
print('Features:',trainx_dtm.shape[1])
testx_dtm = vect.transform(testx)
nb = MultinomialNB()
nb.fit(trainx_dtm,trainy)
pred_class = nb.predict(testx_dtm)
print('Accuracy:',accuracy_score(testy,pred_class))

Features: 467531
Accuracy: 0.8224


In [52]:
print('Confusion Matrix:\n',confusion_matrix(testy,pred_class))

Confusion Matrix:
 [[ 989   13   11   27   21  111   43    9   43    6]
 [  23  964   11   20    6   34    0   87   85   10]
 [  28   34  934   25   69   74    5   13   24   29]
 [  26    9   21 1037   19   34   36   28   20    7]
 [  36    3   15   26 1090   41   11    3   12    4]
 [ 103   15   23   24   23 1011   17    5   26    9]
 [  59    5   16   70   21   39 1000    1   16    5]
 [  17   58   12   30    6   21    0 1067   60   14]
 [  49   43    7   27    9   40   10   15 1067    9]
 [   7   14   10   11    7   23    4   13   15 1121]]


In [53]:
print('Classification Report:\n',classification_report(testy,pred_class))

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.78      0.76      1273
           1       0.83      0.78      0.80      1240
           2       0.88      0.76      0.81      1235
           3       0.80      0.84      0.82      1237
           4       0.86      0.88      0.87      1241
           5       0.71      0.80      0.75      1256
           6       0.89      0.81      0.85      1232
           7       0.86      0.83      0.84      1285
           8       0.78      0.84      0.81      1276
           9       0.92      0.92      0.92      1225

    accuracy                           0.82     12500
   macro avg       0.83      0.82      0.82     12500
weighted avg       0.83      0.82      0.82     12500



In [54]:
print('Accuracy Score:',accuracy_score(testy,pred_class))

Accuracy Score: 0.8224


### Deployment Process:

In [60]:
import pickle

In [67]:
with open('Normalisecorpus.pkl','wb') as file:
    pickle.dump(normalize_corpus,file)
print('''Model saved as 'Normalisecorpus.pkl' ''')

Model saved as 'Normalisecorpus.pkl' 


In [63]:
with open('countvectoriser.pkl','wb') as file:
    pickle.dump(vect,file)
print('''Model saved as 'countvectoriser.pkl' ''')

Model saved as 'countvectoriser.pkl' 


In [64]:
with open('modelnb.pkl','wb') as file:
    pickle.dump(nb,file)
print('''Model saved as 'modelnb.pkl' ''')

Model saved as 'modelnb.pkl' 


In [68]:
import gradio as gr
import pickle
import unicodedata
import re
import spacy

# Loading the vectorizer, model, and SpaCy NLP model
with open('countvectoriser.pkl', 'rb') as f:
    vect = pickle.load(f)

with open('modelnb.pkl', 'rb') as f:
    nb = pickle.load(f)

nlp = spacy.load("en_core_web_sm") 

# Category mapping dictionary
categories = {'WELLNESS': 0, 'POLITICS': 1, 'ENTERTAINMENT': 2, 'TRAVEL': 3,
              'STYLE & BEAUTY': 4, 'PARENTING': 5, 'FOOD & DRINK': 6, 'WORLD NEWS': 7,
              'BUSINESS': 8, 'SPORTS': 9}
reverse_categories = {v: k for k, v in categories.items()}

# Text cleaning functions
def remove_accented_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    return re.sub(pattern, '', text)

def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])

# Corpus normalization
def normalize_corpus(corpus, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, remove_digits=True):
    
    normalized_corpus = []
    for doc in corpus:
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        if text_lower_case:
            doc = doc.lower()
        doc = re.sub(r'[\r|\n|\r\n]+', ' ', doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        doc = re.sub(' +', ' ', doc)
        normalized_corpus.append(doc)
        
    return normalized_corpus

# Prediction function
def classify_text(text):
    normalized_text = normalize_corpus([text])
    input_vector = vect.transform(normalized_text)
    prediction = nb.predict(input_vector)
    return reverse_categories[prediction[0]]

# Create the Gradio interface
iface = gr.Interface(fn=classify_text, inputs="text", outputs="text", title="Text Classification")

# Launch the interface locally to check
iface.launch()


IMPORTANT: You are using gradio version 4.7.1, however version 4.29.0 is available, please upgrade.
--------
Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




## Model Deployed :
* **You can try it by clicking the link**
https://huggingface.co/spaces/HarshU1/News-Category-Prediction