In [4]:
import spacy
import numpy as np
import pandas as pd
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
data = []
with open("/content/News_Category_Dataset_v3.json","r") as f:
  for line in f:
    data.append(json.loads(line))

df = pd.DataFrame(data)
df = df[['headline','category']]

In [7]:
df

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS
...,...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH
209523,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS
209524,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS


In [8]:
categories = ['TECHNOLOGY','ENTERTAINMENT','POLITICS','BUSINESS']
df = df[df['category'].isin(categories)]

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
nlp

<spacy.lang.en.English at 0x7bf909adf7d0>

In [11]:
def preprocess(text):
  doc = nlp(text.lower())
  tokens = [
      token.lemma_ for token in doc
      if not token.is_stop and not token.is_punct
  ]
  return " ".join(tokens)

In [12]:
df['clean_headline'] = df['headline'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_headline'] = df['headline'].apply(preprocess)


In [13]:
vectorizer = CountVectorizer(
    ngram_range=(1,2),
    max_features=5000
)

X = vectorizer.fit_transform(df['clean_headline'])
y = df['category']

In [15]:
y

Unnamed: 0,category
20,ENTERTAINMENT
21,POLITICS
24,POLITICS
28,ENTERTAINMENT
30,POLITICS
...,...
209509,BUSINESS
209510,BUSINESS
209511,BUSINESS
209512,ENTERTAINMENT


In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

In [18]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

Accuracy: 0.8956071913161465


In [19]:
def predict_category(headline):
  clean = preprocess(headline)
  vector = vectorizer.transform([clean])
  return model.predict(vector)[0]

In [20]:
print("NewsBot Started")

while True:
  text = input("Enter a news headline: ")
  if text.lower() in ['exit','break','quit']:
    break
  category = predict_category(text)
  print("Predicted Category:",category)

NewsBot Started
Enter a news headline: big game In Parliament
Predicted Category: ENTERTAINMENT
Enter a news headline: Scam of 154876 rupees!!!
Predicted Category: POLITICS
Enter a news headline: Gold and Silver price went up again
Predicted Category: POLITICS
Enter a news headline: Tata birla went up
Predicted Category: POLITICS
Enter a news headline: ad adsgjfaf
Predicted Category: BUSINESS
Enter a news headline: Adani
Predicted Category: POLITICS
Enter a news headline: quit
