# BBC News Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
#NLTK modules
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Loading the data

In [3]:

news = pd.read_csv('bbc-news-data.csv', delimiter='\t')
news

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


In [4]:
news.shape

(2225, 4)

In [5]:
news.drop('filename',axis=1,inplace=True)

In [6]:
news['combined_text']=news['title']+' '+news['content']

news.head()

Unnamed: 0,category,title,content,combined_text
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Pernod takeover talk lifts Domecq Shares in U...


In [7]:
news.drop(['title', 'content'], axis=1, inplace=True)

In [8]:
news.head()

Unnamed: 0,category,combined_text
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


## Data Preprocessing

In [None]:
#  Lower Case
#  Tokenization
#  Removing special character
#  Removing stop words and punctuation
#  stemming 

In [10]:
#importing libraies
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()


In [11]:
def transform_text(text):
    text=text.lower()
    text=nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum:
             y.append(i)
    text=y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    
    text=y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
        
    return " ".join(y)

In [27]:
news['combined_text']=news['combined_text'].apply(transform_text)

In [30]:
news['combined_text'][0]

"ad sale boost time warner profit quarterli profit us media giant timewarn jump 76 1.13bn £600m three month decemb 639m year-earli firm one biggest investor googl benefit sale high-spe internet connect higher advert sale timewarn said fourth quarter sale rose 2 11.1bn 10.9bn profit buoy one-off gain offset profit dip warner bro less user aol time warner said friday own 8 search-engin googl internet busi aol mix fortun lost 464,000 subscrib fourth quarter profit lower preced three quarter howev compani said aol 's underli profit except item rose 8 back stronger internet advertis revenu hope increas subscrib offer onlin servic free timewarn internet custom tri sign aol 's exist custom high-spe broadband timewarn also restat 2000 2003 result follow probe us secur exchang commiss sec close conclud time warner 's fourth quarter profit slightli better analyst expect film divis saw profit slump 27 284m help box-offic flop alexand catwoman sharp contrast year-earli third final film lord ring t

## Model Building

In [36]:
from sklearn.model_selection import train_test_split
x=news['combined_text']
y=news['category']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

## Text Vectorization

In [39]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf=TfidfVectorizer()

#  Fit and transform the training data to create TF-IDF features
x_train_tfidf=tfidf.fit_transform(x_train)

#  Transform the testing data using the same vectorizer
x_test_tfidf=tfidf.transform(x_test)

In [40]:
#importing libraries
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [42]:
classifier=SVC(kernel='linear')
classifier.fit(x_train_tfidf,y_train)

In [43]:
y_pred=classifier.predict(x_test_tfidf)

In [46]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
print('Accuracy of SVM model:',accuracy)

Accuracy of SVM model: 0.9842696629213483


In [63]:
y_pred[2]

'sport'

In [59]:
y_test

414          business
420          business
1644            sport
416          business
1232         politics
            ...      
741     entertainment
205          business
1102         politics
668     entertainment
479          business
Name: category, Length: 445, dtype: object