In [1]:
# Import required libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tkinter import *
from tkinter import filedialog


In [2]:
# Load the news dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="bbc-text.csv")

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# Data preprocessing: stop-word list and stemmer used by preprocess().
# Only contact the NLTK download server when the stop-word corpus is not
# already available locally, so the cell also runs without network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stopwords_english = stopwords.words('english')
stemmer = SnowballStemmer('english')
def preprocess(text, stop_words=None, stem=None):
    """Normalise one document for the bag-of-words model.

    The text is lower-cased, split on whitespace, filtered against the stop
    words, and every remaining token is stemmed. The stems are joined back
    together with single spaces.

    Parameters
    ----------
    text : str or None
        Raw document. ``None`` and float NaN (a missing value) are treated as
        an empty document instead of raising.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the notebook-level ``stopwords_english``.
    stem : callable, optional
        Function mapping a token to its stem. Defaults to ``stemmer.stem``
        of the notebook-level ``stemmer``.

    Returns
    -------
    str
        The space-separated stems; ``""`` when nothing is left.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = stopwords_english
    if stem is None:
        stem = stemmer.stem
    # Convert once per call so each membership test below is a hash lookup
    # rather than a linear scan.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    # Filter and stem in one pass; tokens contain no whitespace, so there is
    # no need to join and re-split between the two steps.
    return " ".join(
        stem(word) for word in text.lower().split() if word not in stop_words
    )

# Replace the raw text column with its preprocessed form.
# NOTE(review): this overwrites df['text'], so re-running the cell feeds
# already-processed text through preprocess() again.
df['text'] = df['text'].map(preprocess)


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [5]:
# Build the bag-of-words matrix: one row per document, one column per term.
# NOTE(review): the vocabulary is learned from every row of df['text'],
# including the rows that a later cell assigns to the test split.
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(raw_documents=df['text'])


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['category'],
    random_state=42,
    test_size=0.2,
)


In [7]:
# Fit a multinomial Naive Bayes classifier on the training split.
# The final expression is left bare so the cell displays the fitted estimator.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=X_train, y=y_train)


MultinomialNB()

In [8]:
# Score the classifier on the held-out split.
y_pred = naive_bayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The remaining metrics all use the same class-weighted averaging.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)


In [9]:
# Preview the last five rows (the text column has been preprocessed by now).
df.tail(n=5)

Unnamed: 0,category,text
2220,business,car pull us retail figur us retail sale fell 0...
2221,politics,kilroy unveil immigr polici ex-chatshow host r...
2222,entertainment,rem announc new glasgow concert us band rem an...
2223,politics,polit squabbl snowbal becom commonplac argu bl...
2224,sport,souness delight euro progress boss graem soune...


In [10]:
# create a GUI to allow user input

# Report the evaluation metrics before starting the GUI: root.mainloop()
# blocks until the window is closed, so output placed after it would only
# appear at that point.
print('Accuracy: {:.2f}%'.format(accuracy*100))
print('Precision: {:.2f}%'.format(precision*100))
print('Recall: {:.2f}%'.format(recall*100))
print('F1 Score: {:.2f}%'.format(f1*100))

root = Tk()

def predict_category():
    """Classify the text typed into ``entry`` and show the result in ``label``.

    The input goes through the same ``preprocess`` and ``count_vectorizer``
    as the training data before being passed to ``naive_bayes``. When the
    resulting feature row is all zeros (empty input, only stop words, or only
    words outside the fitted vocabulary) no prediction is made and the label
    reports the category as unknown.
    """
    text = preprocess(entry.get())
    features = count_vectorizer.transform([text])
    # With no non-zero counts the classifier has no word evidence to go on,
    # so any category it returned would be misleading.
    if features.nnz == 0:
        label.config(text="Category: unknown (no recognised words)")
        return
    prediction = naive_bayes.predict(features)
    label.config(text="Category: {}".format(prediction[0]))

# Widgets are created and packed top to bottom in display order.
label1 = Label(root, text="BBC News Text Classification")
label1.pack()

entry = Entry(root)
entry.pack()

button = Button(root, text="Predict", command=predict_category)
button.pack()

# Result line; filled in by predict_category().
label = Label(root, text="")
label.pack()

# Blocks until the window is closed.
root.mainloop()

Accuracy: 96.63%
Precision: 96.77%
Recall: 96.63%
F1 Score: 96.62%
