In [2]:
import pandas as pd
import numpy as np
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is used;
# the call reports "already up-to-date" when the package is present.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import english_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the training and test CSV files (paths are relative to the working directory).
train_df, test_df = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

In [4]:
def preprocess_text(text):
    """Lower-case ``text``, drop English stop words and stem what remains.

    Parameters
    ----------
    text : str
        Raw text; it is split on whitespace into tokens.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when no token survives).
    """
    # The stop-word set and the stemmer do not depend on ``text``. Building
    # them on every call is wasted work when the function is applied row by
    # row, so they are created on first use and kept on the function object.
    if not hasattr(preprocess_text, '_resources'):
        preprocess_text._resources = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
    stop_words, stemmer = preprocess_text._resources

    # Single pass: filter stop words and stem in the same traversal instead of
    # joining and re-splitting the text between the two steps.
    return ' '.join(
        stemmer.stem(word)
        for word in text.lower().split()
        if word not in stop_words
    )

In [5]:
# Normalise both text columns of the training frame first, then of the test frame.
for frame in (train_df, test_df):
    for text_column in ('Title', 'Description'):
        frame[text_column] = frame[text_column].apply(preprocess_text)

In [6]:
# Each document is the title followed by the description, separated by a space.
train_text = train_df['Title'] + ' ' + train_df['Description']
test_text = test_df['Title'] + ' ' + test_df['Description']

# The vocabulary is learned from the training documents only and then reused
# unchanged to encode the test documents.
cv = CountVectorizer()
X_train = cv.fit_transform(train_text)
X_test = cv.transform(test_text)

y_train, y_test = train_df['Class Index'], test_df['Class Index']

In [7]:
# Multinomial naive Bayes fitted on the token-count matrix and the class labels.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train, y=y_train)

In [8]:
# Score the held-out documents and print per-class precision / recall / F1.
y_pred = nb_classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       0.91      0.90      0.90      1900
           2       0.95      0.98      0.97      1900
           3       0.87      0.85      0.86      1900
           4       0.87      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



In [9]:
# Class index produced by the classifier -> topic name returned to the caller.
_TOPIC_BY_CLASS = {
    1: 'World and Politics',
    2: 'Sports',
    3: 'Business',
    4: 'Technology',
}


def _english_vocabulary():
    """Return the ``web2`` English word set, loading it only on first use."""
    if not hasattr(_english_vocabulary, 'words'):
        _english_vocabulary.words = english_words.get_english_words_set(['web2'])
    return _english_vocabulary.words


def _keep_english_words(text, vocabulary, stemmer):
    """Return the words of ``text`` that are in ``vocabulary``, space-joined.

    A word is kept as-is when it is in the vocabulary; otherwise its stem is
    kept when the stem is in the vocabulary; otherwise the word is dropped.
    """
    kept = []
    for word in text.lower().split():
        if word in vocabulary:
            kept.append(word)
            continue
        stem = stemmer.stem(word)
        if stem in vocabulary:
            kept.append(stem)
    return ' '.join(kept)


def predict(title, description):
    """Classify an article into one of the four trained topics.

    Parameters
    ----------
    title : str
        Article headline.
    description : str
        Article summary / body text.

    Returns
    -------
    str
        ``'World and Politics'``, ``'Sports'``, ``'Business'`` or
        ``'Technology'`` on success (``'Unknown Topic'`` for any other class
        index); ``'invalid input'`` when either argument is an empty string;
        ``'invalid title'`` / ``'invalid description'`` when that field
        contains no word found in the English dictionary.
    """
    if title == '' or description == '':
        return 'invalid input'

    # Loading the dictionary is expensive, so it is fetched once (and cached
    # across calls) instead of being reloaded for every single word.
    vocabulary = _english_vocabulary()
    stemmer = PorterStemmer()

    # Unknown words are silently dropped. A field is rejected only when nothing
    # recognisable is left: the previous "all words valid" re-check ran on the
    # already-filtered words, so it could never fail and gibberish input was
    # classified anyway.
    clean_title = _keep_english_words(title, vocabulary, stemmer)
    clean_description = _keep_english_words(description, vocabulary, stemmer)
    if not clean_title:
        return 'invalid title'
    if not clean_description:
        return 'invalid description'

    # Same document layout as used for training: "<title> <description>",
    # each part passed through preprocess_text.
    article_text = preprocess_text(clean_title) + ' ' + preprocess_text(clean_description)

    # The classifier accepts the sparse matrix directly; no dense copy needed.
    features = cv.transform([article_text])
    class_index = nb_classifier.predict(features)[0]
    return _TOPIC_BY_CLASS.get(class_index, 'Unknown Topic')

In [10]:
#simple fastapi server to run the model
import nest_asyncio
import uvicorn
from fastapi import FastAPI
import pydantic
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()


# Request body schema: both fields are required strings.
class ArticleRequest(pydantic.BaseModel):
    title: str
    description: str


# Allow browser front ends served from any origin to call the API.
# Credentials stay disabled: a wildcard origin must not be combined with
# allow_credentials=True, and this API does not rely on cookies or auth headers.
# To lock the API down, replace the wildcard with an explicit list such as
# ["http://localhost:5500"].
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post('/article')
def article(items: ArticleRequest):
    """Classify the posted article and return ``{'topic': <label or error text>}``.

    Declared with plain ``def`` rather than ``async def``: ``predict`` is
    blocking, CPU-bound work with nothing to await, so FastAPI runs the handler
    in its threadpool instead of stalling the event loop for other requests.
    """
    return {'topic': predict(items.title, items.description)}


if __name__ == "__main__":
    # A notebook kernel already has a running event loop; patch asyncio so
    # uvicorn can start its own loop from inside it.
    nest_asyncio.apply()
    # Serve on uvicorn's default address, spelled out for the reader.
    uvicorn.run(app, host="127.0.0.1", port=8000)

INFO:     Started server process [1200]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:51385 - "OPTIONS /article HTTP/1.1" 200 OK
INFO:     127.0.0.1:51385 - "POST /article HTTP/1.1" 200 OK
