In [12]:
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# <a id='toc1_'></a>[Preprocessing Utils](#toc0_)

In [13]:
def tokenizer(statement):
    # Filter out stop words & special characters
    stop_words = set(stopwords.words('english'))
    special_characters = '''!()-—[]{};:'"\, <>./?@#$%^&*_~+='''
    tokens = word_tokenize(statement)
    return [token.lower() for token in tokens if token.lower() not in stop_words and token not in special_characters]

def lemmatize_word(word):
    wordnet_tags = {"V": wordnet.VERB, "R": wordnet.ADV,"N": wordnet.NOUN,"J": wordnet.ADJ} 
    # Get parts of speech tag & determine the class in wordnet
    pos_tag = nltk.pos_tag([word])[0][1][0].upper()
    pos_tag_class = wordnet_tags.get(pos_tag, wordnet.NOUN)
    lemmatizer = WordNetLemmatizer()
    # Lemmaitze with Part of Speech Tag to get the pure word
    lemma = lemmatizer.lemmatize(word, pos=pos_tag_class)
    return lemma

def lemmatize_stmt(statement):    
    filtered_tokens = tokenizer(statement)
    lemmatize_tokens = []
    for word in filtered_tokens:
        lemma = lemmatize_word(word)
        lemmatize_tokens.append(lemma)
    return ' '.join(lemmatize_tokens)

# <a id='toc3_'></a>[Search Engine Practical](#toc0_)

## <a id='toc3_1_'></a>[Document Collection](#toc0_)

In [14]:
BASE_URL = 'https://pureportal.coventry.ac.uk/'

print("Scraping")

research_output = []
url_path = "https://pureportal.coventry.ac.uk/en/organisations/ihw-centre-for-health-and-life-sciences-chls/publications/"
response = requests.get(url_path)
soup = BeautifulSoup(response.content, "html.parser")
li_item_tags = soup.find_all('li', class_= 'list-result-item')
for li_item in li_item_tags:
    research_link = li_item.find('a')['href']
    authors = [{'author':author.text, 'url': author['href']} for author in li_item.findAll('a', 'link person')]
    published_date = li_item.find('span', class_='date').text
    title = li_item.find('h3', class_='title').text
    categories = [concept.text for concept in li_item.findAll('span', class_ = 'concept')]
    imp = lemmatize_stmt(title)  + ' ' + \
        " ".join([author['url'].split('/')[-1].replace('-', ' ') for author in authors]) + ' ' + \
            lemmatize_stmt(" ".join(categories)) + ' ' + \
        lemmatize_stmt(published_date)
    research_output.append({"authors":authors, "published_date":published_date, "title":title, "research_link": research_link, "categories": categories, "imp": imp})
print('\nTotal Document Scrapped:', len(research_output))
# Dump scraped data to json file
with open('./scraped_data/rcih_research_output.json', 'w') as f:
    json.dump(research_output, f, indent=4)

Scraping

Total Document Scrapped: 40


## <a id='toc3_2_'></a>[Inverted Index Construction](#toc0_)

In [15]:
inverted_index = {}
with open('./scraped_data/rcih_research_output.json') as f:
    rcih_research = json.loads(f.read())

for doc_id, doc in enumerate(rcih_research):
    imp = lemmatize_stmt(doc['title']  + ' ' + " ".join(doc['categories']) + ' ' + doc['published_date']) + ' ' + \
            " ".join([author['url'].split('/')[-1].replace('-', ' ') for author in doc['authors']])
    for term in imp.split():
        if term not in inverted_index:
            inverted_index[term] = set()
        inverted_index[term].add(doc_id)

# Converting Set to List
for key in inverted_index:
    inverted_index[key] = list(inverted_index[key])

with open('./scraped_data/inverted_index.json', 'w') as f:
    json.dump(inverted_index, f, indent=4)

# <a id='toc4_'></a>[Text Classifier](#toc0_)
Identify whether the input scientific document is from the listed cases: 
- Health
- Business
- Sport

Training Data link : https://www.kaggle.com/datasets/rmisra/news-category-dataset

## <a id='toc4_1_'></a>[Get Training Data](#toc0_)

In [16]:
news_df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [17]:
training_news_df = news_df[news_df['category'].isin(['BUSINESS', "SPORTS", "HEALTHY LIVING"])]
training_news_df.loc[training_news_df['category']== "HEALTHY LIVING" , 'category'] = "HEALTH"
training_news_df = training_news_df[['short_description', 'category']]
training_news_df['short_description'] = training_news_df['short_description'].apply(lemmatize_stmt)
training_news_df.to_pickle("training_news_df.pkl")

## <a id='toc4_2_'></a>[MultinomialNB classifier](#toc0_)

In [18]:
training_news_df = pd.read_pickle('training_news_df.pkl')

# Sample training data
texts = training_news_df['short_description'].tolist()
labels = training_news_df['category'].tolist()
# Create feature vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
classifier = MultinomialNB()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train the classifier
classifier.fit(X_train, y_train)

test_texts = ["""
They fell short in what was virtually a must-win game, with the added significance of being on free-to-air television and with the opportunity to make their mark on the eager crowd that roared and applauded every England run and wicket with vigour.

And they felt the disappointment, too - but there was a bigger picture.

Ten years ago to the day, Nat Sciver-Brunt made her England debut against Pakistan at Louth, where spectators were few and far between and most likely consisted mostly of the players' families.

But for England's debutant at Edgbaston, all-rounder Danielle Gibson, the experience could not be more of a contrast - and in the most brilliant, inspiring way.

Gibson was clapped in to bowl like an Olympic long jumper at the start of their mark, each dot ball cheered like a wicket and every run saved greeted with raucous appreciation.

There was diversity in numbers, too - children danced for the camera, groups of friends dressed up as Super Mario and lifeguards, boys donning England shirts with 'Knight' and 'Sciver-Brunt' on the back.

As England fought back in the closing overs with three late wickets to ignite hopes of a shock victory, the crowd savoured every emotion with them.
"""]
test_X = vectorizer.transform(test_texts)
test_predictions = classifier.predict(test_X)
print('Predictions:', test_predictions)

Predictions: ['SPORTS']


## <a id='toc4_3_'></a>[Metric Report](#toc0_)

Evaluate the performance of classification models.

In [19]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report

# Test the classifier on training data
train_predictions = classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print('Training Accuracy:', train_accuracy)

# Test the classifier on testing data
test_predictions = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print('Testing Accuracy:', test_accuracy)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

# Calculate the F1 score
f1 = f1_score(y_test, test_predictions, average='weighted')
print('F1 Score:', f1)

# Print classification report
class_report = classification_report(y_test, test_predictions)
print('\nClassification Report:')
print(class_report)

Training Accuracy: 0.802603800140746
Testing Accuracy: 0.7261469180973825
F1 Score: 0.7249536376495858

Classification Report:
              precision    recall  f1-score   support

    BUSINESS       0.80      0.66      0.72      1197
      HEALTH       0.64      0.86      0.74      1350
      SPORTS       0.84      0.62      0.71      1006

    accuracy                           0.73      3553
   macro avg       0.76      0.71      0.72      3553
weighted avg       0.75      0.73      0.72      3553

