### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import string
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### Importing the data files

In [2]:
data = pd.read_csv('articles_info.csv')
additional_data = pd.read_csv('additional_info.csv')

### Data Cleaning

In [3]:
#remove duplicate rows
df = data.drop_duplicates(subset="content", keep='first')

In [4]:
df.describe()

Unnamed: 0,category,sub_categories,title,link,date,content,tags
count,158,158,158,158,158,158,138
unique,17,69,158,158,158,158,129
top,Bacteria,bacteria,Understanding Cholera: A brief look into its c...,https://foodmicrobiology.academy/understanding...,"July 13, 2024",IntroductionCholera is a bacterial infection t...,"food microbiology, food regulations, Food safe..."
freq,88,18,1,1,1,1,4


In [5]:
def preprocess_text(df, column):
    # Convert text to lowercase
    df[column] = df[column].str.lower()

    # Remove punctuation
    punctuations_list = string.punctuation
    translator = str.maketrans('', '', punctuations_list)
    df[column] = df[column].apply(lambda text: text.translate(translator))

    # Remove repeating characters
    df[column] = df[column].apply(lambda text: re.sub(r'(.)\1+', r'\1', text))

    # Remove numbers
    df[column] = df[column].apply(lambda text: re.sub('[0-9]+', '', text))

    # Tokenize text
    tokenizer = RegexpTokenizer(r'\w+')
    df[column] = df[column].apply(lambda text: tokenizer.tokenize(text))

    # Lemmatization
    lm = WordNetLemmatizer()
    df[column] = df[column].apply(lambda words: [lm.lemmatize(word) for word in words])

    # Convert list of words back to string
    df[column] = df[column].apply(lambda words: ' '.join(words))

    # Remove stopwords (moved to the end)
    STOPWORDS = set(stopwords.words('english'))
    df[column] = df[column].apply(lambda text: " ".join([word for word in text.split() if word not in STOPWORDS]))

    # Return the modified DataFrame
    return df

In [6]:
df = preprocess_text(df, 'content')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(lambda text: text.translate(translator))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(lambda text: re.sub(r'(.)\1+', r'\1', text))
A value is trying to be

In [7]:
df.head()

Unnamed: 0,category,sub_categories,title,link,date,content,tags
0,Bacteria,"bacteria, public-health",Understanding Cholera: A brief look into its c...,https://foodmicrobiology.academy/understanding...,"July 13, 2024",introductioncholera bacterial infection caused...,"cholera, food microbiology, Food safety, foodb..."
1,Bacteria,"bacteria, food-quality, fungi, yeast",From HPP Innovation Week – Part 2,https://foodmicrobiology.academy/from-hpp-inno...,"July 5, 2024",second twopart series overview blog article wr...,"food industry, food manufacturing, food microb..."
2,Bacteria,"bacteria, food-quality",From HPP Innovation week – Part 1,https://foodmicrobiology.academy/from-hpp-inno...,"June 30, 2024",hiperbaric îs global leader comercial high pre...,"food manufacturing, food microbiology, food pr..."
3,Bacteria,"bacteria, public-health",Coliforms and their role in ensuring the safet...,https://foodmicrobiology.academy/coliforms-and...,"June 27, 2024",delighted welcome ruby chin team ruby ndyear b...,"foodborne disease, microbiology, water quality"
4,Bacteria,"bacteria, public-health",Diverse burden of foodborne disease,https://foodmicrobiology.academy/diverse-burde...,"May 26, 2024",fodborne disease often refered fodborne ilnese...,"food microbiology, Food safety, food science, ..."


### Integrating the BioBERT model

In [8]:
# Load the pre-trained BioBERT model
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Create a question-answering pipeline using the model and tokenizer
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [9]:
# Vectorize the content of the articles
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

In [10]:
def recommend_articles(query):
    # Vectorize the query
    query_vec = tfidf_vectorizer.transform([query])
    
    # Compute cosine similarity between the query and each article
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Get top article indices
    top_indices = cosine_similarities.argsort()[-3:][::-1]  # Top 3 recommendations
    
    # Return top articles
    return df.iloc[top_indices]

In [11]:
def handle_user_input(user_input, context=None):
    if context:
        # Use BioBERT for question answering
        result = qa_pipeline(question=user_input, context=context)
        answer = result['answer']
        return {"type": "answer", "content": answer}
    else:
        # Recommend articles based on user input
        recommended = recommend_articles(user_input)
        return {"type": "recommendation", "content": recommended}

In [12]:
# Define a sample context for question answering
context = "Genetics is the study of genes and their role in inheritance, where genes are the basic units of heredity in living organisms."

# Testing BioBERT question answering
user_question = "What is the study of genetics?"
response = handle_user_input(user_question, context)
print("BioBERT Response:", response)

# Testing article recommendation
user_query = "Tell me about biomedical research."
response = handle_user_input(user_query)
print("Article Recommendation Response:", response)

BioBERT Response: {'type': 'answer', 'content': 'their role in inheritance, where genes are the'}
Article Recommendation Response: {'type': 'recommendation', 'content':       category                                     sub_categories  \
104  Commentry                                          commentry   
103  Commentry                                          commentry   
75    Bacteria  bacteria, eukaryotic-microbiology, fermented-f...   

                                                title  \
104  Traditional scientific academic research funding   
103                               Our first two years   
75              Career mentoring in food microbiology   

                                                  link             date  \
104  https://foodmicrobiology.academy/traditional-s...    March 6, 2021   
103  https://foodmicrobiology.academy/our-first-two...  January 1, 2022   
75   https://foodmicrobiology.academy/career-mentor...  August 26, 2020   

                        