### Importing the necessary libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import string

### Importing the data files

In [12]:
article_info = pd.read_csv("articles_info.csv")
additional_info = pd.read_csv("additional_info.csv")

### Inspecting the dataframes

In [13]:
article_info.head()

Unnamed: 0,category,sub_categories,title,link,date,content,tags
0,Bacteria,"bacteria, public-health",Understanding Cholera: A brief look into its c...,https://foodmicrobiology.academy/understanding...,"July 13, 2024",IntroductionCholera is a bacterial infection t...,"cholera, food microbiology, Food safety, foodb..."
1,Bacteria,"bacteria, food-quality, fungi, yeast",From HPP Innovation Week – Part 2,https://foodmicrobiology.academy/from-hpp-inno...,"July 5, 2024",This is the second of a two-part series of ove...,"food industry, food manufacturing, food microb..."
2,Bacteria,"bacteria, food-quality",From HPP Innovation week – Part 1,https://foodmicrobiology.academy/from-hpp-inno...,"June 30, 2024",Hiperbaric îs a global leader in commercial hi...,"food manufacturing, food microbiology, food pr..."
3,Bacteria,"bacteria, public-health",Coliforms and their role in ensuring the safet...,https://foodmicrobiology.academy/coliforms-and...,"June 27, 2024",We are so delighted to welcome Ruby Chin to ou...,"foodborne disease, microbiology, water quality"
4,Bacteria,"bacteria, public-health",Diverse burden of foodborne disease,https://foodmicrobiology.academy/diverse-burde...,"May 26, 2024","Foodborne diseases, often referred to as foodb...","food microbiology, Food safety, food science, ..."


In [14]:
additional_info.head()

Unnamed: 0,title,content
0,About us,"['Fresher Healthier Safer.', 'This is at the c..."
1,Consulting,['We offer specialised food preservation consu...
2,Contact us,['Contact Info:\nContact Dr Philip Button dire...
3,Food spoilage yeast reference list,"['References', '1999,Critical Controls for Jui..."
4,Our shop,['Product: Science and business of food entrep...


### Recommending 5 articles based on the user query

In [15]:
# Download stopwords if not downloaded
nltk.download('stopwords')

# Function to preprocess text (remove stopwords, punctuation, etc.)
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Function to combine content and tags for better matching
def combine_content_and_tags(row):
    # Check if 'tags' is not NaN and is a string
    if isinstance(row['tags'], str):
        combined_tags = ' '.join(row['tags'].split(','))
    else:
        combined_tags = ''  # If tags are missing or not valid, use an empty string

    # Ensure 'content' is handled similarly
    if isinstance(row['content'], str):
        content = row['content']
    else:
        content = ''  # If content is missing, use an empty string

    # Combine content and tags
    return content + ' ' + combined_tags

# Main function to suggest articles based on query
def suggest_articles(df, query, text_column='content', tags_column='tags', num_articles=5):
    # Combine content and tags for each article
    df['combined_text'] = df.apply(combine_content_and_tags, axis=1)

    # Preprocess article text and query
    df['processed_text'] = df['combined_text'].apply(preprocess_text)
    processed_query = preprocess_text(query)

    # Initialize TF-IDF Vectorizer
    tfidf = TfidfVectorizer()

    # Fit and transform the article text
    tfidf_matrix = tfidf.fit_transform(df['processed_text'])

    # Transform the query into TF-IDF vector
    query_vector = tfidf.transform([processed_query])

    # Compute cosine similarity between the query and all articles
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Add cosine similarity to the dataframe
    df['similarity'] = cosine_sim

    # Sort articles by similarity score and select top n
    top_articles = df.sort_values(by='similarity', ascending=False).head(num_articles)

    # Return the top articles (you can return only necessary columns like title, link, and similarity score)
    return top_articles[['title', 'link', 'similarity', text_column]]

[nltk_data] Downloading package stopwords to /Users/karsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Example query
query = 'foodborne diseases and microbiology'

# Assuming `article_info` is your dataframe loaded as seen in the screenshot
# Suggest articles based on the query
top_articles = suggest_articles(article_info, query)

In [18]:
top_articles

Unnamed: 0,title,link,similarity,content
307,Diverse burden of foodborne disease,https://foodmicrobiology.academy/diverse-burde...,0.373602,"Foodborne diseases, often referred to as foodb..."
4,Diverse burden of foodborne disease,https://foodmicrobiology.academy/diverse-burde...,0.373602,"Foodborne diseases, often referred to as foodb..."
12,Monoclonal antibody therapy in foodborne disea...,https://foodmicrobiology.academy/monoclonal-an...,0.259183,IntroductionFoodborne diseases are a significa...
311,Economic burden of foodborne disease in Austra...,https://foodmicrobiology.academy/economic-burd...,0.225066,Australia grapples with a substantial economic...
354,Zoonoses and foodborne disease,https://foodmicrobiology.academy/zoonoses-and-...,0.136829,The current 2019-nCoV (example image in Figure...
