Algorithm : TF-IDF with Cosine similarity

In [None]:
import pandas as pd
import spacy
import sys

# Load the cleaned data
# (relative path: the CSV is resolved against the notebook's working directory;
#  the cells below read its 'intro', 'title' and 'body' columns)
data = pd.read_csv('clean_article_210324.csv')

# Load Spacy model
# (preprocess_text below uses this pipeline for lemmas and stop-word flags)
nlp = spacy.load('en_core_web_sm')

# Tokenize, lemmatize, and remove stop words
# def preprocess_text(text):
#     doc = nlp(text)
#     tokens = [token.lemma_ for token in doc if not token.is_stop]
#     return ' '.join(tokens)
def preprocess_text(text):
    """Lemmatise ``text`` and drop stop words.

    Args:
        text: Raw text. Anything that is not a ``str`` (e.g. NaN from a
            missing CSV cell) is treated as empty.

    Returns:
        The lemmas of all non-stop-word tokens joined by single spaces, or
        an empty string for non-string input.
    """
    # Guard first: missing values must never reach the spaCy pipeline.
    if not isinstance(text, str):
        return ""

    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)
# Preprocess each text field into a matching "<field>_processed" column
for _field in ('intro', 'title', 'body'):
    data[_field + '_processed'] = data[_field].apply(preprocess_text)

# Join the processed fields (intro, then title, then body) with single spaces
data['combined_processed'] = (
    data['intro_processed']
    + ' '
    + data['title_processed']
    + ' '
    + data['body_processed']
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Learn the vocabulary / IDF weights and build the document-term matrix in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['combined_processed'])

# Pairwise cosine similarity between all articles
cosine_sim = cosine_similarity(X, X)

def recommend_articles(input_text, num_recommendations=5):
    """Recommend the articles most similar to ``input_text`` (TF-IDF + cosine).

    Args:
        input_text: Query text, typically an article title.
        num_recommendations: Maximum number of articles to return.

    Returns:
        A DataFrame with the columns ``page_ptr_id``, ``title``,
        ``published_at``, ``similarity_scores`` and ``intro``, ordered from
        most to least similar. Articles whose title equals ``input_text``
        are left out so an article is never recommended for itself.
    """
    # Preprocess the query and project it into the fitted TF-IDF space
    input_processed = preprocess_text(input_text)
    input_vectorized = vectorizer.transform([input_processed])

    # One score per article; row 0 because there is a single query
    similarity_scores = cosine_similarity(input_vectorized, X)[0]

    # Article positions ordered from most to least similar
    sim_indices = similarity_scores.argsort()[::-1]

    # Exclude the query article by title rather than blindly dropping the
    # top hit: when the query is free text (or a title not in the dataset)
    # the top hit is the best recommendation and must be kept.
    is_input_article = (data['title'] == input_text).to_numpy()
    sim_indices = sim_indices[~is_input_article[sim_indices]]
    top_indices = sim_indices[:num_recommendations]

    # Copy so that adding the score column never touches the original frame
    recommended_articles = data.iloc[top_indices].copy()
    recommended_articles['similarity_scores'] = similarity_scores[top_indices]

    # Set display option for unlimited column width (global pandas setting)
    pd.set_option('display.max_colwidth', None)

    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarity_scores', 'intro']]

# Example usage: query by article title and print the ranked recommendations
input_title = "UK University Students to Empower Cambodian Girls with STEM Skills"
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:", recommended_articles, sep="\n")


Algorithm : Jaccard similarity (on lemmatized token sets, without TF-IDF weighting)

In [None]:

# Define Jaccard similarity function
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: A set of tokens.
        set2: The collection of tokens to compare against.

    Returns:
        The overlap ratio, or 0 when both inputs are empty (the union is
        empty, so the ratio is undefined).
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

# Calculate Jaccard similarity matrix
num_docs = len(data)

# Tokenise every document once up front: building the sets inside the
# pairwise loop would redo the same split for every pair of documents.
# Iterating the Series is positional, so this also works when the
# DataFrame index is not 0..n-1.
token_sets = [set(text.split()) for text in data['combined_processed']]

# The diagonal is left at 0, so a document never ranks as similar to itself.
jaccard_sim = [[0] * num_docs for _ in range(num_docs)]  # Initialize similarity matrix
for i in range(num_docs):
    for j in range(i + 1, num_docs):
        similarity = jaccard_similarity(token_sets[i], token_sets[j])
        jaccard_sim[i][j] = similarity
        jaccard_sim[j][i] = similarity  # Since the matrix is symmetric

# Example usage
doc_index = 0  # Index of the document for which you want similar documents
similar_docs = [(i, jaccard_sim[doc_index][i]) for i in range(num_docs)]  # Similarity scores for all documents
similar_docs.sort(key=lambda x: x[1], reverse=True)  # Sort by similarity score
print("Top 5 similar documents:")
# A separate loop name keeps `doc_index` pointing at the query document.
for other_index, similarity_score in similar_docs[:5]:
    print(f"Document {other_index}: Similarity Score = {similarity_score}")


In [None]:
# Define Jaccard similarity function
def jaccard_similarity(set1, set2):
    """Compute the Jaccard similarity of two token sets.

    The result is the size of the intersection divided by the size of the
    union; 0 is returned when the union is empty.
    """
    shared_count = len(set1.intersection(set2))
    total_count = len(set1.union(set2))
    if total_count == 0:
        return 0
    return shared_count / total_count
def recommend_articles(input_text, num_recommendations=6):
    """Recommend articles by Jaccard similarity of their lemmatised token sets.

    If ``input_text`` exactly matches an article title, the query is built
    from that article's title, intro and body; otherwise ``input_text``
    itself is used as the query.

    Args:
        input_text: An article title from the dataset, or free text.
        num_recommendations: Maximum number of articles to return. A
            matched article is compared against itself too, so it can
            appear in the result.

    Returns:
        A DataFrame with the columns ``page_ptr_id``, ``title``,
        ``published_at``, ``similarity_scores`` and ``intro``, ordered from
        most to least similar.
    """
    # Find the row where the title matches the input_text
    matches = data[data['title'] == input_text]

    if len(matches) > 0:
        first_match = matches.iloc[0]
        # Skip non-string fields (NaN from empty CSV cells): concatenating
        # them with '+' would raise a TypeError.
        fields = [first_match['title'], first_match['intro'], first_match['body']]
        raw_text = ' '.join(field for field in fields if isinstance(field, str))
        input_processed = preprocess_text(raw_text)
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)

    # Build the query token set once instead of once per article
    input_tokens = set(input_processed.split())
    similarity_scores = [
        jaccard_similarity(input_tokens, set(article.split()))
        for article in data['combined_processed']
    ]

    # Positions of the highest-scoring articles (stable sort: ties keep row order)
    sim_indices = sorted(
        range(len(similarity_scores)),
        key=lambda i: similarity_scores[i],
        reverse=True,
    )[:num_recommendations]

    # Copy so that adding the score column never touches the original frame
    recommended_articles = data.iloc[sim_indices].copy()
    recommended_articles['similarity_scores'] = [similarity_scores[i] for i in sim_indices]

    # Set display option for unlimited column width (global pandas setting)
    pd.set_option('display.max_colwidth', None)

    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarity_scores', 'intro']]

# Example usage
# Candidate query titles; point `input_title` at a different one to change the query.
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
# Bare expression as the last line of the cell so the notebook renders the DataFrame
recommended_articles


Algorithm : TF-IDF with Euclidean distance

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Define TF-IDF vectorizer
# (constructed with default settings)
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the combined processed text data
# (the fitted vectorizer is reused by recommend_articles below to embed queries)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_processed'])

# Adjust the recommend_articles function to use Euclidean distance
def recommend_articles(input_text, num_recommendations=6):
    """Recommend the articles closest to ``input_text`` in TF-IDF space.

    Args:
        input_text: Query text, typically an article title.
        num_recommendations: Maximum number of articles to return.

    Returns:
        A DataFrame with the columns ``page_ptr_id``, ``title``,
        ``published_at``, ``distances`` and ``intro``, ordered from the
        smallest to the largest Euclidean distance. ``distances`` holds
        plain floats.
    """
    # Preprocess the query and project it into the fitted TF-IDF space
    input_processed = preprocess_text(input_text)
    input_vectorized = tfidf_vectorizer.transform([input_processed])

    # A single vectorised call returns a (1, n_articles) array; row 0 is the
    # flat vector of distances. Calling euclidean_distances once per row is
    # much slower and yields 1x1 arrays instead of scalars.
    distances = euclidean_distances(input_vectorized, tfidf_matrix)[0]

    # Nearest articles first; a stable sort keeps tied articles in row order
    nearest_indices = distances.argsort(kind='stable')[:num_recommendations]

    # Copy so that adding the distance column never touches the original frame
    recommended_articles = data.iloc[nearest_indices].copy()
    recommended_articles['distances'] = distances[nearest_indices]

    # Set display option for unlimited column width (global pandas setting)
    pd.set_option('display.max_colwidth', None)

    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'distances', 'intro']]

# Example usage
# Candidate query titles; point `input_title` at a different one to change the query.
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
# Bare expression as the last line of the cell so the notebook renders the DataFrame
recommended_articles
