Algorithm : Word2vector with Consine Similarity

In [3]:
import pandas as pd
import spacy
import gensim.downloader as api
import numpy as np


# Load the cleaned data
data = pd.read_csv('clean_article_210324.csv')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Load Word2Vec model
word_vectors = api.load("word2vec-google-news-300")

# Tokenize, lemmatize, and remove stop words
def preprocess_text(text):
    if isinstance(text, str):
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc if not token.is_stop]
        processed_text = ' '.join(tokens)
        return processed_text
    else:
        return ""


# Convert tokens to word vectors and aggregate
def tokens_to_vectors(tokens):
    vectors = []
    for token in tokens:
        if token in word_vectors.key_to_index:
            vectors.append(word_vectors[token])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word_vectors.vector_size)
# Apply preprocessing to introduction, title, and body columns and combine them
data['combined_processed'] = data['intro'].apply(preprocess_text).apply(lambda x: ' '.join(x)) + ' ' + \
                             data['title'].apply(preprocess_text).apply(lambda x: ' '.join(x)) + ' ' + \
                             data['body'].apply(preprocess_text).apply(lambda x: ' '.join(x))
# Apply tokenization and vectorization
data['tokens'] = data['combined_processed'].apply(preprocess_text)
data['vectors'] = data['tokens'].apply(tokens_to_vectors)


# cosine_sim = cosine_similarity(vectors, vectors)
# # Check if there is a matching row in the dataset
# input_text =  "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
# row = data[data['title'] == input_text]
# if len(row) > 0:
#     # Concatenate the title, intro, and body of the row
#     input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
# else:
#     # Preprocess the input text directly
#     input_processed = preprocess_text(input_text)
# print(input_processed)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
# Calculate cosine similarity matrix
vectors = np.vstack(data['vectors'].to_numpy())
# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(data['vectors'].tolist(), data['vectors'].tolist())
from collections import Counter
def recommend_articles(input_text, num_recommendations=6):
    # Preprocess the input text directly
    row = data[data['title'] == input_text]
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    
    # Vectorize the input text
    input_vectorized = tokens_to_vectors(input_processed)
    # Count occurrences of each word
    # word_counts = Counter(input_processed)
    # # Sort word counts in descending order
    # sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    # # Extract top 20 words and their counts
    # top_words = [word for word, _ in sorted_word_counts[:20]]
    # # Vectorize the input text based on the occurrences of top words
    # # input_vectorized = np.array([word_counts[word] if word in word_counts else 0 for word in top_words])
    # print(sorted_word_counts)
    # # Display word counts
    # for word, count in sorted_word_counts[:20]:
    #     print(f"{word}: {count}")
    
    # Calculate cosine similarity with all articles
    similarity_scores = cosine_similarity([input_vectorized], data['vectors'].tolist())[0]
    
    # Sort the indices based on similarity scores
    sim_indices = similarity_scores.argsort()[::-1]
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sim_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles['similarity_scores'] = similarity_scores[sim_indices[:num_recommendations]]
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    print(input_processed)
    # Return the recommended articles
    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarity_scores', 'intro']]

# Example usage
# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles

AirAsia Cambodia : Enhance Intra - ASEAN Travel , Cambodian Income Relaxed Visa fee
Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
2312,1842,PropertyGuru Awards Celebrate Cambodias Real Estate Success,2023-08-21 01:25:50.259544 UTC,0.970795,Cambodias thriving real estate sector was thrown under the spotlight at this years PropertyGuru Cambodia Property Awards with OCIC Group and Siha Properties leading the list of winners
2697,555,ACLEDA Crowned SME Financier of the Year,2022-09-23 06:12:54.136 UTC,0.969854,ACLEDA Bank has scooped a second award at the Global SME Finance Forum 2022 for its work to help Cambodias SME sector grow walking away with the title of SME Financier of the Year
268,1373,Head of FFC Reverses Decision to Resign,2023-05-09 09:00:57.071387 UTC,0.969307,Sao Sokha has reversed his decision to resign as head of FFC after an appeal from the Prime Minister and Cambodian football fans
2543,551,ACLEDA Scoops Award for Aiding Cambodian SMEs,2022-09-22 09:31:03.88 UTC,0.969275,Leading Cambodian bank ACLEDA has won the Best Bank Partner for SMEs in EAP Award by the International Finance Corporation for its commitment to help the Kingdoms SME network grow
1553,708,Officials Arrive in Phnom Penh for ASEAN Summit,2022-11-07 09:30:25.775284 UTC,0.969122,Officials have started to land in Phnom Penh for the ASEAN Summit ahead of global leaders with Chinese Prime Minister Li Keqiang arriving tomorrow and US President Joe Biden on Saturday
264,1659,CrossBorder QR Payments Between China and Cambodia Launch,2023-07-11 09:44:47.441893 UTC,0.968479,NBC and UnionPay International have inked a deal to enable crossborder QR payments between Cambodia and China via KHQR and UnionPay


Algorithm : Word2vector with Euclidean distance

In [24]:
from sklearn.metrics.pairwise import euclidean_distances

def recommend_articles(input_text, num_recommendations=6):
    # Preprocess the input text directly
    row = data[data['title'] == input_text]
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    
    # Vectorize the input text
    input_vectorized = tokens_to_vectors(input_processed)
    
    # Calculate Euclidean distances with all articles
    distances = euclidean_distances([input_vectorized], data['vectors'].tolist())[0]
    
    # Sort the indices based on distances
    sorted_indices = distances.argsort()
    
    # Get the recommended articles based on distances
    recommended_articles = data.iloc[sorted_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add distances to recommended_articles
    recommended_articles['distances'] = distances[sorted_indices[:num_recommendations]]
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'distances', 'intro']]

# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,distances,intro
2697,555,ACLEDA Crowned SME Financier of the Year,2022-09-23 06:12:54.136 UTC,0.393206,ACLEDA Bank has scooped a second award at the Global SME Finance Forum 2022 for its work to help Cambodias SME sector grow walking away with the title of SME Financier of the Year
2543,551,ACLEDA Scoops Award for Aiding Cambodian SMEs,2022-09-22 09:31:03.88 UTC,0.409243,Leading Cambodian bank ACLEDA has won the Best Bank Partner for SMEs in EAP Award by the International Finance Corporation for its commitment to help the Kingdoms SME network grow
2799,1954,Hun Manet Youngest Leader at ASEAN Summit,2023-09-05 10:08:42.858351 UTC,0.410797,Prime Minister Hun Manet is currently in Indonesia for the 43rd ASEAN Summit as the youngest leader from the 10 bloc nations replicating Hun Sens attendance in 1999
1450,265,ACLEDA Securities Most Traded Value on Cambodian Stock Exchange,2022-07-11 10:13:21.070457 UTC,0.416501,ACLEDA Securities PLC is celebrating after securing an award and has the most traded value on Cambodia Securities Exchange for Q2 2022
2918,1658,MJQE Net Profit Skyrockets to 23m in FY22,2023-07-11 09:29:58.004282 UTC,0.418062,Up 50 times education service provider MJQE recorded large gains in net earnings and revenue in the last fiscal year
264,1659,CrossBorder QR Payments Between China and Cambodia Launch,2023-07-11 09:44:47.441893 UTC,0.419445,NBC and UnionPay International have inked a deal to enable crossborder QR payments between Cambodia and China via KHQR and UnionPay


Algorithm: Word2vector with Jaccard similarity

In [18]:
import pandas as pd

# Preprocess text for Jaccard similarity
def preprocess_text_jaccard(text):
    tokens = set(text.split())  # Split text into tokens and convert to a set
    return tokens

# Recommend articles using Jaccard similarity
def recommend_articles_jaccard(input_text, num_recommendations=6):
    # Preprocess the input text directly
    input_processed = preprocess_text_jaccard(input_text)
    
    # Combine introduction, title, and body columns
    data['combined_processed'] = data['intro'] + ' ' + data['title'] + ' ' + data['body']
    
    # Preprocess all articles and convert them to sets of tokens
    articles_processed = data['combined_processed'].apply(preprocess_text_jaccard)
    
    # Calculate Jaccard similarity with all articles
    similarities = [len(input_processed.intersection(article)) / len(input_processed.union(article)) for article in articles_processed]
    
    # Create a DataFrame to store similarity scores and article indices
    similarity_df = pd.DataFrame({'index': range(len(similarities)), 'similarities': similarities})
    
    # Sort articles based on similarity scores
    sorted_indices = similarity_df.sort_values(by='similarities', ascending=False)['index'].tolist()
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sorted_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles['similarities'] = similarity_df['similarities'].iloc[sorted_indices[:num_recommendations]]
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarities', 'intro']]

# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,distances,intro
2697,555,ACLEDA Crowned SME Financier of the Year,2022-09-23 06:12:54.136 UTC,0.393206,ACLEDA Bank has scooped a second award at the Global SME Finance Forum 2022 for its work to help Cambodias SME sector grow walking away with the title of SME Financier of the Year
2543,551,ACLEDA Scoops Award for Aiding Cambodian SMEs,2022-09-22 09:31:03.88 UTC,0.409243,Leading Cambodian bank ACLEDA has won the Best Bank Partner for SMEs in EAP Award by the International Finance Corporation for its commitment to help the Kingdoms SME network grow
2799,1954,Hun Manet Youngest Leader at ASEAN Summit,2023-09-05 10:08:42.858351 UTC,0.410797,Prime Minister Hun Manet is currently in Indonesia for the 43rd ASEAN Summit as the youngest leader from the 10 bloc nations replicating Hun Sens attendance in 1999
1450,265,ACLEDA Securities Most Traded Value on Cambodian Stock Exchange,2022-07-11 10:13:21.070457 UTC,0.416501,ACLEDA Securities PLC is celebrating after securing an award and has the most traded value on Cambodia Securities Exchange for Q2 2022
2918,1658,MJQE Net Profit Skyrockets to 23m in FY22,2023-07-11 09:29:58.004282 UTC,0.418062,Up 50 times education service provider MJQE recorded large gains in net earnings and revenue in the last fiscal year
264,1659,CrossBorder QR Payments Between China and Cambodia Launch,2023-07-11 09:44:47.441893 UTC,0.419445,NBC and UnionPay International have inked a deal to enable crossborder QR payments between Cambodia and China via KHQR and UnionPay
