Algorithm : Bag of word with Consine Similarity

In [2]:
import pandas as pd
import spacy


# Load the cleaned data
data = pd.read_csv('clean_article_210324.csv')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Tokenize, lemmatize, and remove stop words
def preprocess_text(text):
    if isinstance(text, str):
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc if not token.is_stop]
        processed_text = ' '.join(tokens)
        return processed_text
    else:
        return ""

# Apply preprocessing to introduction, title, and body columns
data['intro_processed'] = data['intro'].apply(preprocess_text)
data['title_processed'] = data['title'].apply(preprocess_text)
data['body_processed'] = data['body'].apply(preprocess_text)

# Combine processed text columns
data['combined_processed'] = data['intro_processed'] + ' ' + data['title_processed'] + ' ' + data['body_processed']




In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Vectorize text using Bag of Words (BoW)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['combined_processed'])

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(X, X)
def recommend_articles(input_text, num_recommendations=6):
    # Find the row where the title matches the input_text
    row = data[data['title'] == input_text]

    # Check if there is a matching row in the dataset
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    # Vectorize the concatenated text
    input_vectorized = vectorizer.transform([input_processed])
    
    # Calculate cosine similarity with all articles
    similarity_scores = cosine_similarity(input_vectorized, X)
    
    # Sort the indices based on similarity scores
    sim_indices = similarity_scores.argsort()[0][::-1]
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sim_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles.loc[:, 'similarity_scores'] = similarity_scores[0][sim_indices[:num_recommendations]]  # Using .loc to set values
    print(input_processed)
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id','title','published_at', 'similarity_scores', 'intro']]

# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


AirAsia Cambodia : Enhance Intra - ASEAN Travel , Cambodian Income Relaxed Visa fee
Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
3045,3109,AirAsia Cambodia Enhance IntraASEAN Travel Cambodian Income With Relaxed Visa Fees,2024-02-26 02:56:49.245104 UTC,0.405096,In a comprehensive report by AirAsia Cambodia challenges and recommendations to improve the tourism sector have been put together which echoes longstanding issues raised by industry players
2897,853,AirAsia Cambodia to Take Off to Expand ASEAN Market Share,2022-12-09 10:41:02.752139 UTC,0.369897,Capital As fifth airline AirAsia Cambodia has numerous plans immediate ones being to strengthen connectivity to North Asia diversifying Cambodian routes and stimulating local air travel demand
1557,3258,AirAsia Cambodia Seeks Collaboration To Address High Aviation Costs,2024-03-19 00:32:03.1 UTC,0.271572,AirAsia Cambodia launches domestic ticket sales as CEO Nam Vissoth raises high aviation costs as a key concern urging collaboration with stakeholders to find solutions
2970,3256,AirAsia Cambodia To Take Off in May Amid Aviation Costs Setback,2024-03-18 01:47:40.509105 UTC,0.264002,The lowcost carriers Cambodian joint venture launches its maiden flight in May while calling for preferential rates to offset ticket fares
2678,1455,CP Cambodia Becomes Official Sponsor of ASEAN Para Games,2023-05-31 09:44:11.442 UTC,0.253024,Thailands agroindustrial and food conglomerate CP Cambodia has become an official sponsor of the 2023 ASEAN Para Games which will see about 2000 athletes from ASEAN land in Cambodia to compete from June 3 to 9
168,3003,Thailand Reduces ReEntry Visa Fees for Migrant Workers,2024-02-10 01:49:11.796142 UTC,0.252357,Thailand has announced plans to slash the visa fee to 13 for Cambodian workers reentering the nation after returning home to celebrate Khmer New Year


Algorithm : Word of word with Euclidean_distance

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Define TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the combined processed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_processed'])

# Adjust the recommend_articles function to use Euclidean distance
def recommend_articles(input_text, num_recommendations=6):
    # Preprocess the input text
    input_processed = preprocess_text(input_text)
    
    # Transform the input text using the TF-IDF vectorizer
    input_vectorized = tfidf_vectorizer.transform([input_processed])
    
    # # Calculate Euclidean distance with all articles
    # distances = [np.linalg.norm(input_vectorized - article) for article in tfidf_matrix]
    # Calculate Euclidean distance with all articles
    distances = [euclidean_distances(input_vectorized, doc) for doc in tfidf_matrix]

    # Compute similarity scores based on distances (subtract from 1)
    similarity_scores = [1 - distance for distance in distances]
    
    # Sort the indices based on distance
    sorted_indices = sorted(range(len(distances)), key=lambda i: distances[i])
    
    # Get the recommended articles based on distance
    recommended_articles = data.iloc[sorted_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add distances to recommended_articles
    recommended_articles.loc[:, 'distances'] = [distances[i] for i in sorted_indices[:num_recommendations]]  # Using .loc to set values
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id','title','published_at', 'distances', 'intro']]

# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles



Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,distances,intro
3045,3109,AirAsia Cambodia Enhance IntraASEAN Travel Cambodian Income With Relaxed Visa Fees,2024-02-26 02:56:49.245104 UTC,[[1.0548344523128192]],In a comprehensive report by AirAsia Cambodia challenges and recommendations to improve the tourism sector have been put together which echoes longstanding issues raised by industry players
2976,1573,AirAsia Cambodia Maiden Flight TakeOff in Q4,2023-06-21 13:43:38.726044 UTC,[[1.135960892084915]],The low cost airline launched in December 2022 reiterated its flight takeoff date this fourth quarter with routes to existing AirAsia Group hubs from Cambodias airports
2897,853,AirAsia Cambodia to Take Off to Expand ASEAN Market Share,2022-12-09 10:41:02.752139 UTC,[[1.205735248258524]],Capital As fifth airline AirAsia Cambodia has numerous plans immediate ones being to strengthen connectivity to North Asia diversifying Cambodian routes and stimulating local air travel demand
1557,3258,AirAsia Cambodia Seeks Collaboration To Address High Aviation Costs,2024-03-19 00:32:03.1 UTC,[[1.229005693780206]],AirAsia Cambodia launches domestic ticket sales as CEO Nam Vissoth raises high aviation costs as a key concern urging collaboration with stakeholders to find solutions
2970,3256,AirAsia Cambodia To Take Off in May Amid Aviation Costs Setback,2024-03-18 01:47:40.509105 UTC,[[1.252984048135056]],The lowcost carriers Cambodian joint venture launches its maiden flight in May while calling for preferential rates to offset ticket fares
168,3003,Thailand Reduces ReEntry Visa Fees for Migrant Workers,2024-02-10 01:49:11.796142 UTC,[[1.2551286949051588]],Thailand has announced plans to slash the visa fee to 13 for Cambodian workers reentering the nation after returning home to celebrate Khmer New Year


Algorithm : Word of word with Jaccard similarity

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# Define TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the combined processed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_processed'])

# Adjust the recommend_articles function to use Jaccard similarity
def jaccard_similarity(vector1, vector2):
    intersection = np.logical_and(vector1, vector2).sum()
    union = np.logical_or(vector1, vector2).sum()
    return intersection / union if union != 0 else 0

def recommend_articles(input_text, num_recommendations=6):
    # Preprocess the input text
    input_processed = preprocess_text(input_text)
    
    # Transform the input text using the TF-IDF vectorizer
    input_vectorized = tfidf_vectorizer.transform([input_processed])
    
    # Calculate Jaccard similarity with all articles
    similarities = [jaccard_similarity(input_vectorized.toarray().flatten(), article.toarray().flatten()) for article in tfidf_matrix]
    
    # Sort the indices based on similarity
    sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sorted_indices[:num_recommendations]].copy()  
    
    # Add similarities to recommended_articles
    recommended_articles.loc[:, 'similarities'] = [similarities[i] for i in sorted_indices[:num_recommendations]]  
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id','title','published_at', 'similarities', 'intro']]

# # Example usage
# input_title = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
# recommended_articles = recommend_articles(input_title)
# print("Recommended Articles:")
# recommended_articles
# Example usage
title_1 = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
title_2 = "For Social Platforms, the Outage Was Short. But People’s Stories Vanished, and That’s No Small Thing"
title_3 = "Taylor Swift Struck a Deal With Singapore Not to Perform in Any Other Southeast Asian Country"
title_4 = "UK University Students to Empower Cambodian Girls with STEM Skills"
title_5 = "Court Detains Lamborghini Driver Behind Fatal Collision"
title_6 = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
input_title = title_6
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles



Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarities,intro
1399,1392,Sar Kheng Tests Positive for Covid19,2023-05-12 07:46:38.863575 UTC,0.040404,The Deputy Prime Minister is isolating after testing positive for Covid19 amid Cambodia hosting the SEA Games
1645,558,More than 2000 Expats Apply for Free Angkor Pass,2022-09-24 03:44:10.619608 UTC,0.040323,More than 2000 expats who have called Cambodia home for two years or more have applied for a free Angkor Archaeological Park pass as part of a move to drive domestic tourism in Siem Reap
2684,815,Cambodia and Kuwait to Strengthen Bilateral Ties,2022-12-01 08:14:16.213447 UTC,0.035714,Officials from Cambodia and Kuwait have agreed to tighten bilateral trade and investment as the Kingdom welcomes a new Kuwaiti Ambassador to Cambodia
938,997,PMs Personal Call for China to Build HighSpeed Railway,2023-01-16 12:37:20.547783 UTC,0.034884,The Cambodian Prime Minister has said he will call on China to help build a highspeed railway during a visit that marks the third anniversary of his trip to the country amid Covid19 chaos
2829,713,Ministry of Post and Telecommunications Signs Strategic Cooperation Agreement with Viettel Cambodia,2022-11-09 07:47:13.109 UTC,0.034091,The Ministry of Post and Telecommunications has signed an agreement with Viettel Cambodia Metfone to develop Cambodias telecommunications industry develop data centers and promote other digital activities
1474,1077,Bilateral Political Consultations Start Between France and Cambodia,2023-02-08 15:31:35.389618 UTC,0.033333,The first session of bilateral political consultations between Cambodia and France have started with economic and political ties leading discussions
