Algorithm 1:TF-IDF of Related Article

In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the cleaned data
data = pd.read_csv('cleaned_Kiripost_data_110324.csv')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Tokenize, lemmatize, and remove stop words
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

# Apply preprocessing to introduction, title, and body columns
data['intro_processed'] = data['intro'].apply(preprocess_text)
data['title_processed'] = data['title'].apply(preprocess_text)
data['body_processed'] = data['body'].apply(preprocess_text)

# Combine processed text columns
data['combined_processed'] = data['intro_processed'] + ' ' + data['title_processed'] + ' ' + data['body_processed']

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['combined_processed'])

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(X, X)



In [11]:
def recommend_articles(input_text, num_recommendations=5):
    # Find the row where the title matches the input_text
    row = data[data['title'] == input_text]

    # Check if there is a matching row in the dataset
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    # Vectorize the concatenated text
    
    input_vectorized = vectorizer.transform([input_processed])
    
    # Calculate cosine similarity with all articles
    similarity_scores = cosine_similarity(input_vectorized, X)
    
    # Sort the indices based on similarity scores
    sim_indices = similarity_scores.argsort()[0][::-1]
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sim_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles.loc[:, 'similarity_scores'] = similarity_scores[0][sim_indices[:num_recommendations]]  # Using .loc to set values
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id','title','published_at', 'similarity_scores', 'intro']]

# Example usage
input_title = "For Social Platforms the Outage Was Short But Peoples Stories Vanished and Thats No Small Thing"
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
1966,3186,For Social Platforms the Outage Was Short But Peoples Stories Vanished and Thats No Small Thing,2024-03-07 14:08:43.740950 UTC,1.0,Once upon a time there was a brief outage on some social media platforms It got fixed The end On the face of it kind of a boring story
642,3170,Metas Meltdown Resolved After Hours of Outage,2024-03-06 03:20:47.333427 UTC,0.247168,Metas meltdown has been resolved after hours of being unable to access Facebook Facebook Messenger and Instagram worldwide
395,3166,BREAKING Meta platforms including Facebook Instagram and Messenger are Experiencing a Global Outage,2024-03-05 15:57:38.540206 UTC,0.239026,According to Downdetectorcom Facebook Instagram and Facebook Messenger experienced outages within the last 24 hours
1235,1654,Cambodian Threads Users Share Their Thoughts,2023-07-11 04:10:19.155000 UTC,0.203291,Metas latest social media app Threads smashed records when it launched on Thursday Kiripost speaks to Cambodian artists academics students and digital experts to get their perspectives on the latest social media sensation
906,215,RESYNC Hybrid Storytelling Program Aims to Unify Peace and Art,2022-05-30 03:02:08.708354 UTC,0.172728,The RESYNC training program aims to fuse together art and peace by telling peoples lesserknown stories and adding a human touch to filmmaking


In [4]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)
        self.nlp = spacy.load('en_core_web_sm')
        self.vectorizer = TfidfVectorizer()
        self._preprocess_data()

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        tokens = [token.lemma_ for token in doc if not token.is_stop]
        return ' '.join(tokens)

    def _preprocess_data(self):
        self.data['intro_processed'] = self.data['intro'].apply(self._preprocess_text)
        self.data['title_processed'] = self.data['title'].apply(self._preprocess_text)
        self.data['body_processed'] = self.data['body'].apply(self._preprocess_text)
        self.data['combined_processed'] = self.data['intro_processed'] + ' ' + self.data['title_processed'] + ' ' + self.data['body_processed']
        self.X = self.vectorizer.fit_transform(self.data['combined_processed'])

    def recommend_articles(self, input_text, num_recommendations=11):
        input_processed = self._preprocess_text(input_text)
        input_vectorized = self.vectorizer.transform([input_processed])
        similarity_scores = cosine_similarity(input_vectorized, self.X)
        sim_indices = similarity_scores.argsort()[0][::-1]
        recommended_articles = self.data.iloc[sim_indices[:num_recommendations]].copy()
        recommended_articles['similarity_scores'] = similarity_scores[0][sim_indices[:num_recommendations]]
        pd.set_option('display.max_colwidth', None)
        return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarity_scores', 'intro']]
# Example usage
data_path = 'cleaned_Kiripost_data_110324.csv'
recommender = ContentBasedRecommender(data_path)

In [6]:
input_title = "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
recommended_articles = recommender.recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
1885,1378,Winners Crowned for Sustainable Innovations in Heineken Sustainathon,2023-05-10 07:15:47.080993 UTC,0.442767,Twenty teams competed in the inaugural Heineken Sustainathon with three teams winning the top spot for their innovations in the sustainabilityfocused competition
782,1974,Heineken Cambodia Appoints Andy Hewson as New Managing Director,2023-09-07 08:01:22.696612 UTC,0.286448,HEINEKEN Cambodia today announces the official appointment of Andy Hewson as its new Managing Director Andy has assumed his new role since 1 August 2023 reporting to Uday Sinha HEINEKENs Regional Managing Director for the AsiaPacific APAC region
667,2911,Heineken Partners with Grab to Drive Home Responsible Consumption Message in Cambodia,2024-01-28 06:09:35.586507 UTC,0.276479,Heineken Cambodia and Southeast Asias leading superapp Grab Cambodia are joining forces to empower consumers to enjoy responsibly and not drink and drive
1666,2105,Heinekens Worlds Together Campaign Brings together Community Gatherings,2023-10-02 04:35:10.108869 UTC,0.259717,The community of Kratie kindly hosted local officials dignitaries from the Royal Government of Cambodias Ministry of Environment employees of HEINEKEN Cambodia and environmental conservation experts from WWFCambodia recently
934,1603,Heineken Commemorates 150 Years of Good Times,2023-06-28 08:23:11.535000 UTC,0.23861,Iconic beer brand Heineken is celebrating 150 years of serving consumers worldwide and it is marking this momentous occasion with a series of events in Cambodia in June and July


Algorithm 2: Word2Vector

In [1]:
import pandas as pd
import spacy
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the cleaned data
data = pd.read_csv('cleaned_Kiripost_data_110324.csv')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Load Word2Vec model
word_vectors = api.load("word2vec-google-news-300")

# Tokenize, lemmatize, and remove stop words
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return tokens

# Convert tokens to word vectors and aggregate
def tokens_to_vectors(tokens):
    vectors = []
    for token in tokens:
        if token in word_vectors.key_to_index:
            vectors.append(word_vectors[token])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word_vectors.vector_size)
# Apply preprocessing to introduction, title, and body columns and combine them
data['combined_processed'] = data['intro'].apply(preprocess_text).apply(lambda x: ' '.join(x)) + ' ' + \
                             data['title'].apply(preprocess_text).apply(lambda x: ' '.join(x)) + ' ' + \
                             data['body'].apply(preprocess_text).apply(lambda x: ' '.join(x))
# Apply tokenization and vectorization
data['tokens'] = data['combined_processed'].apply(preprocess_text)
data['vectors'] = data['tokens'].apply(tokens_to_vectors)

# Calculate cosine similarity matrix
vectors = np.vstack(data['vectors'].to_numpy())
# cosine_sim = cosine_similarity(vectors, vectors)
# # Check if there is a matching row in the dataset
# input_text =  "Annual HEINEKEN Sustainathon Invites University Students to Innovate for a Circular Cambodia"
# row = data[data['title'] == input_text]
# if len(row) > 0:
#     # Concatenate the title, intro, and body of the row
#     input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
# else:
#     # Preprocess the input text directly
#     input_processed = preprocess_text(input_text)
# print(input_processed)

In [11]:
# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(data['vectors'].tolist(), data['vectors'].tolist())
from collections import Counter
def recommend_articles(input_text, num_recommendations=11):
    # Preprocess the input text directly
    row = data[data['title'] == input_text]
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    
    # Vectorize the input text
    input_vectorized = tokens_to_vectors(input_processed)
    # Count occurrences of each word
    # word_counts = Counter(input_processed)
    # # Sort word counts in descending order
    # sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    # # Extract top 20 words and their counts
    # top_words = [word for word, _ in sorted_word_counts[:20]]
    # # Vectorize the input text based on the occurrences of top words
    # # input_vectorized = np.array([word_counts[word] if word in word_counts else 0 for word in top_words])
    # print(sorted_word_counts)
    # # Display word counts
    # for word, count in sorted_word_counts[:20]:
    #     print(f"{word}: {count}")
    
    # Calculate cosine similarity with all articles
    similarity_scores = cosine_similarity([input_vectorized], data['vectors'].tolist())[0]
    
    # Sort the indices based on similarity scores
    sim_indices = similarity_scores.argsort()[::-1]
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sim_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles['similarity_scores'] = similarity_scores[sim_indices[:num_recommendations]]
    
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    print(input_processed)
    # Return the recommended articles
    return recommended_articles[['page_ptr_id', 'title', 'published_at', 'similarity_scores', 'intro']]

# Example usage
input_title = "AirAsia Cambodia: Enhance Intra-ASEAN Travel, Cambodian Income With Relaxed Visa Fees"
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles

['AirAsia', 'Cambodia', ':', 'Enhance', 'Intra', '-', 'ASEAN', 'Travel', ',', 'Cambodian', 'Income', 'Relaxed', 'Visa', 'fee']
Recommended Articles:


Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
657,506,Expansion of KHR CrossBorder Payments by QR Code in ASEAN,2022-09-12 10:01:43.918000 UTC,0.673167,Cambodians will soon be able to pay via QR code using the Khmer riel in Vietnam and Laos after ACLEDA Bank was selected by NBC to be the sponsoring bank for crossborder payments
825,1573,AirAsia Cambodia Maiden Flight TakeOff in Q4,2023-06-21 13:43:38.726044 UTC,0.666209,The low cost airline launched in December 2022 reiterated its flight takeoff date this fourth quarter with routes to existing AirAsia Group hubs from Cambodias airports
838,1502,ThaiQR Accessible at 15m Cambodian Retailers,2023-06-10 04:41:17.410322 UTC,0.664617,The national banks of Thailand and Cambodia have struck a deal that enables Thai visitors to purchase products in Cambodia from their domestic bank account via QR code
402,1561,Official Launch of GLN Cross Border CPM QR Payments in Cambodia,2023-06-20 07:46:15.740000 UTC,0.655296,ACLEDA Bank has launched GLN CPM Cross Border QR payments meaning digital wallet users from Korea can now make mobile payments in Cambodia
1493,1659,CrossBorder QR Payments Between China and Cambodia Launch,2023-07-11 09:44:47.441893 UTC,0.651716,NBC and UnionPay International have inked a deal to enable crossborder QR payments between Cambodia and China via KHQR and UnionPay
1819,853,AirAsia Cambodia to Take Off to Expand ASEAN Market Share,2022-12-09 10:41:02.752139 UTC,0.65081,Capital As fifth airline AirAsia Cambodia has numerous plans immediate ones being to strengthen connectivity to North Asia diversifying Cambodian routes and stimulating local air travel demand
974,1203,ACLEDA to Introduce CrossBorder Payments in Laos and Vietnam,2023-03-17 04:00:16.481706 UTC,0.645133,ACLEDA Bank is working towards setting up digital payment systems in Laos and Vietnam to enable payments in Khmer Riel via QR codes after its successful launch in Thailand
383,3160,Positive Sign For Aviation As Emirates Restarts 7Day Service To Phnom Penh in May,2024-03-05 02:51:21.052862 UTC,0.637777,One of the worlds leading airlines return to Phnom Penh after the pandemic is viewed as positive amid an upward trend in the Kingdoms flight movements and flag carrier Cambodia Angkor Airs debut to New Delhi by June this year
1206,580,ACLEDA Customers Can Pay in riel via QR Scan in Thailand,2022-09-30 06:08:08.134000 UTC,0.635748,In a groundbreaking move ACLEDA account holders can now pay in Khmer riel via QR scan while in Thailand with four Thai banks
437,3109,AirAsia Cambodia Enhance IntraASEAN Travel Cambodian Income With Relaxed Visa Fees,2024-02-26 02:56:49.245104 UTC,0.635076,In a comprehensive report by AirAsia Cambodia challenges and recommendations to improve the tourism sector have been put together which echoes longstanding issues raised by industry players


Algorithm 3: Bag of words

In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the cleaned data
data = pd.read_csv('cleaned_Kiripost_data_110324.csv')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Tokenize, lemmatize, and remove stop words
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

# Apply preprocessing to introduction, title, and body columns
data['intro_processed'] = data['intro'].apply(preprocess_text)
data['title_processed'] = data['title'].apply(preprocess_text)
data['body_processed'] = data['body'].apply(preprocess_text)

# Combine processed text columns
data['combined_processed'] = data['intro_processed'] + ' ' + data['title_processed'] + ' ' + data['body_processed']

# Vectorize text using Bag of Words (BoW)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['combined_processed'])

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(X, X)


In [10]:
def recommend_articles(input_text, num_recommendations=11):
    # Find the row where the title matches the input_text
    row = data[data['title'] == input_text]

    # Check if there is a matching row in the dataset
    if len(row) > 0:
        # Concatenate the title, intro, and body of the row
        input_processed = preprocess_text(row['title'].values[0] + ' ' + row['intro'].values[0] + ' ' + row['body'].values[0])
    else:
        # Preprocess the input text directly
        input_processed = preprocess_text(input_text)
    # Vectorize the concatenated text
    input_vectorized = vectorizer.transform([input_processed])
    
    # Calculate cosine similarity with all articles
    similarity_scores = cosine_similarity(input_vectorized, X)
    
    # Sort the indices based on similarity scores
    sim_indices = similarity_scores.argsort()[0][::-1]
    
    # Get the recommended articles based on similarity
    recommended_articles = data.iloc[sim_indices[:num_recommendations]].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Add similarity scores to recommended_articles
    recommended_articles.loc[:, 'similarity_scores'] = similarity_scores[0][sim_indices[:num_recommendations]]  # Using .loc to set values
    print(input_processed)
    # Set display option for unlimited column width
    pd.set_option('display.max_colwidth', None)
    
    # Return the recommended articles
    return recommended_articles[['page_ptr_id','title','published_at', 'similarity_scores', 'intro']]

# Example usage
input_title = "UK University Students to Empower Cambodian Girls with STEM Skills"
recommended_articles = recommend_articles(input_title)
print("Recommended Articles:")
recommended_articles


UK University student empower cambodian girl STEM Skills female mentorship scheme launch UK university student share skill impoverished cambodian girl provide STEM skill steer future female student slate equip essential STEM skill benefit future girl mentorship program connect high school pupil impoverished area UK university studentsThe online project collaboration Cambodian Childrens Fund CCF launch International Womens Day aim empower cambodian girl skill succeed STEM Science Technology Engineering Mathematics careersfive student Neeson Cripps Academy program student Solent University England mark crosscountry exchange institution focus STEM opportunity girl STEM fieldsThe scheme aim plug exist gap woman underrepresented nation science technology industry hope pin new generation girl fill skill void CCF want champion change development STEM education girl countryLea Phorn education project officer oversee project CCF say mentoring program greatly impact student allow know world opti

Unnamed: 0,page_ptr_id,title,published_at,similarity_scores,intro
698,3199,UK University Students to Empower Cambodian Girls with STEM Skills,2024-03-10 15:55:45.719472 UTC,1.0,A female mentorship scheme has been launched that will see UK university students share their skills with impoverished Cambodian girls providing them with STEM skills to steer their future
1302,678,A STEM Classroom Is Creating Cambodias Future Scientists,2022-10-28 06:33:20.101000 UTC,0.461476,A forwardthinking academy is aiming to fill Cambodias engineering and technology skills gap by delivering handson STEM education in the classroom Kiriposts Tharum visits Neeson Cripps Academy to find out more
1074,2431,Using Photography to Explore STEAM Skills,2023-11-18 05:06:16.868000 UTC,0.373006,Its full steam ahead for 19 CCF students who have tapped into their creative skills as part of an overarching program to promote STEAM education with the fruits of their labor now on display
1391,195,Edtechs Boost Cambodias Education in PostPandemic World,2022-04-25 07:28:52.980000 UTC,0.355623,Innovative online educational startups aim to improve the nations learning landscape in the wake of Covid19 school closures
189,156,Students to be Offered Loans to Continue University Studies,2022-03-11 03:11:59.991000 UTC,0.350528,University information portal Sala Tech and AMK Microfinance Plc sign an agreement to offer Cambodian students a loan to complete higher education in a move that takes away the financial burden from parents while contributing to the countrys future development
36,424,Collaboration is Key to Cambodian Educations Pandemic Recovery,2022-08-23 09:50:52.548000 UTC,0.344367,Collaboration within the education sector is key to fill the huge learning gap created by the pandemic with a push needed for digital literacy entrepreneurship skills and technological readiness a new roadmap reveals
134,3065,ACLEDA Institute of Business Achieves University Status,2024-02-19 10:58:46.648166 UTC,0.338345,ACLEDA Institute of Business is celebrating a new chapter in a 24year journey after receiving official university status transforming into ACLEDA University of Business
1808,2113,Cambodian Students Studying in US Set to Increase Ambassador Predicts,2023-10-02 23:01:59.796843 UTC,0.336669,The number of Cambodian students pursuing tertiary education in the US has increased since 2012 with US Ambassador W Patrick Murphy confident that the figure will continue to rise in the future
1344,1682,Providing Young Cambodians with Skills to Thrive,2023-07-18 10:32:37.052155 UTC,0.331089,Since 2013 Siem Reap NGO PEPY Empowering Youth has equipped more than 500 young Cambodians with skills training in entrepreneurship and digital literacy to help their businesses shine
1726,2618,Snoopedu Empowers STEM Education,2023-12-17 03:05:46.529000 UTC,0.317622,Driven by her passion for education innovation and technology Poum Chansocheata has founded Snoopedu Education which is fueling young Cambodians passion for STEM
