### **Stemming, lemmatization**

In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import numpy as np
import requests
import bs4

from sentence_transformers import SentenceTransformer



In [38]:
def getText(url):
    headers = {
        'User-Agent': 'IR Project 1 - Web Crawler -'
        }
    
    output = ""
    response = requests.get(url, headers=headers)
    parsed = bs4.BeautifulSoup(response.text)
    for p in parsed.select('p'):
        output += p.getText()
        

    return output

In [39]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [40]:
def clean_text(text):
    text = str(text)
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [41]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    
    tokens = word_tokenize(text)
    filtered_tokens = [w.lower() for w in tokens if w.isalpha() and w.lower() not in stop_words]
    lemmas = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    stems = [stemmer.stem(word) for word in lemmas]
    return stems

In [42]:
df = pd.read_csv("output.csv")
df['text'] = df['text'].apply(clean_text)
df


Unnamed: 0,url,name,text
0,https://en.wikipedia.org/wiki/Baldur%27s_Gate_3,Baldur's Gate 3,Baldur's Gate 3 (also known as BG3 and Baldur'...
1,https://en.wikipedia.org/wiki/List_of_Dungeons...,List of Dungeons & Dragons video games,This is a list of officially licensed video ga...
2,https://en.wikipedia.org/wiki/Non-player_chara...,Non-player character,A non-player character (NPC) is a character in...
3,https://en.wikipedia.org/wiki/Owlbear,Owlbear,An owlbear (also owl bear) is a fictional crea...
4,https://en.wikipedia.org/wiki/Overhaul_Games,Beamdog,IdeaSpark Labs Inc. (trade name: Beamdog) is a...
...,...,...,...
995,https://en.wikipedia.org/wiki/System_7,System 7,System 7 (later named Mac OS 7) is the seventh...
996,https://en.wikipedia.org/wiki/Apple_headphones,Apple headphones,Apple Inc. has produced and sold headphones si...
997,https://en.wikipedia.org/wiki/London_Stock_Exc...,London Stock Exchange,The London Stock Exchange (LSE) is a global st...
998,https://en.wikipedia.org/wiki/NBC_Sports,NBC Sports,NBC Sports is an American programming division...


In [43]:
processed_texts = [" ".join(preprocess_text(text)) for text in df['text']]
df['processed'] = processed_texts
df

Unnamed: 0,url,name,text,processed
0,https://en.wikipedia.org/wiki/Baldur%27s_Gate_3,Baldur's Gate 3,Baldur's Gate 3 (also known as BG3 and Baldur'...,baldur gate also known baldur gate iii video g...
1,https://en.wikipedia.org/wiki/List_of_Dungeons...,List of Dungeons & Dragons video games,This is a list of officially licensed video ga...,list offici licens video game use dungeon drag...
2,https://en.wikipedia.org/wiki/Non-player_chara...,Non-player character,A non-player character (NPC) is a character in...,charact npc charact game control player term o...
3,https://en.wikipedia.org/wiki/Owlbear,Owlbear,An owlbear (also owl bear) is a fictional crea...,owlbear also owl bear fiction creatur origin c...
4,https://en.wikipedia.org/wiki/Overhaul_Games,Beamdog,IdeaSpark Labs Inc. (trade name: Beamdog) is a...,ideaspark lab trade name beamdog canadian vide...
...,...,...,...,...
995,https://en.wikipedia.org/wiki/System_7,System 7,System 7 (later named Mac OS 7) is the seventh...,system later name mac o seventh major releas c...
996,https://en.wikipedia.org/wiki/Apple_headphones,Apple headphones,Apple Inc. has produced and sold headphones si...,appl produc sold headphon sinc avail standalon...
997,https://en.wikipedia.org/wiki/London_Stock_Exc...,London Stock Exchange,The London Stock Exchange (LSE) is a global st...,london stock exchang lse global stock exchang ...
998,https://en.wikipedia.org/wiki/NBC_Sports,NBC Sports,NBC Sports is an American programming division...,nbc sport american program divis nbcunivers di...


In [44]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed'])
sim_tfidf = cosine_similarity(tfidf_matrix)


In [45]:
embedder = SentenceTransformer('all-mpnet-base-v2')  

semantic_vectors = embedder.encode(df['text'], convert_to_numpy=True)
sim_semantic = cosine_similarity(semantic_vectors)


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 54b2d60f-2694-4aa1-a2ed-8bb9b4b78534)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [46]:
alpha, beta = 0.5, 0.5  
sim_total = alpha * sim_tfidf + beta * sim_semantic


In [47]:
def recommend(article_index, top_n=5):
    sim_scores = sim_total[article_index]
    top_indices = np.argsort(sim_scores)[::-1][1:top_n+1]
    return df.iloc[top_indices][['name', 'text']].assign(similarity=sim_scores[top_indices])

recommend(5, top_n=3)


Unnamed: 0,name,text,similarity
735,Baldur's Gate,Baldur's Gate is a series of role-playing vide...,0.758364
275,Baldur's Gate: Enhanced Edition,Baldur's Gate: Enhanced Edition is a 2012 role...,0.737243
521,Baldur's Gate II: Shadows of Amn,Baldur's Gate II: Shadows of Amn is a role-pla...,0.699128


In [48]:
external_articles = [
     "https://en.wikipedia.org/wiki/Baldur%27s_Gate_3",
    "https://en.wikipedia.org/wiki/Baldur%27s_Gate_III%3A_The_Black_Hound",
    "https://en.wikipedia.org/wiki/Forgotten_Realms",
]


In [49]:
external_text =[getText(a) for a in external_articles]
external_text

['\nBaldur\'s Gate 3 (also known as BG3 and Baldur\'s Gate III) is a 2023 role-playing video game by Larian Studios. It is the third installment in the Baldur\'s Gate series. The game\'s full release for Windows happened in August, with PlayStation 5, macOS, and Xbox Series X/S later in the same year. In the game\'s narrative, the party seeks to cure themselves of a parasitic tadpole infecting their brain. It can be played alone or in a group.\nAdapted from the fifth edition of tabletop role-playing game Dungeons & Dragons, Baldur\'s Gate 3 takes its mechanics and setting, the Forgotten Realms, from the tabletop game. Players create a highly customisable character and embark on quests with a party of voiced companions. Alternatively, they can play as a companion instead. The gameplay comprises real-time exploration of large areas, turn-based combat, and narrative choices which impact the party and the wider world. Outcomes for combat, dialogue and world interaction are generally determ

In [50]:
external_processed = [" ".join(preprocess_text(text)) for text in external_text]
external_processed

['baldur gate also known baldur gate iii video game larian studio third instal baldur gate seri game full releas window happen august playstat maco xbox seri later year game narr parti seek cure parasit tadpol infect brain play alon group adapt fifth edit tabletop game dungeon dragon baldur gate take mechan set forgotten realm tabletop game player creat highli customis charact embark quest parti voic companion altern play companion instead gameplay compris explor larg area combat narr choic impact parti wider world outcom combat dialogu world interact gener determin roll die baldur gate baldur gate ii shadow amn develop biowar third game subtitl black hound develop black isl cancel follow licens disput dungeon dragon owner wizard coast wotc declin larian first pitch make game follow releas divin origin sin impress materi divin origin sin ii wotc welcom new pitch eventu greenlit larian develop compani grew consider product august larian releas game first act earli access provid player f

In [51]:
tfidf_external = tfidf_vectorizer.transform(external_processed)
sim_tfidf_ext = cosine_similarity(tfidf_external, tfidf_matrix)

external_semantic_vectors = embedder.encode(external_processed, convert_to_numpy=True)
sim_semantic_ext = cosine_similarity(external_semantic_vectors, semantic_vectors)

sim_total_ext = alpha * sim_tfidf_ext + beta * sim_semantic_ext


In [52]:
def recommend(top_n=10):
    aggregated_scores = sim_total_ext.sum(axis=0)
    
    top_indices = np.argsort(aggregated_scores)[::-1][:top_n]
    recommendations = df.iloc[top_indices][['name','url']].copy()
    
    return recommendations
print(recommend())

                                  name  \
0                      Baldur's Gate 3   
735                      Baldur's Gate   
286                   Forgotten Realms   
738                   Forgotten Realms   
521   Baldur's Gate II: Shadows of Amn   
49     Baldur's Gate: Dark Alliance II   
748  Forgotten Realms Campaign Setting   
275    Baldur's Gate: Enhanced Edition   
258                 Dungeons & Dragons   
96                  Dungeons & Dragons   

                                                   url  
0      https://en.wikipedia.org/wiki/Baldur%27s_Gate_3  
735  https://en.wikipedia.org/wiki/Baldur%27s_Gate_...  
286     https://en.wikipedia.org/wiki/Forgotten_Realms  
738                https://en.wikipedia.org/wiki/Zehir  
521  https://en.wikipedia.org/wiki/Baldur%27s_Gate_...  
49   https://en.wikipedia.org/wiki/Baldur%27s_Gate:...  
748  https://en.wikipedia.org/wiki/Forgotten_Realms...  
275  https://en.wikipedia.org/wiki/Baldur%27s_Gate:...  
258  https://en.wikipedi