In [1]:
import requests
import queue
import random
from time import sleep
import bs4
import nltk
import re

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from nltk.tokenize import word_tokenize, wordpunct_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from scipy.spatial.distance import cosine
import seaborn as sns

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Maciej\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Maciej\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maciej\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Maciej\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
porter = PorterStemmer()

We start with 3 functions: to get text from paragraphs from link, to preprocess text and dfs in which we use previous two funcitons and store the data

In [3]:
def getText(url):
    response = requests.get(url)
    parsed = bs4.BeautifulSoup(response.text)
    output = ""
    for p in parsed.select('p'):
        output += p.getText() 
    return output

def custom_stemmer(string):
    reasonable_words = []
    string = re.sub(r'[^\w\s]','',string) #remove punctuation
    string = string.replace('_', '') #and _
    for x in wordpunct_tokenize(string):
        if x not in stopwords.words('english'): #remove stopwords
            if not any(char.isdigit() for char in x): #remove digits
                if x.isascii(): #remove non-english words
                    reasonable_words.append(porter.stem(x))
    return reasonable_words

def dfs(link, treshold):
    documents = []
    finallinks = []
    q = queue.LifoQueue()
    q.put(link)
    visited = [link]
    c = 0
    while not q.empty() and c<treshold:
        url = q.get()
        text = getText(url)
        if len(text) < 100:
            continue
        c += 1
        finallinks.append(url)
        words = custom_stemmer(text)
        finaltext = ' '.join(words)
        documents.append(finaltext)
        response = requests.get(url)
        parsed = bs4.BeautifulSoup(response.text)
        links = parsed.find_all('a', attrs={'href': re.compile(r'^/wiki')})
        random.shuffle(links)
        for i in links:
            if 'https://en.wikipedia.org/' + i['href'] not in visited:
                q.put('https://en.wikipedia.org/' + i['href'])
                visited.append('https://en.wikipedia.org/' + i['href'])
    return documents, finallinks

In [4]:
#no need to run this cell
url = 'https://en.wikipedia.org/wiki/Pozna%C5%84_University_of_Technology'
documents, indexes = dfs(url, 1000)

Here we create dataframe with number of each word for each article

In [5]:
#no need to run this cell
CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
CountData = CountVec.fit_transform(documents)

In [6]:
#no need to run this cell
df=pd.DataFrame(CountData.toarray(), columns=CountVec.get_feature_names_out(), index=indexes)
df

Unnamed: 0,aa,aaa,aachen,aacsb,aadhi,aadt,aaf,aafsat,aafwid,aaharoniwmf,...,zvezdo,zvi,zvonimir,zweig,zwingli,zwischenzug,zygot,zyklon,zz,zzeem
https://en.wikipedia.org/wiki/Pozna%C5%84_University_of_Technology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Tomsk_Polytechnic_University,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Technical_University_of_Munich,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Great_Depression,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Visual_art_of_the_United_States,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://en.wikipedia.org//wiki/Category:Danish_reality_television_series,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Robinson_Ekspeditionen_2003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://en.wikipedia.org//wiki/Expedici%C3%B3n_Robinson_(Ecuadorian_TV_series),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"https://en.wikipedia.org//wiki/Tena,_Ecuador",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And then save it to csv in order to store it read it later

In [7]:
#no need to run this cell
df.to_csv('count.csv')

In [8]:
df = pd.read_csv('count.csv', index_col=0)

We counted frequency of each word in all articles and show 10 most frequent words

In [9]:
df2 = df.sum(axis=0)
df2 = df2.sort_values(ascending=False)
df2 = df2.head(n=10)
print(df2.head(n=10))

use              4373
new              2926
articl           2672
page             2605
state            2404
includ           2383
talk             2107
thi              1994
time             1907
utcreplyrepli    1757
dtype: int64


Next we create dataframe with tf-idf values 

In [10]:
#no need to run this cell
tfidf=TfidfVectorizer(use_idf=True, smooth_idf=False) 
 
dfTFIDF = pd.DataFrame(tfidf.fit_transform(documents).toarray(), index=indexes, columns=tfidf.get_feature_names_out())
dfTFIDF

Unnamed: 0,aa,aaa,aachen,aacsb,aadhi,aadt,aaf,aafsat,aafwid,aaharoniwmf,...,zvezdo,zvi,zvonimir,zweig,zwingli,zwischenzug,zygot,zyklon,zz,zzeem
https://en.wikipedia.org/wiki/Pozna%C5%84_University_of_Technology,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Tomsk_Polytechnic_University,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Technical_University_of_Munich,0.0,0.0,0.0,0.017032,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Great_Depression,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Visual_art_of_the_United_States,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://en.wikipedia.org//wiki/Category:Danish_reality_television_series,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Robinson_Ekspeditionen_2003,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://en.wikipedia.org//wiki/Expedici%C3%B3n_Robinson_(Ecuadorian_TV_series),0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"https://en.wikipedia.org//wiki/Tena,_Ecuador",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


And store it in csv as well

In [11]:
#no need to run this cell
dfTFIDF.to_csv('data.csv')

In [12]:
dfTFIDF = pd.read_csv('data.csv', index_col=0)

And there are words with the highest and lowest idf value

In [13]:
pd.Series(tfidf.idf_, index=tfidf.get_feature_names_out()).sort_values()

the             1.224394
thi             1.462035
may             1.583396
also            1.629234
use             1.648174
                  ...   
interconvers    7.907755
intercompani    7.907755
intercommun     7.907755
interfaci       7.907755
zzeem           7.907755
Length: 41180, dtype: float64

Now, we create function which takes list of links of previously visited articles (don't need to be in our dataframe), number of matches that you would like to get from our database and database and returns links to the most similar articles with cosine similarity value

In [14]:
def recommend(visited, nomatches, dfTFIDF):
    results = []
    for i in visited:
        text = getText(i)
        words = custom_stemmer(text)
        finaltext = ' '.join(words)
        finaltext = tfidf.transform([finaltext]).toarray()[0]
        results.append(1-dfTFIDF.apply(lambda x: cosine(x, finaltext), axis=1))
    ranking = results[0].copy()
    for i in results[1:]:
        ranking += i
    ranking = ranking/3
    ranking = ranking.sort_values(ascending=False)
    return ranking[:nomatches]

Here we present some examples of how to use our algorithm

In [15]:
visited = ['https://en.wikipedia.org/wiki/Adam_Mickiewicz_University_in_Pozna%C5%84', 
           'https://en.wikipedia.org//wiki/Queen%27s_University_at_Kingston', 
           'https://en.wikipedia.org/wiki/Technical_University_of_Munich']
print(recommend(visited, 3, dfTFIDF))

https://en.wikipedia.org//wiki/Technical_University_of_Munich                0.525053
https://en.wikipedia.org/wiki/Pozna%C5%84_University_of_Technology           0.383479
https://en.wikipedia.org//wiki/University_of_Puerto_Rico_at_Mayag%C3%BCez    0.330960
dtype: float64


We started creating our database with article about PUT, so in this example we used articles about different universities: some university, technical university and university from Poznan and we can see that results are very similar with TUM at the beggining as it is identical to one of the provided articles, and PUT as we thought we will get. 

In [16]:
visited = ['https://en.wikipedia.org/wiki/Sword', 
           'https://en.wikipedia.org/wiki/Ecuador', 
           'https://en.wikipedia.org/wiki/The_Lord_of_the_Rings']
print(recommend(visited, 3, dfTFIDF))

https://en.wikipedia.org//wiki/United_States#History    0.174650
https://en.wikipedia.org//wiki/20th_century             0.133956
https://en.wikipedia.org//wiki/Great_Depression         0.110550
dtype: float64


In this exmaple we tried to choose completly unrelated topics and we can see that we got unrelated output as well with low idf values