In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from scipy.spatial.distance import cosine

## Loading CSV data

In [2]:
# Path to the article dump; the file is read as semicolon-separated values.
# NOTE(review): the variable name suggests the text was already pre-processed
# (lowercased / lemmatised) before being written to this file — confirm.
data_file = "./data/articles.csv"
processed_database = pd.read_csv(data_file, sep=';')

In [3]:
# Display the loaded table (the notebook repr truncates the middle rows).
processed_database

Unnamed: 0,Title,Content
0,H&R,h & r may refer h & r block tax preparation co...
1,Oeceoclades longebracteata,oeceoclades longebracteata specie terrestrial ...
2,Chelsea filter,gemmology chelsea filter dichromatic optical f...
3,BC Politekhnik,bc politekhnik ukrainian політехнік ukrainian ...
4,Béatrice Roullaud,béatrice roullaud born june 9 1960 french lawy...
...,...,...
995,List of mayors in Canada,index various list mayor canadian municipality...
996,Ariel Rosada,javier ariel rosada born 11 april 1978 campana...
997,"Cohn House (Folsom, California)",cohn house folsom california building queen an...
998,Tino Scotti,tino scotti 16 november 1905 – 16 october 1984...


## Some quick glances at the data

In [4]:
# Pull out the two columns used below: article titles (later used as row
# labels) and article text (the input to the vectorisers).
titles = processed_database["Title"]
contents = processed_database["Content"]

In [5]:
# Show the Title column.
titles

0                                  H&R
1           Oeceoclades longebracteata
2                       Chelsea filter
3                       BC Politekhnik
4                    Béatrice Roullaud
                    ...               
995           List of mayors in Canada
996                       Ariel Rosada
997    Cohn House (Folsom, California)
998                        Tino Scotti
999                          Stevenson
Name: Title, Length: 1000, dtype: object

In [6]:
# Show the Content column.
contents

0      h & r may refer h & r block tax preparation co...
1      oeceoclades longebracteata specie terrestrial ...
2      gemmology chelsea filter dichromatic optical f...
3      bc politekhnik ukrainian політехнік ukrainian ...
4      béatrice roullaud born june 9 1960 french lawy...
                             ...                        
995    index various list mayor canadian municipality...
996    javier ariel rosada born 11 april 1978 campana...
997    cohn house folsom california building queen an...
998    tino scotti 16 november 1905 – 16 october 1984...
999    stevenson english language patronymic surname ...
Name: Content, Length: 1000, dtype: object

In [7]:
# Bag-of-words counts over single words only: ngram_range holds the lower and
# upper bounds on the n-gram lengths that are extracted.
# The vectoriser also preprocesses the text (e.g. lowercases it by default);
# English stop words are dropped here.
count_vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 1),
)
# fit_transform learns the vocabulary and builds the count matrix in one call,
# which is more efficient than fit() followed by transform()
counted_data = count_vectorizer.fit_transform(contents)

In [8]:
# Dense document-term table: one row per article title, one column per
# vocabulary term, each cell holding the term count.
document_term_matrix = pd.DataFrame(
    data=counted_data.toarray(),
    index=titles,
    columns=count_vectorizer.get_feature_names_out(),
)

In [9]:
# Show the document-term count matrix (rows: titles, columns: vocabulary terms).
document_term_matrix

Unnamed: 0_level_0,00,000,0000293349,0000592613,00071,000th,001,0019106,0028643960,005,...,은율,이호재,입학,입학례,장연,재령,제이레빗,평산,해주,황주
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H&R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Oeceoclades longebracteata,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Chelsea filter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BC Politekhnik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Béatrice Roullaud,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
List of mayors in Canada,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ariel Rosada,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Cohn House (Folsom, California)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tino Scotti,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
# Preview which vocabulary terms occur in the first few documents only.
# Printing one index array per document for the whole corpus floods the cell
# output without adding information.
PREVIEW_DOCUMENTS = 3
for i in range(min(PREVIEW_DOCUMENTS, len(document_term_matrix))):
    # column positions of the terms whose count in document i is positive
    print(np.where(document_term_matrix.iloc[i] > 0))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 13, 14, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 36, 37, 39,
       40, 41, 42, 44, 45], dtype=int64),)
(array([12, 15, 21, 26, 27, 28, 32, 35, 42, 43], dtype=int64),)
(array([ 9, 30, 32, 34, 38], dtype=int64),)


In [31]:
# List the vocabulary terms that label the columns of the count matrix.
document_term_matrix.columns

Index(['00', '000', '0000293349', '0000592613', '00071', '000th', '001',
       '0019106', '0028643960', '005',
       ...
       '은율', '이호재', '입학', '입학례', '장연', '재령', '제이레빗', '평산', '해주', '황주'],
      dtype='object', length=43138)

## Computing tf-idf

In [11]:
# tf-idf weighting with idf enabled and without the add-one smoothing of
# document frequencies.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True)
# Dense tf-idf table: rows are labelled by article title, columns by term.
documents_TFIDF = pd.DataFrame(
    data=tfidf_vectorizer.fit_transform(contents).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=titles,
)

In [12]:
# tf-idf weight of the term "wave" in every document (one value per title).
documents_TFIDF["wave"]

Title
H&R                                0.0
Oeceoclades longebracteata         0.0
Chelsea filter                     0.0
BC Politekhnik                     0.0
Béatrice Roullaud                  0.0
                                  ... 
List of mayors in Canada           0.0
Ariel Rosada                       0.0
Cohn House (Folsom, California)    0.0
Tino Scotti                        0.0
Stevenson                          0.0
Name: wave, Length: 1000, dtype: float64

## Example search

In [13]:
def search(query: str, idf: "TfidfVectorizer", idfdf: pd.DataFrame, results_returned: int = 10):
    """Rank the documents in ``idfdf`` by cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Free-text query. It is vectorised with ``idf.transform``, i.e. it is
        mapped into the same tf-idf space as the documents.
    idf : TfidfVectorizer
        Fitted vectoriser. Its feature order must match the columns of
        ``idfdf`` (true when ``idfdf`` was built from this vectoriser).
    idfdf : pd.DataFrame
        One row per document, one column per term, holding tf-idf weights.
    results_returned : int, default 10
        Number of best matches to return.

    Returns
    -------
    pd.Series
        Cosine SIMILARITIES (not distances), sorted from most to least
        similar and indexed like ``idfdf``. A document or query with no
        known terms (an all-zero vector) gets similarity 0.0 rather than NaN.
    """
    # transform() accepts a batch of documents and returns a sparse matrix with
    # one row per document. We pass a single query, so row 0 of the dense form
    # is the query vector: tf-idf weights at the positions of the terms it
    # shares with the vocabulary, zeros everywhere else.
    query_vector = idf.transform([query]).toarray()[0]

    # Cosine similarity of every document with the query in one pass:
    #   cos(angle) = (d . q) / (|d| * |q|)
    # This is the value 1 - (cosine distance), computed with a single
    # matrix-vector product instead of a Python-level loop over the rows.
    document_vectors = idfdf.to_numpy(dtype=float)
    dot_products = document_vectors @ query_vector
    norm_products = np.linalg.norm(document_vectors, axis=1) * np.linalg.norm(query_vector)

    # An all-zero query (no term of the query is in the vocabulary) or an
    # all-zero document makes the denominator 0. The angle is undefined there,
    # so such pairs are reported as "not similar" (0.0) instead of NaN.
    similarities = np.zeros_like(dot_products, dtype=float)
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
    # guard against rounding pushing a value marginally outside [-1, 1]
    np.clip(similarities, -1.0, 1.0, out=similarities)

    # Most similar documents first; a stable sort keeps the result deterministic
    # when several documents have the same score.
    ranked = pd.Series(similarities, index=idfdf.index).sort_values(ascending=False, kind="stable")
    return ranked.iloc[:results_returned]

In [14]:
# Example query; results_returned is left at its default of 10.
search("wave university", tfidf_vectorizer, documents_TFIDF)

Title
Bombora                                 0.240898
John McMillan (economist)               0.163281
Wang Da-hong                            0.126520
American University of Nigeria          0.125200
Martin Lauritzen                        0.119925
Tribhuvan University Central Library    0.116140
Vernon Cook                             0.102704
Henry M. Hoenigswald                    0.095114
Mason Science College                   0.088161
Elizabeth Esteve-Coll                   0.082796
dtype: float64

## Refining the pipeline

In [15]:
def initialise_tfidf_database(path_to_csv: str):
    """Build the tf-idf search database from a semicolon-separated CSV file.

    The file must provide a "Title" and a "Content" column; the document text
    is assumed to have been pre-processed already.

    Parameters
    ----------
    path_to_csv : str
        Location of the CSV file to load.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf_frame)``: the fitted ``TfidfVectorizer`` and a
        dense DataFrame of tf-idf weights with one row per title and one
        column per distinct term.
    """
    corpus = pd.read_csv(path_to_csv, sep=';')
    article_titles, article_texts = corpus["Title"], corpus["Content"]

    # smooth_idf=True adds one to every document frequency (as if one extra
    # document containing every term had been seen), which prevents division
    # by zero in the idf computation.
    vectorizer = TfidfVectorizer(smooth_idf=True)

    # Learn the vocabulary and map the documents into tf-idf space in one step,
    # then label the dense result with titles (rows) and terms (columns).
    tfidf_frame = pd.DataFrame(
        vectorizer.fit_transform(article_texts).toarray(),
        index=article_titles,
        columns=vectorizer.get_feature_names_out(),
    )
    return vectorizer, tfidf_frame