# INFO 371 Problem set 5: k-NN, TF-IDF

## 1 Where are these texts coming from?

In [57]:
# imports necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import operator
import math

### 1.1 Bag of words

In [71]:
# loads data from csv
data = pd.read_csv("texts.csv", error_bad_lines=False)

# prints out first few lines of data
data.head()


Unnamed: 0,name,size,lines,pagenr,text
0,balbulus-early-life-charlemagne,259062,4394,1,Title: Early Lives of Charlemagne by Eginhard ...
1,balbulus-early-life-charlemagne,259062,4394,2,"\n\nThe notes, keyed to line numbers in the so..."
2,balbulus-early-life-charlemagne,259062,4394,3,From a bronze statuette in the Musée Carnavale...
3,balbulus-early-life-charlemagne,259062,4394,4,\n _A lui finit la dissolution ...
4,balbulus-early-life-charlemagne,259062,4394,5,public opinion in regard to the meaning of fal...


In [72]:
# inspects some of the text
print(data.text.head())


0    Title: Early Lives of Charlemagne by Eginhard ...
1    \n\nThe notes, keyed to line numbers in the so...
2    From a bronze statuette in the Musée Carnavale...
3    \n                _A lui finit la dissolution ...
4    public opinion in regard to the meaning of fal...
Name: text, dtype: object


In [73]:
# prints all unique names of texts
print(data.name.unique())

# prints number of unique 
print("\n number of names:", len(data.name.unique()))


['balbulus-early-life-charlemagne' 'beesly-queen-elizabeth' 'bible'
 'carroll-alice-wonderland' 'chipman-earliest-electromagnetic-instruments'
 'cia-world-factbook-1992' 'eckstein-quintus-claudius'
 'fisher-quaker-colonies' 'gallienne-quest-of-golden-girl'
 'gordon-quiet-talks-crowned-christ' 'hardy-madding-crowd'
 'infiltrating-open-systems' 'kant-metaphysical-elements-ethics'
 'karn-snowflakes' 'milton-paradise-lost'
 'naval-academy-sound-military-decision' 'newsgroup'
 'paper-compact-hash-tables' 'paper-data-compression'
 'paper-logical-implementation-of-arithmetic'
 'paper-programming-by-example' 'paper-search-for-autonomy'
 'selected-polish-tales' 'shakespeare-as-you-like-it'
 'unamuno-tragic-sense-of-life' 'vaneeden-quest'
 'webster-early-european-history' 'why-speech-output'
 'workshop-proceedings']

 number of names: 29


In [104]:
# creates bag of words

# gets all text data and puts it in a list
text = data.text.values
text

# initializes vectorizer
vectorizer = CountVectorizer()

# fits and transforms vectorizer with words to create bag of words
bag_of_words = vectorizer.fit_transform(text.astype('U')).toarray()

# gets shape of the bag of words
print("shape:", bag_of_words.shape)

# converts bag of words to numpy array and creates dataframe from that
df = pd.DataFrame(bag_of_words, columns=vectorizer.get_feature_names())
df.head()


shape: (12924, 65060)


Unnamed: 0,00,000,0000,00000,00000000000test,00006,0001,0002,00021,00021053,...,⁸⁴,⁸⁵,⁸⁶,⁸⁷,⁸⁸,⁸⁹,⁹²,⁹³,⁹¹,⁹⁰
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


What are rows? What are columns? What are the entries? How can you extract word counts for a single text? How about a given word?

add stuff..................... rows are sentences, columns are words



In [105]:
# splits data into training and validation

# creates array with length of bag of words
rowIndices = np.arange(len(bag_of_words))

# puts a random 50 row indices into training array and a random 20 rows into validation array
training_indices, validation_indices = train_test_split(rowIndices, train_size=50, test_size=20)

# takes the random training indices and puts the corresponding rows into an array
training_data = bag_of_words[training_indices]

# converts training data into dataframe instead of arrays inside of an array
training_df = pd.DataFrame(training_data)

# gets the names of the documents in the training data
training_titles = data.name[training_indices].values

# takes the random validation indices and puts the corresponding rows into an array
validation_data = bag_of_words[validation_indices]

# converts training data into dataframe instead of arrays inside of an array
validation_df = pd.DataFrame(validation_data)

# gets the names of the documents in the validation data 
validation_titles = data.name[validation_indices].values


In [106]:
# implements cosine similarity

# function that takes two vectors and calculates their cosine similarity
def cosine_similarity(vector_x, vector_y):
  return np.matmul(vector_x, vector_y) / (np.linalg.norm(vector_x) * np.linalg.norm(vector_y))


# gets two of the same vectors from training set
vector_x = training_df.loc[1, :].values
vector_y = training_df.loc[1, :].values

# calls cosine function and prints result
print("cosine similarity with identical vectors:", cosine_similarity(vector_x, vector_y))



# gets two of the different vectors from training set
vector_x = training_df.loc[3, :].values
vector_y = training_df.loc[4, :].values

# calls cosine function and prints result
print("cosine similarity with different vectors:", cosine_similarity(vector_x, vector_y))



# gets two of the different vectors from training set
vector_x = training_df.loc[6, :].values
vector_y = training_df.loc[8, :].values

# calls cosine function and prints result
print("cosine similarity with different vectors:", cosine_similarity(vector_x, vector_y))


cosine similarity with identical vectors: 1.0000000000000002
cosine similarity with different vectors: 0.2755876295126759
cosine similarity with different vectors: 0.2227651485451691


In [112]:
# implements k-NN 

# uses parameters as the second vector and k value to calculate knn
def knn_function(y_vector, k):
    
    # creates an empty dictionary to store indices and cosine similarity values
    cos_sim_dict = {}

    # creates variable meant to indicate indices of vector in training title data
    count = 0

    # appends new value to dictionary, with the row number as the key and the cosine 
    # similarity as the value for every row in the training data
    for row in training_data:
        cos_sim_dict[count] = cosine_similarity(y_vector, row)
        count = count + 1

    # sorts dictionary by cosine similarity in descending order 
    sorted_cos_sim = sorted(cos_sim_dict.items(), key=operator.itemgetter(1), reverse=True)

    # creates list to put titles of nearest neighbors
    titles = []

    # gets titles for k nearest neighbors and stores it titles array
    for i in range(k):
        titles.append(training_titles[sorted_cos_sim[i][0]])

    # gets the count of each of the titles in decending order
    titles_freq = pd.Index(titles).value_counts()

    # selects and prints the most freq name
    return (titles_freq.index[0])


In [120]:
# calculates the percent accuracy where k = 10, train_size = 50, and test_size = 20

# get the total number of vectors in validation dataset
total_vectors_validation = validation_data.shape[1]

# creates variable to store number of predictions that were correct
num_correct = 0

# stores the index of the row
count = 0

# calls knn_function and figures out how many knn predictions were correct
for row in validation_data:
    if (knn_function(row, 10) == validation_titles[count]):
        num_correct = num_correct + 1
        count = count + 1

# calculates and prints the percent accuracy
print(num_correct / total_vectors_validation)


0.00013833384568090994


In [119]:
# increasing the training and testing sets to training set = 1000 and testing set = 100 and calculating percentage correct

# puts a random 1000 row indices into training array and a random 100 rows into validation array
training_indices, validation_indices = train_test_split(rowIndices, train_size=1000, test_size=100)

# takes the random training indices and puts the corresponding rows into an array
training_data = bag_of_words[training_indices]

# converts training data into dataframe instead of arrays inside of an array
training_df = pd.DataFrame(training_data)

# gets the names of the documents in the training data
training_titles = data.name[training_indices].values

# takes the random validation indices and puts the corresponding rows into an array
validation_data = bag_of_words[validation_indices]

# converts training data into dataframe instead of arrays inside of an array
validation_df = pd.DataFrame(validation_data)



# gets the names of the documents in the validation data 
validation_titles = data.name[validation_indices].values

# get the total number of vectors in validation dataset
total_vectors_validation = validation_data.shape[1]

# creates variable to store number of predictions that were correct
num_correct = 0

# stores the index of the row
count = 0

# calls knn_function and figures out how many knn predictions were correct
for row in validation_data:
    if (knn_function(row, 10) == validation_titles[count]):
        num_correct = num_correct + 1
        count = count + 1

# calculates and prints the percent accuracy
print(num_correct / total_vectors_validation)


0.00013833384568090994


In [115]:
# compares different k values - 1, 5, 25

# where k = 1

# gets the names of the documents in the validation data 
validation_titles = data.name[validation_indices].values

# get the total number of vectors in validation dataset
total_vectors_validation = validation_data.shape[1]

# creates variable to store number of predictions that were correct
num_correct = 0

# stores the index of the row
count = 0

# calls knn_function and figures out how many knn predictions were correct
for row in validation_data:
    if (knn_function(row, 1) == validation_titles[count]):
        num_correct = num_correct + 1
    count = count + 1

# calculates and prints the percent accuracy
print("k = 1:", num_correct / total_vectors_validation)



# where k = 5

# gets the names of the documents in the validation data 
validation_titles = data.name[validation_indices].values

# get the total number of vectors in validation dataset
total_vectors_validation = validation_data.shape[1]

# creates variable to store number of predictions that were correct
num_correct = 0

# stores the index of the row
count = 0

# calls knn_function and figures out how many knn predictions were correct
for row in validation_data:
    if (knn_function(row, 1) == validation_titles[count]):
        num_correct = num_correct + 1
    count = count + 1

# calculates and prints the percent accuracy
print("k = 5:", num_correct / total_vectors_validation)

# where k = 25

# gets the names of the documents in the validation data 
validation_titles = data.name[validation_indices].values

# get the total number of vectors in validation dataset
total_vectors_validation = validation_data.shape[1]

# creates variable to store number of predictions that were correct
num_correct = 0

# stores the index of the row
count = 0

# calls knn_function and figures out how many knn predictions were correct
for row in validation_data:
    if (knn_function(row, 1) == validation_titles[count]):
        num_correct = num_correct + 1
        count = count + 1

# calculates and prints the percent accuracy
print("k = 25:", num_correct / total_vectors_validation)

k = 1: 0.0008146326467875807
k = 5: 0.0008146326467875807
k = 25: 0.0008146326467875807


Which k gives you the best performance? What is your highest accuracy?

My accuracy is generally pretty low and the k value doesn't really seem to change the results. 

### 1.2 TF-IDF

In [118]:
# implements tf-idf

# calculates the tf value
tf = np.log10(1 + bag_of_words)

# calculates the idf value
idf = np.log(len(data.name.unique())/bag_of_words.sum(axis = 0))

# prints the product of the tf and idf values
print(tf * idf)


[[-0. -0.  0. ...  0.  0.  0.]
 [-0. -0.  0. ...  0.  0.  0.]
 [-0. -0.  0. ...  0.  0.  0.]
 ...
 [-0. -0.  0. ...  0.  0.  0.]
 [-0. -0.  0. ...  0.  0.  0.]
 [-0. -0.  0. ...  0.  0.  0.]]


How accurate is BOW versus TF-IDF? How does choice of k change the results? Is BOW or TF-IDF faster to run?

The BOW is a lot less accurate and a bit slower to run than the TF-IDF. The k value doesn't seem to change the results too much. 