In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pandarallel
!pip install datasketch
print("INSTALLATIONS COMPLETE.")

## Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score,roc_curve
from matplotlib import pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pandarallel import pandarallel
from tqdm.notebook import tqdm
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_validate, KFold, cross_val_score, StratifiedKFold , cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
import optuna
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors, VALID_METRICS_SPARSE
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import jaccard_score
from scipy import sparse
from sklearn.preprocessing import normalize, binarize

import subprocess

# Download and unzip wordnet into the writable working directory.
# NOTE(review): 'wordnet.zip' is looked up relative to the NLTK data roots, so this
# check presumably fails on most setups and the download branch runs every time;
# 'corpora/wordnet' may be the intended resource name - confirm before changing.
try:
    nltk.data.find('wordnet.zip')
except LookupError:  # only "resource not found" should trigger a download
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o overwrites existing files without prompting, so re-running the cell
    # cannot block on an interactive "replace?" question.
    unzip_result = subprocess.run(
        ["unzip", "-o", "/kaggle/working/corpora/wordnet.zip", "-d", "/kaggle/working/corpora"]
    )
    if unzip_result.returncode != 0:
        # Best effort: keep going, but make the failure visible.
        print(f"WARNING: unzip of wordnet exited with code {unzip_result.returncode}")
    # Avoid stacking duplicate entries when the cell is executed more than once.
    if '/kaggle/working/' not in nltk.data.path:
        nltk.data.path.append('/kaggle/working/')
    

print('Imports done.')

In [None]:
# Load the unlabeled test split and the labeled training split.
test_df = pd.read_csv('/kaggle/input/bigdata2023classification/test_without_labels.csv')
train_df = pd.read_csv('/kaggle/input/bigdata2023classification/train.csv')

# factorize() returns (integer codes, unique labels); the codes are numbered in order of
# first appearance and are used later for the wordclouds and for XGBoost.
label_codes, _unique_labels = train_df['Label'].factorize()
train_df['Label_id'] = label_codes

train_df.head()

# Question 1.1

## Data Preprocessing

In [None]:
# Visualize the class balance of the training set (the Label column).
# One horizontal bar per label; bar length = number of training rows with that label.

train_df.groupby('Label').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right borders of the axes for a cleaner chart.
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Same class distribution as a seaborn count plot; entertainment appears to be the largest class.
sns.countplot(data=train_df, x='Label')

In [None]:
# Lookup table: one row per (Label, Label_id) pair, ordered by id.
label = (
    train_df[['Label', 'Label_id']]
    .drop_duplicates()
    .sort_values(by='Label_id')
)
label

In [None]:
# Report (rows, columns) for both splits.
for caption, frame in (('train_df size: ', train_df), ('test_df size: ', test_df)):
    print(caption, frame.shape)
# 159707 full size

# Question 1.2

In [None]:
# Text preprocessing for the Content column.
# Step 1: lower-case every article in both splits (the frames are modified in place).
for frame in (train_df, test_df):
    frame['Content'] = frame['Content'].str.lower()

# pandarallel provides parallel_apply, used below instead of a plain pandas apply
# because it runs the per-row functions noticeably faster.
pandarallel.initialize(progress_bar=True, nb_workers=8)

# Remove all special characters
def remove_special_chars(text):
    """Return ``text`` with every non-alphanumeric character replaced by one space.

    The output has the same length as the input; alphanumeric characters
    (as judged by ``str.isalnum``) are kept unchanged.
    """
    cleaned = []
    for ch in text:
        cleaned.append(ch if ch.isalnum() else ' ')
    return ''.join(cleaned)

# Step 2: replace punctuation/symbols with spaces in both splits.
for frame in (train_df, test_df):
    frame['Content'] = frame['Content'].parallel_apply(remove_special_chars)

# Stopword set: NLTK's English list plus a few extra frequent words.
extra_stopwords = {'well', 'said', 'say', 'one', 'even'}
stop = set(stopwords.words('english')) | extra_stopwords

# Remove stop_words
def remove_stopwords(text):
    """Tokenize ``text`` and return, as a list, the tokens not present in ``stop``.

    Note the return type: a list of tokens, not a string.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop:
            continue
        kept.append(token)
    return kept

# Step 3: tokenize and drop stopwords. After this, each Content cell holds a
# list of tokens (remove_stopwords returns a list), not a string.
train_df['Content'] = train_df['Content'].parallel_apply(remove_stopwords)
test_df['Content'] = test_df['Content'].parallel_apply(remove_stopwords)

# Lemmatization
def lemmatize_word(text):
    """Lemmatize each token in ``text`` and join the results with single spaces.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens (as produced by ``remove_stopwords``); the return value is a string.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)

# Step 4: lemmatize the token lists and turn them back into space-separated strings.
for frame in (train_df, test_df):
    frame['Content'] = frame['Content'].parallel_apply(lemmatize_word)

# Show the first cleaned article of each split as a sanity check.
print('Example of preprocessing train: ')
print(train_df.loc[0, 'Content'])
print("\n")
print('Example of preprocessing test: ')
print(test_df.loc[0, 'Content'])

In [None]:
# Word cloud creation

# Pull out the cleaned Content of each class, one Series per Label_id.
# NOTE(review): the id -> name mapping below assumes the order in which labels first
# appear in train.csv; verify against the Label/Label_id table printed earlier.
entertainment = train_df.loc[train_df['Label_id'] == 0, 'Content']
technology = train_df.loc[train_df['Label_id'] == 1, 'Content']
business = train_df.loc[train_df['Label_id'] == 2, 'Content']
health = train_df.loc[train_df['Label_id'] == 3, 'Content']

def wordcloud_draw(dataset, color = 'white'):
    """Render a word cloud for a collection of documents.

    Args:
        dataset: iterable of strings; they are joined into one corpus.
        color: background colour passed to WordCloud (default 'white').

    The words 'news' and 'text' are removed before generation, and the
    module-level ``stop`` set is passed to WordCloud as its stopword list.
    Displays the figure; returns nothing.
    """
    excluded = ('news', 'text')
    corpus = ' '.join(dataset)
    cleaned_word = ' '.join(token for token in corpus.split() if token not in excluded)

    cloud = WordCloud(width=2500, height=2500, background_color=color, stopwords=stop)
    cloud.generate(cleaned_word)

    plt.figure(1, figsize=(10, 7))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    
# Draw one word cloud per class, each preceded by its heading.
for heading, articles in (
    ('Entertainment wordcloud: ', entertainment),
    ('Technology wordcloud: ', technology),
    ('Business wordcloud: ', business),
    ('Health wordcloud: ', health),
):
    print(heading)
    wordcloud_draw(articles)

## Bag of words 

In [None]:
# Count vectorizer will represent the BOW model.
y_train = train_df['Label']

count_vectorizer = CountVectorizer()

# The vocabulary is learned on the training articles only; the test articles
# are then encoded with that same vocabulary.
X_train = count_vectorizer.fit_transform(train_df['Content'])
X_test = count_vectorizer.transform(test_df['Content'])

print('Preprocessing complete.')

## SVM training and evaluation (with BOW)

In [None]:
# Support Vector Machine (SVM) on the BOW features.
# LinearSVC is chosen because it is fast on large, roughly linearly separable data
# and handles multi-class labels with a One-vs-Rest scheme.
svm_model = LinearSVC(max_iter=1000, random_state=42)

# 5-fold cross-validation: every training row is predicted by a model that did not see it.
y_pred = cross_val_predict(estimator=svm_model, X=X_train, y=y_train, cv=5)

# Per-class precision / recall / F1
print("====================== SVM Classification Report (BOW) ======================")
print("\n")
print(classification_report(y_true=y_train, y_pred=y_pred))

## Random Forest training and evaluation (BOW)

In [None]:
# Random forest with only 20 trees: fitting on the sparse BOW matrix is slow.
# random_state is fixed (42, as for the other models) so the report below is
# reproducible across runs; without it every run grows different trees.
rf_model = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=42)

# Perform 5-fold cross-validation and get predictions for each fold
y_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)

# Print classification report for each category
print("====================== Random Forest Classification Report (BOW) ======================")
print("\n")
print(classification_report(y_train, y_pred))

## SVD 

In [None]:
# Perform SVD: project the sparse BOW matrix onto 100 dense components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to keep the
# reduced features (and every score computed from them) reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_reduced = svd.fit_transform(X_train)  # fit on train only
X_test_reduced = svd.transform(X_test)        # reuse the train projection
print("SVD complete.")

## SVM Training and evaluation (SVD)

In [None]:
# Support Vector Machine (SVM) on the SVD-reduced features.
# Same LinearSVC configuration as before (fast, One-vs-Rest for multi-class).
svm_model = LinearSVC(max_iter=1000, random_state=42)

# Out-of-fold predictions from 5-fold cross-validation
y_pred = cross_val_predict(estimator=svm_model, X=X_train_reduced, y=y_train, cv=5)

# Per-class precision / recall / F1
print("====================== SVM Classification Report (SVD) ======================")
print("\n")
print(classification_report(y_true=y_train, y_pred=y_pred))

## Random Forest training and evaluation (SVD)

In [None]:
# Random forest with 100 trees; affordable here because the SVD features are
# far cheaper to cross-validate than the BOW matrix.
# random_state is fixed (42, as for the other models) so the report is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Perform 5-fold cross-validation and get predictions for each fold
y_pred = cross_val_predict(rf_model, X_train_reduced, y_train, cv=5)

# Print classification report for each category
print("====================== Random Forest Classification Report (SVD) ======================")
print("\n")
print(classification_report(y_train, y_pred))

## Beat the benchmark algorithm

In [None]:
# The title is cleaned with the same pipeline as the content and prepended to it;
# the combined text increases accuracy a little bit (0.3% increase).
for frame in (train_df, test_df):
    frame['Title'] = frame['Title'].str.lower()

pandarallel.initialize(progress_bar=True, nb_workers=8)

# special chars -> stopwords -> lemmatization, each step applied to train then test
for clean_step in (remove_special_chars, remove_stopwords, lemmatize_word):
    for frame in (train_df, test_df):
        frame['Title'] = frame['Title'].parallel_apply(clean_step)

# "<title> <content>" as a single text feature
for frame in (train_df, test_df):
    frame['Combined'] = frame['Title'] + ' ' + frame['Content']

print('Example of preprocessing train: ')
print(train_df.loc[0, 'Combined'])
print("\n")
print('Example of preprocessing test: ')
print(test_df.loc[0, 'Combined'])

In [None]:
y_train = train_df['Label']

# HashingVectorizer (default settings) replaces the CountVectorizer here because it
# gave the best results in these experiments. It hashes tokens straight to column
# indices, so it keeps no vocabulary.
hashing_vectorizer = HashingVectorizer()
X_train = hashing_vectorizer.fit_transform(train_df['Combined'])
X_test = hashing_vectorizer.transform(test_df['Combined'])

print('Preprocessing complete.')

In [None]:
# Logistic Regression baseline on the current X_train (the hashed features when the
# notebook is run in order). It scored worse than LinearSVC on the same features.
# The 'sag' solver is used because it was the fastest one tried (about 2 minutes, like 'saga').
lr_model = LogisticRegression(
    solver='sag',
    C=1.0,
    tol=1e-4,
    random_state=42,
    n_jobs=-1,
)

# Out-of-fold predictions from 5-fold cross-validation
y_pred = cross_val_predict(estimator=lr_model, X=X_train, y=y_train, cv=5)

# Per-class precision / recall / F1
print("====================== Logistic Regression Classification Report ======================")
print("\n")
print(classification_report(y_true=y_train, y_pred=y_pred))

In [None]:
# XGBoost model, trained on the integer Label_id column instead of the string labels.
y_label_ids = train_df['Label_id']

xgb_model = xgb.XGBClassifier(
    n_estimators=20,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
)

# Out-of-fold predictions from 5-fold cross-validation
y_pred = cross_val_predict(estimator=xgb_model, X=X_train, y=y_label_ids, cv=5)

# Per-class precision / recall / F1 (classes are reported by their numeric id)
print("====================== XGBoost Classification Report ======================")
print("\n")
print(classification_report(y_true=y_label_ids, y_pred=y_pred))

In [None]:
# Support Vector Machine (SVM) on the current X_train / y_train.
# LinearSVC: fast on large, roughly linearly separable data, One-vs-Rest for multi-class.

# Best model with 97% accuracy

svm_model = LinearSVC(max_iter=1000, random_state=42)

# Out-of-fold predictions from 5-fold cross-validation
y_pred = cross_val_predict(estimator=svm_model, X=X_train, y=y_train, cv=5)

# Per-class precision / recall / F1
print("====================== SVM Classification Report (Best model) ======================")
print("\n")
print(classification_report(y_true=y_train, y_pred=y_pred))

## Obtain predictions

In [None]:
# Refit the chosen SVM on the full training set (fit returns the estimator itself),
# then label the test set.
y_pred_test = svm_model.fit(X_train, y_train).predict(X_test)

print('Training complete.')

## Output file

In [None]:
import csv

# Write one "Id,Predicted" row per test article.
# Rows are produced by pairing the Id column with the predictions, so the file
# always matches the actual size of the test set (no hard-coded row count).
with open('testSet_categories.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    writer.writerow(["Id","Predicted"])
    writer.writerows(zip(test_df['Id'], y_pred_test))


print("CSV file writing complete.")

# Question 2

## Brute force k-NN

In [None]:
import time
import warnings
from sklearn.exceptions import DataConversionWarning

# Reduce the current (hash-vectorized) sparse matrices to 100 dense SVD components
# to make the brute-force neighbour search faster.
# The solver is randomized, so the seed is fixed: otherwise the neighbour lists used
# as ground truth further down would change from run to run.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_reduced = svd.fit_transform(X_train)
X_test_reduced = svd.transform(X_test)

# Turn the SVD arrays into 0/1 arrays for the jaccard metric
# (binarize with its default threshold maps values > 0 to 1 and the rest to 0).
# NOTE(review): this is Jaccard over the signs of SVD components, which is presumably
# not the same notion as Jaccard over token sets - confirm this is the intended
# ground truth before comparing it with MinHash results.
X_train_binary = binarize(X_train_reduced)
X_test_binary = binarize(X_test_reduced)

print("x_train_binary shape: ", X_train_binary.shape)

print('svd done.')

# "Build" phase of the exact search: fit a brute-force NearestNeighbors model
# (15 neighbours, jaccard metric, all cores) on the binarized training vectors, and time it.
start_time = time.time()
nbrs = NearestNeighbors(n_neighbors=15, algorithm='brute', metric='jaccard', n_jobs=-1).fit(X_train_binary)
build_time = time.time() - start_time
print(f"Build time: {build_time} seconds")

print('nbrs done.')

# Suppress DataConversionWarning for this block, to keep the output short
# (presumably emitted when the 0/1 float arrays are converted for the jaccard metric).
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DataConversionWarning)

    # Query phase: for every test vector, find its 15 nearest training vectors.
    # `indices` holds positions into the training set; later cells use it as the
    # reference neighbour list.
    start_time = time.time()
    distances, indices = nbrs.kneighbors(X_test_binary)
    query_time = time.time() - start_time
    print(f"Query time: {query_time} seconds")


print('k-nn done.')

In [None]:
# Spot-check the neighbour search: print test article #1 and its closest training article.
# In the author's run both were business related, which looks plausible.
print("Source Example:")
print(test_df['Combined'].iloc[1])

print('\n')
print("Neighbor:")
print(train_df['Combined'].iloc[indices[1, 0]])

# All 15 neighbour positions for that test article
print('\n')
print(indices[1])

## Min-hash LSH

In [None]:
# Tokenize and create sets of tokens

# (Re)initialise pandarallel with 8 workers and a progress bar for the
# parallel_apply calls in this cell.
pandarallel.initialize(nb_workers=8,progress_bar=True)

def tokenize_text(text):
    """Return the set of distinct tokens that ``word_tokenize`` finds in ``text``."""
    unique_tokens = set()
    unique_tokens.update(word_tokenize(text))
    return unique_tokens

# Build the token set of every combined (title + content) article in both splits.
train_df['Tokenized'] = train_df['Combined'].parallel_apply(tokenize_text)
test_df['Tokenized'] = test_df['Combined'].parallel_apply(tokenize_text)

# Each cell of 'Tokenized' is now a Python set of tokens; show the first one.
print(train_df['Tokenized'][0])

In [None]:
# Min Hash LSH

from datasketch import MinHash, MinHashLSH

minhash_objects = []

def create_minhash(tokens, num_perm=16):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens (e.g. a document's token set).
        num_perm: number of permutations of the signature. Defaults to 16, the
            value this cell uses; it must match the ``num_perm`` of the
            MinHashLSH index the signature is inserted into or queried against.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).

    NOTE(review): this name is redefined in later cells with other permutation
    counts; passing ``num_perm`` explicitly avoids depending on which cell ran last.
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash
    

# --- Build phase: sketch every training article and index the sketches ---
# Only training-side work is timed here, so the build/query split is comparable
# with the 32- and 64-permutation runs (which count test sketching as query time).
start_time = time.time()

minhash_objects = list(map(create_minhash, train_df['Tokenized']))

# Create MinHashLSH Index; keys are positions in train_df
lsh = MinHashLSH(threshold=0.8, num_perm=16)

for i, minhash in enumerate(minhash_objects):
    lsh.insert(i, minhash)

build_time = time.time() - start_time
print(f"Build time (lsh with 16 permutations, threshold >= 0.8): {build_time} seconds")

# --- Query phase: sketch every test article and look up its candidate neighbours ---
start_time = time.time()

# Create MinHash objects for test_df
test_minhash_objects = list(map(create_minhash, test_df['Tokenized']))

# One list of candidate train positions per test article
test_candidates = [lsh.query(minhash) for minhash in test_minhash_objects]

query_time = time.time() - start_time
print(f"Query time (lsh with 16 permutations, threshold >= 0.8): {query_time} seconds")

print('min hash done.')

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat = [item for sublist in test_candidates for item in sublist]
print(len(test_candidates_flat))
indices_flat = indices.flatten()

# Recall of LSH against the brute-force result, computed per query: for each test
# article, count how many of its own K=15 reference neighbours are among its own LSH
# candidates, then divide by the total number of reference neighbours.
# (Intersecting the pooled candidate set with the pooled neighbour set mixes up
# different queries, and dividing that by 15 can exceed 1.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates, indices)
)
print("Fraction of true k-most similar docs found by LSH (16 perms): ", true_neighbor_hits / indices.size)

In [None]:
# 32 permutations minhash LSH
minhash_objects_32 = []

def create_minhash(tokens, num_perm=32):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens.
        num_perm: number of permutations; defaults to 32 (the value used in this
            cell) and must match the MinHashLSH index the signature is used with.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash

# Build phase: sketch the training articles and index them (keys = train positions).
start_time = time.time()
minhash_objects_32 = [create_minhash(tokens) for tokens in train_df['Tokenized']]

lsh2 = MinHashLSH(threshold=0.8, num_perm=32)
for doc_pos, signature in enumerate(minhash_objects_32):
    lsh2.insert(doc_pos, signature)

build_time = time.time() - start_time
print(f"Build time (lsh with 32 permutations, threshold >= 0.8): {build_time} seconds")


# Query phase: sketch the test articles and fetch their candidate lists.
start_time = time.time()
test_minhash_objects_32 = [create_minhash(tokens) for tokens in test_df['Tokenized']]
test_candidates_32 = [lsh2.query(signature) for signature in test_minhash_objects_32]

# test position -> list of candidate train positions
dups_dict_32 = dict(enumerate(test_candidates_32))

query_time = time.time() - start_time
print(f"Query time (lsh with 32 permutations, threshold >= 0.8): {query_time} seconds")

# Size of the largest candidate list, and the total number of candidates returned.
max_length = max(map(len, test_candidates_32))
print("length of list of neighbors with most neighbors (32 permutations minhash LSH):", max_length)

sum_of_lengths = sum(map(len, test_candidates_32))
print("Sum of neighbors (32 permutations): ", sum_of_lengths)

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat_32 = [item for sublist in test_candidates_32 for item in sublist]
print(len(test_candidates_flat_32))
indices_flat = indices.flatten()

# Per-query recall: how many of each test article's K=15 reference neighbours are in
# that same article's LSH candidate list, summed and divided by the total number of
# reference neighbours. (A pooled set intersection divided by 15 is not a fraction.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates_32, indices)
)
print("Fraction of true k-most similar docs found by LSH (32 perms): ", true_neighbor_hits / indices.size)

In [None]:
# 64 permutations minhash LSH
minhash_objects_64 = []

def create_minhash(tokens, num_perm=64):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens.
        num_perm: number of permutations; defaults to 64 (the value used in this
            cell) and must match the MinHashLSH index the signature is used with.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash

# Build phase: sketch the training articles and index them (keys = train positions).
start_time = time.time()

minhash_objects_64 = [create_minhash(tokens) for tokens in train_df['Tokenized']]

lsh3 = MinHashLSH(threshold=0.8, num_perm=64)
for doc_pos, signature in enumerate(minhash_objects_64):
    lsh3.insert(doc_pos, signature)

build_time = time.time() - start_time
print(f"Build time (lsh with 64 permutations, threshold >= 0.8): {build_time} seconds")

# Query phase: sketch the test articles and fetch their candidate lists.
start_time = time.time()
test_minhash_objects_64 = [create_minhash(tokens) for tokens in test_df['Tokenized']]
test_candidates_64 = [lsh3.query(signature) for signature in test_minhash_objects_64]

query_time = time.time() - start_time
print(f"Query time (lsh with 64 permutations, threshold >= 0.8): {query_time} seconds")


# Size of the largest candidate list, and the total number of candidates returned.
max_length = max(map(len, test_candidates_64))
print("length of list of neighbors with most neighbors (64 permutations minhash LSH):", max_length)
sum_of_lengths = sum(map(len, test_candidates_64))
print("Sum of neighbors (64 permutations): ", sum_of_lengths)

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat_64 = [item for sublist in test_candidates_64 for item in sublist]
print(len(test_candidates_flat_64))
indices_flat = indices.flatten()

# Per-query recall: how many of each test article's K=15 reference neighbours are in
# that same article's LSH candidate list, summed and divided by the total number of
# reference neighbours. (A pooled set intersection divided by 15 is not a fraction.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates_64, indices)
)
print("Fraction of true k-most similar docs found by LSH (64 perms): ", true_neighbor_hits / indices.size)

In [None]:
# 16 permutations minhash LSH with >= 0.5 threshold
minhash_objects = []

def create_minhash(tokens, num_perm=16):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens.
        num_perm: number of permutations; defaults to 16 (the value used in this
            cell) and must match the MinHashLSH index the signature is used with.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash

# Build phase: sketch the training articles and index them (keys = train positions).
start_time = time.time()
minhash_objects = [create_minhash(tokens) for tokens in train_df['Tokenized']]

lsh = MinHashLSH(threshold=0.5, num_perm=16)
for doc_pos, signature in enumerate(minhash_objects):
    lsh.insert(doc_pos, signature)

build_time = time.time() - start_time
print(f"Build time (lsh with 16 permutations, threshold >= 0.5): {build_time} seconds")

# Query phase: sketch the test articles and fetch their candidate lists.
start_time = time.time()
test_minhash_objects = [create_minhash(tokens) for tokens in test_df['Tokenized']]
test_candidates = [lsh.query(signature) for signature in test_minhash_objects]

query_time = time.time() - start_time
print(f"Query time (lsh with 16 permutations, threshold >= 0.5): {query_time} seconds")

print('min hash done.')

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat = [item for sublist in test_candidates for item in sublist]
print(len(test_candidates_flat))
indices_flat = indices.flatten()

# Per-query recall: how many of each test article's K=15 reference neighbours are in
# that same article's LSH candidate list, summed and divided by the total number of
# reference neighbours. (A pooled set intersection divided by 15 is not a fraction.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates, indices)
)
print("Fraction of true k-most similar docs found by LSH (16 perms, >= 0.5 threshold): ", true_neighbor_hits / indices.size)

In [None]:
# 32 permutations minhash LSH with >= 0.5 threshold
minhash_objects_32 = []

def create_minhash(tokens, num_perm=32):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens.
        num_perm: number of permutations; defaults to 32 (the value used in this
            cell) and must match the MinHashLSH index the signature is used with.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash

# Build phase: sketch the training articles and index them (keys = train positions).
start_time = time.time()
minhash_objects_32 = [create_minhash(tokens) for tokens in train_df['Tokenized']]

lsh2 = MinHashLSH(threshold=0.5, num_perm=32)
for doc_pos, signature in enumerate(minhash_objects_32):
    lsh2.insert(doc_pos, signature)

build_time = time.time() - start_time
print(f"Build time (lsh with 32 permutations, threshold >= 0.5): {build_time} seconds")

# Query phase: sketch the test articles and fetch their candidate lists.
start_time = time.time()
test_minhash_objects_32 = [create_minhash(tokens) for tokens in test_df['Tokenized']]
test_candidates_32 = [lsh2.query(signature) for signature in test_minhash_objects_32]

query_time = time.time() - start_time
print(f"Query time (lsh with 32 permutations, threshold >= 0.5): {query_time} seconds")

print('min hash done.')

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat_32 = [item for sublist in test_candidates_32 for item in sublist]
print(len(test_candidates_flat_32))
indices_flat = indices.flatten()

# Per-query recall: how many of each test article's K=15 reference neighbours are in
# that same article's LSH candidate list, summed and divided by the total number of
# reference neighbours. (A pooled set intersection divided by 15 is not a fraction.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates_32, indices)
)
print("Fraction of true k-most similar docs found by LSH (32 perms, >= 0.5 threshold): ", true_neighbor_hits / indices.size)

In [None]:
# 64 permutations minhash LSH with >= 0.5 threshold
minhash_objects_64 = []

def create_minhash(tokens, num_perm=64):
    """Build a MinHash signature for one document.

    Args:
        tokens: iterable of string tokens.
        num_perm: number of permutations; defaults to 64 (the value used in this
            cell) and must match the MinHashLSH index the signature is used with.

    Returns:
        The MinHash object updated with every token (UTF-8 encoded).
    """
    minhash = MinHash(num_perm=num_perm)
    for token in tokens:
        minhash.update(token.encode('utf8'))
    return minhash

# Build phase: sketch the training articles and index them (keys = train positions).
start_time = time.time()
minhash_objects_64 = [create_minhash(tokens) for tokens in train_df['Tokenized']]

lsh3 = MinHashLSH(threshold=0.5, num_perm=64)
for doc_pos, signature in enumerate(minhash_objects_64):
    lsh3.insert(doc_pos, signature)

build_time = time.time() - start_time
print(f"Build time (lsh with 64 permutations, threshold >= 0.5): {build_time} seconds")

# Query phase: sketch the test articles and fetch their candidate lists.
start_time = time.time()
test_minhash_objects_64 = [create_minhash(tokens) for tokens in test_df['Tokenized']]
test_candidates_64 = [lsh3.query(signature) for signature in test_minhash_objects_64]

query_time = time.time() - start_time
print(f"Query time (lsh with 64 permutations, threshold >= 0.5): {query_time} seconds")

print('min hash done.')

In [None]:
# Total number of candidates returned by LSH over all test articles.
test_candidates_flat_64 = [item for sublist in test_candidates_64 for item in sublist]
print(len(test_candidates_flat_64))
indices_flat = indices.flatten()

# Per-query recall: how many of each test article's K=15 reference neighbours are in
# that same article's LSH candidate list, summed and divided by the total number of
# reference neighbours. (A pooled set intersection divided by 15 is not a fraction.)
true_neighbor_hits = sum(
    len(set(candidates).intersection(true_neighbors))
    for candidates, true_neighbors in zip(test_candidates_64, indices)
)
print("Fraction of true k-most similar docs found by LSH (64 perms, >= 0.5 threshold): ", true_neighbor_hits / indices.size)