In [341]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
import os

In [342]:
#load the saved model
trained_model = joblib.load('cluster_classification_model.pkl')

In [343]:
#load the dataFrame
dataFrame = pd.read_csv('document_clustered.csv',sep=',')
dataFrame.head()

Unnamed: 0,category,content,clusters,document
0,0,alfonsin allsuite arnott bonded briefed bulax ...,2,45
1,0,dependent cellulosa allentown bases ergenc err...,23,8858
2,0,cellulosa emphasizing edina celsius dependent ...,23,1242
3,0,edina cellulosa allentown emphasizing celsius ...,23,9002
4,0,downed ecologists edina emphasizing cellulosa ...,23,9026


In [344]:
#load the Dimensional dataFrame
low_dimension_dataFrame = pd.read_csv('../cluster_document_dimensional.csv',sep=',')
low_dimension_dataFrame.head()

Unnamed: 0,Xplots,Yplots,labels,titles
0,106.930675,-0.003941,2,45
1,106.632188,-0.080366,23,8858
2,106.67808,0.023572,23,1242
3,106.662317,-0.13145,23,9002
4,106.674321,-0.104139,23,9026


In [345]:
#preprocess the query
from nltk.corpus import stopwords
def preprocess_the_data(keyword):
    stopset = set(stopwords.words('english'))
    word_tokens = [words for words in nltk.word_tokenize(keyword)]
    filtered_word_tokens = []
    for query_words in word_tokens:
        if query_words not in stopset:
            filtered_word_tokens.append(query_words)
    
    return filtered_word_tokens

In [346]:
def predict_the_cluster_it_belongs_to(keywords_as_list): #keyword_as_list passed as list
    join_keywords = ' '.join(keywords_as_list)
    cluster = trained_model.predict(['r{}'.format(join_keywords)])
    return cluster

1. User Keywords
2. Predict the cluster
3. Find the cosine similarity between the cluster documents
4. Select Top 15 documents

In [347]:
def bring_clustered_documents(cluster):
    as_int = int(cluster)
    documents = dataFrame.loc[dataFrame['clusters'] == as_int]
    return documents

Doing pairwise cosine similarity measures the shape of the linear_kernel(X,Y) X and Y should be same
Using PCA Decomposition we decompose the matrix

In [348]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD

def find_cosine_similarites(user_keyword,cluster_dataFrame):
    #convert the data into Series
    df_content = cluster_dataFrame['content']
    content_list = []
    for contents in df_content:
        content_list.append(contents)
    tfidf = TfidfVectorizer(max_df=1.0,max_features=400,use_idf=True,ngram_range=(1,5))
    tfidf_vect = tfidf.fit_transform(content_list) #matrix shape is (n_documents,400)
    #tfidf for user query
    user_query_tfidf = tfidf.fit_transform(['r{}'.format(user_keyword)]) #matrix shape is (n_words,n_words)
    n_dimension = user_query_tfidf.shape[1]
    #TruncateSVD
    svd = TruncatedSVD(n_components=n_dimension)
    svd_vect = svd.fit_transform(tfidf_vect) #matrix shape is (n_documents,n_dimension) here n_dimension isequals n_words
    similarity = linear_kernel(user_query_tfidf,svd_vect).flatten()
    return similarity

In [349]:
#convert the document_related_to_keyword dataframe into list
document_list_to_append= []

def convert_into_list(document_dataFrame): #shape (?,4)
    for index,row in document_dataFrame.iterrows():
        document_list_to_append.append(row['document'])
    
    return document_list_to_append

In [353]:
user_keyword = input("Enter the keyword to search:")
preprocessed_user_keyword = preprocess_the_data(user_keyword) #user query in tokenized form
cluster_it_belongs_to = predict_the_cluster_it_belongs_to(preprocessed_user_keyword) #return the cluster it belongs to
documents_related_to_keyword = bring_clustered_documents(cluster_it_belongs_to) #return as dataFrame
documents_list = convert_into_list(documents_related_to_keyword) #convert into list
#find the cosine similarity between each document
cosine_simi_documents = find_cosine_similarites(user_keyword,documents_related_to_keyword)
#make sure the length of cosine_simi_documents and documents_list are same 
documents_and_its_similarity = dict(zip(documents_list,cosine_simi_documents))
print(cluster_it_belongs_to)

Enter the keyword to search:commence crisanti dylex esex amroas 
[6]


In [354]:
#pick top 15 documents from the dictionary
#using Counter to pick top values TimeComplexity will be O(nlogk) n elements K values
from collections import Counter
d = Counter(documents_and_its_similarity)
document_to_show = []
for docs,similarity in d.most_common(15):
    document_to_show.append(docs)

In [355]:
document_to_show

['0007145',
 '0007378',
 '0000012',
 '0008209',
 '0007159',
 '0004111',
 '0006020',
 '0001384',
 '0003977',
 '0006067',
 '0002251',
 '0002369',
 '0004563',
 '0001663',
 '0005378']