In [37]:
# 1. Cluster into 10 clusters
# 2. Calculate the centroid of each cluster
# 3. Use centroid of each cluster to create a 10 sentence summary
# 4. Explain choice of clustering and similarity

# Notes
# The data set contains several sentences that contain just 1 punctuation character, '?'
# We will ignore these sentences to improve clustering algorithm

import re
import numpy
from os import listdir
from os.path import join, abspath
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from modules.TextPreProcessor import removeShortDocs
from modules.TextPreProcessor import removeStopWords
from modules.TextPreProcessor import stemSentences

# define data set and parameters
raw_data = open('/home/iftekhar/AI-system/'
                'Summarization Models/Translated_text_mobicontrol_Page01_JP_to_En.txt', 'r').read()
ps = PorterStemmer()
nltk_stop_words = set(stopwords.words('english'))
cluster_count = 3


####################################
# PRE-PROCESSING
####################################

# split document into sentences and strip whitespace (delimeted by line)
sentences = raw_data.split('.')
sentences = map(lambda sentence: sentence.strip(), sentences)
sentences = list(sentences)

min_sentence_length = 30

# remove sentences that do not contribute meaning by assuming short sentences have less meaning
sentences = removeShortDocs(sentences, min_sentence_length)
sentences = list(sentences)

# remove stop words from all sentences
processedSentences = removeStopWords(sentences, nltk_stop_words)
#processedSentences

sentences = list(sentences)

# stem all tokens of all sentences
processedSentences = stemSentences(sentences, ps)
#processedSentences



####################################
# Apply K Means Clustering
####################################
	
# create tfidf matrix from the processed sentences
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processedSentences)

# cluster our tokenized sentences into 10 groups
kMeansCluster = KMeans(n_clusters=cluster_count)
kMeansCluster.fit(tfidf_matrix)
clusters = kMeansCluster.labels_.tolist()

In [None]:
sentences

In [None]:
processedSentences

In [None]:
clusters

In [38]:
####################################
# Organize Cluster Results
####################################

# Create new dictionary that tracks which cluster each sentence belongs to
# keeps copy of original sentences and stemmed sentences
# sentenceDictionary { idx: { text: String, stemmed: String, cluster: Number } }
sentenceDictionary = {}
for idx, sentence in enumerate(sentences):
	sentenceDictionary[idx] = {}
	sentenceDictionary[idx]['text'] = sentence
	sentenceDictionary[idx]['cluster'] = clusters[idx]
	sentenceDictionary[idx]['stemmed'] = processedSentences[idx]

sentenceDictionary

{0: {'text': 'The meaning of health has evolved over time',
  'cluster': 1,
  'stemmed': 'the mean of health ha evolv over time'},
 1: {'text': "In keeping with the biomedical perspective, early definitions of health focused on the theme of the body's ability to function; health was seen as a state of normal function that could be disrupted from time to time by disease",
  'cluster': 1,
  'stemmed': "In keep with the biomed perspect , earli definit of health focus on the theme of the bodi 's abil to function ; health wa seen as a state of normal function that could be disrupt from time to time by diseas"},
 2: {'text': 'An example of such a definition of health is: "a state characterized by anatomic, physiologic, and psychological integrity; ability to perform personally valued family, work, and community roles; ability to deal with physical, biological, psychological, and social stress"',
  'cluster': 0,
  'stemmed': "An exampl of such a definit of health is : `` a state character by 

In [39]:
# Create new dictionary that contains 1 entry for each cluster
# each key in dictionary will point to array of sentences, all of which belong to that cluster
# we attach the index to the sentenceDictionary object so we can recall the original sentence
clusterDictionary = {}
for key, sentence in sentenceDictionary.items():
	if sentence['cluster'] not in clusterDictionary:
		clusterDictionary[sentence['cluster']] = []
	clusterDictionary[sentence['cluster']].append(sentence['stemmed'])
	sentence['idx'] = len(clusterDictionary[sentence['cluster']]) - 1
clusterDictionary

{1: ['the mean of health ha evolv over time',
  "In keep with the biomed perspect , earli definit of health focus on the theme of the bodi 's abil to function ; health wa seen as a state of normal function that could be disrupt from time to time by diseas",
  "then in 1948 , in a radic departur from previou definit , the world health organ ( who ) propos a definit that aim higher : link health to well-b , in term of `` physic , mental , and social well-b , and not mere the absenc of diseas and infirm ''",
  'for a long time , it wa set asid as an impract ideal and most discuss of health return to the practic of the biomed model',
  'again , the who play a lead role when it foster the develop of the health promot movement in the 1980',
  '1984 who revis the definit of health defin it as `` the extent to which an individu or group is abl to realiz aspir and satisfi need and to chang or cope with the environ',
  'sinc the late 1970 , the feder healthi peopl initi ha been a visibl compon o

In [None]:
####################################
# Calculate Cosine Similarity Scores
####################################		

# For each cluster of sentences,
# Find the sentence with highet cosine similarity over all sentences in cluster
maxCosineScores = {}
for key, clusterSentences in clusterDictionary.items():
	maxCosineScores[key] = {}
	maxCosineScores[key]['score'] = 0
	tfidf_matrix = vectorizer.fit_transform(clusterSentences)
	cos_sim_matrix = cosine_similarity(tfidf_matrix)
	
	for idx, row in enumerate(cos_sim_matrix):
		sum = 0
		for col in row:
			sum += col
		if sum > maxCosineScores[key]['score']:
			maxCosineScores[key]['score'] = sum
			maxCosineScores[key]['idx'] = idx

maxCosineScores

In [46]:
maxCosineScores

{1: {'score': 3.775852511523733, 'idx': 15},
 0: {'score': 1.7668515260010889, 'idx': 2},
 2: {'score': 1.939703440324808, 'idx': 2}}

In [45]:
for idx, row in enumerate(cos_sim_matrix):
    print(idx, row)

0 [1.         0.23451747 0.11366216 0.12115711 0.13604642 0.05771266]
1 [0.23451747 1.         0.28956494 0.17495927 0.06941952 0.09586532]
2 [0.11366216 0.28956494 1.         0.28939366 0.09397814 0.15310455]
3 [0.12115711 0.17495927 0.28939366 1.         0.07940973 0.07374433]
4 [0.13604642 0.06941952 0.09397814 0.07940973 1.         0.02849438]
5 [0.05771266 0.09586532 0.15310455 0.07374433 0.02849438 1.        ]


In [70]:
b = "1,0.23451747,0.11366216,0.12115711,0.13604642,0.05771266,0.23451747,1,0.28956494,0.17495927,0.06941952,0.09586532,0.11366216,0.28956494,1,0.28939366,0.09397814,0.15310455,0.12115711,0.17495927,0.28939366,1,0.07940973,0.07374433,0.13604642,0.06941952,0.09397814,0.07940973,1,0.02849438,0.05771266,0.09586532,0.15310455,0.07374433,0.02849438,1"
b = b.split(",")
b 

['1',
 '0.23451747',
 '0.11366216',
 '0.12115711',
 '0.13604642',
 '0.05771266',
 '0.23451747',
 '1',
 '0.28956494',
 '0.17495927',
 '0.06941952',
 '0.09586532',
 '0.11366216',
 '0.28956494',
 '1',
 '0.28939366',
 '0.09397814',
 '0.15310455',
 '0.12115711',
 '0.17495927',
 '0.28939366',
 '1',
 '0.07940973',
 '0.07374433',
 '0.13604642',
 '0.06941952',
 '0.09397814',
 '0.07940973',
 '1',
 '0.02849438',
 '0.05771266',
 '0.09586532',
 '0.15310455',
 '0.07374433',
 '0.02849438',
 '1',
 '']

In [66]:
sum = 0
for i in :
    float(i)

ValueError: could not convert string to float: 

In [8]:
####################################
# Construct Document Summary
####################################	

# for every cluster's max cosine score,
# find the corresponding original sentence
resultIndices = []
i = 0
for key, value in maxCosineScores.items():
	cluster = key
	idx = value['idx']
	stemmedSentence = clusterDictionary[cluster][idx]
	# key corresponds to the sentences index of the original document
	# we will use this key to sort our results in order of original document
	for key, value in sentenceDictionary.items():
		if value['cluster'] == cluster and value['idx'] == idx:
			resultIndices.append(key)

resultIndices.sort()

# Iterate over sentences and construct summary output
result = ''
for idx in resultIndices:
	result += sentences[idx] + ' '
		

print(result)





There are some basic common threads, however, and the overarching theme is best summed up by this oft-quoted statement made by Arthur Samuel way back in 1959: “[Machine Learning is the] field of study that gives computers the ability to learn without being explicitly programmed ”
“A computer program is said to learn from experience E with respect to some task T and some performance measure P, if its performance on T, as measured by P, improves with experience E All of these problems are excellent targets for an ML project, and in fact ML has been applied to each of them with great success 


In [7]:
vectorizer.get_feature_names()

['all',
 'an',
 'and',
 'appli',
 'are',
 'been',
 'each',
 'excel',
 'fact',
 'for',
 'great',
 'ha',
 'in',
 'ml',
 'of',
 'problem',
 'project',
 'success',
 'target',
 'them',
 'these',
 'to',
 'with']

In [9]:
import pandas as pd
centroids = pd.DataFrame(kMeansCluster.cluster_centers_)
centroids

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,167
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226172,0.226172,0.106015,...,0.0,0.0,0.0,0.0,0.0,0.13095,0.0,0.0,0.0,0.0
1,0.032392,0.0,0.032392,0.0,0.054512,0.04062,0.034485,0.0,0.0,0.083011,...,0.0,0.13067,0.022843,0.022843,0.063534,0.013226,0.032392,0.040829,0.038587,0.0
2,0.0,0.054158,0.0,0.042043,0.0,0.0,0.03551,0.0,0.0,0.106254,...,0.054158,0.0,0.0,0.0,0.030875,0.175433,0.0,0.0,0.07102,0.042043


In [24]:
from scipy.sparse import csr_matrix

def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

save_sparse_csr('laws_tf_idf.npz', tfidf_matrix)

In [25]:
from sklearn.neighbors import NearestNeighbors

model_tf_idf = NearestNeighbors(metric='cosine', algorithm='brute')
model_tf_idf.fit(tfidf_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
def print_nearest_neighbors(query_tf_idf, full_bill_dictionary, knn_model, k):
    """
    Inputs: a query tf_idf vector, the dictionary of bills, the knn model, and the number of neighbors
    Prints the k nearest neighbors
    """
    distances, indices = knn_model.kneighbors(query_tf_idf, n_neighbors = k+1)
    nearest_neighbors = [full_bill_dictionary.keys()[x] for x in indices.flatten()]
    
    for bill in xrange(len(nearest_neighbors)):
        if bill == 0:
            print 'Query Law: {0}\n'.format(nearest_neighbors[bill])
        else:
            print '{0}: {1}\n'.format(bill, nearest_neighbors[bill])

In [None]:
bill_id = np.random.choice(tfidf_matrix.shape[0])
print_nearest_neighbors(tfidf_matrix[bill_id], clean_bills_dictionary, model_tf_idf, k=5)