In [9]:
from __future__ import unicode_literals
import numpy as np
import codecs
import re
import operator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.cluster import DBSCAN
import warnings
warnings.filterwarnings('ignore')

In [10]:
datasets_path = 'C:/Users/OptimusPrime/Desktop/Studia/NLP1/Natural-Language-Processing-1/Week 3/Data'
def load_text():
    with codecs.open(datasets_path + '\\lines.txt', 'r', 'utf-8') as f:
        raw_text = f.read().lower()
        raw_lines = raw_text.split('\n')
        text = re.sub(r"[\\\^\$\.\|\?\*\+\(\)\[\]\{0-9\"!\-_=%<>,'/;:@]", ' ', raw_text) #preprocessing
        text = re.sub(' +', ' ', text)
        text_lines = text.split('\n')
        return np.array(text_lines)
text_lines = load_text()

In [11]:
cv = CountVectorizer(ngram_range=(1,1), analyzer='word')
freq_matrix = cv.fit_transform(text_lines)
words = cv.get_feature_names()
occurence_number = freq_matrix.sum(axis=0).A1

occurence_number, words = (list(t) for t in zip(*sorted(zip(occurence_number, words), reverse=True)))

stop_list = [t[1] for t in zip(occurence_number, words) if t[0] > 110]

filtered_lines = list()
for line in text_lines: 
    line_words = line.split()
    preprocessed_words = [word for word in line_words if word not in stop_list]
    filtered_lines.append(' '.join(preprocessed_words))

In [12]:
print("Stop list:")
print(stop_list)

Stop list:
['ltd', 'tel', 'fax', 'china', 'co', 'road', 'no', 'ul', 'poland', 'logistics', 'sp', 'russia', 'of', 'petersburg', 'moscow', 'st', 'str', 'gdynia', 'building', 'shanghai', 'limited', 'pl', 'office', 'finland', 'rd', 'international', 'ningbo', 'llc', 'oy', 'and', 'ooo', 'city', 'shenzhen', 'street', 'room', 'district', 'as', 'floor', 'th', 'branch', 'forwarding', 'order', 'saint', 'industrial', 'warszawa', 'zhejiang', 'global', 'to', 'rm', 'hong', 'eori', 'kong', 'bldg', 'tower', 'lit', 'agent', 'company', 'polska', 'phone', 'fi', 'shipping', 'air', 'east', 'trade', 'taiwan', 'schenker', 'import', 'xiamen', 'trading', 'plaza', 'on', 'zone', 'attn', 'sea', 'line', 'the', 'box', 'town', 'cargo', 'behalf', 'south', 'inn', 'group', 'qingdao', 'business', 'damco', 'centre', 'park', 'kuehne', 'guangzhou', 'zip', 'vantaa', 'nagel', 'jiangsu', 'code', 'thailand', 'helsinki', 'freight', 'center', 'russian', 'panalpina', 'kotka', 'federation', 'province', 'for', 'com', 'mail', 'export

In [13]:
frequency_matrix = freq_matrix.toarray()
# to calculate distances cosine and dice metrics are used
cosine_similarity_matrix = cosine_similarity(frequency_matrix)
dice_similarity_matrix = pairwise_distances(frequency_matrix, metric='dice', n_jobs=-1)

In [14]:
dice_similarity_matrix[-1][-1] = 1.0 #no idea why but very last cell in matrix is NAN and must be exchanged with 1.0

companies_codes = range(len(dice_similarity_matrix))

# DBSCANN as a clustering algorithm, because it doesn't require number of clusters
dbscn_dice = DBSCAN(eps=0.3, min_samples=0, metric='precomputed', n_jobs=-1)
dbscn_cosine = DBSCAN(eps=0.3, min_samples=0, metric='precomputed', n_jobs=-1)

dbscn_dice.fit(dice_similarity_matrix)
dbscn_cosine.fit(1.0-cosine_similarity_matrix) #1.0-values because cosine metric gives 1.0 for the identical samples

DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='precomputed',
    metric_params=None, min_samples=0, n_jobs=-1, p=None)

In [15]:
# Ground truth labels aren't known, so the silhouette and davies-bouldin metrics are computed (they don't need true labels) 
silhouette_dice = metrics.silhouette_score(dice_similarity_matrix, dbscn_dice.labels_, metric='precomputed')
silhouette_cosine = metrics.silhouette_score(cosine_similarity_matrix, dbscn_cosine.labels_, metric='precomputed')
davies_bouldin_dice = metrics.davies_bouldin_score(frequency_matrix, dbscn_dice.labels_)
davies_bouldin_cosine = metrics.davies_bouldin_score(frequency_matrix, dbscn_cosine.labels_)

In [16]:
print("Silhouette score for dice metric:")
print(silhouette_dice)
print("Silhouette score for cosine metric:")
print(silhouette_cosine)
print("Davies-Bouldin score for dice metric:")
print(davies_bouldin_dice)
print("Davies-Bouldin score for cosine metric:")
print(davies_bouldin_cosine)
#Both metrics indicate that dice metric to calculate distances performs better
#(higher Silhouette score and lower Davies-Bouldin score)

Silhouette score for dice metric:
0.32703129906893974
Silhouette score for cosine metric:
-0.6091528436018957
Davies-Bouldin score for dice metric:
0.7294880941984994
Davies-Bouldin score for cosine metric:
0.7718271731193386
