In [1]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords
from sklearn.cluster import KMeans, DBSCAN
import numpy as np
from functools import reduce
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.collocations import *

# Загрузим датасет

In [2]:
df = pd.read_csv("../../data/Emails.csv")

# Посмотрим на него

In [3]:
df.head()

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...
3,4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\nU.S. Department of State\nCase N...
4,5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\nFriday, March 11,...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


Будем использовать поле ExtractedBodyText в предположении что всю "обвязку" из писем за нас удалили.
Письма содержат адреса электронной почты, названия документов, даты.

In [4]:
texts = df.ExtractedBodyText.fillna("")

In [5]:
texts[1].split("\n")

['B6',
 'Thursday, March 3, 2011 9:45 PM',
 'H: Latest How Syria is aiding Qaddafi and more... Sid',
 'hrc memo syria aiding libya 030311.docx; hrc memo syria aiding libya 030311.docx',
 'March 3, 2011',
 'For: Hillary']

In [6]:
# немного своих стоп-слов
my_stop_words = set(['would', 'know', 'also', 'tomorrow', 'want','think', 'today', 'sunday', 'huma',
                     'thursday', 'september'])
stop_words = set(stopwords.words("english")).union(my_stop_words)
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # уберем email-адреса
    text = re.sub(r"<[\w\.-]+@[\w\.-]+>", "EMAIL", text.lower())
    # заменим телефонные номера
    text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3])(:|-)[0-5][0-9]", "", text)
    # заменим время
    text = re.sub(r" [0-9]+\.?[0-9]*", " ", text)
    # уберем url'ы
    text = re.sub(r"https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)", "", text)
    # уберем цифры
    text = re.sub("\d", " ", text)
    text = re.sub(r"\W+", " ", text).strip()
    text = (lemmatizer.lemmatize(word) for word in text.split() if len(word) > 3)
    text = [word for word in text if not (word in stop_words)]
    tags = nltk.pos_tag(text)
    # оставим только существительные и глаголы
    text = " ".join([tag[0] for tag in tags if ("NN" in tag[1]) or ("VB" in tag[1])])
    return text

In [7]:
corpora = re.sub(r" +", " ", " ".join((preprocess(text) for text in texts if text))).strip().split()
letters_raw = (preprocess(text) for text in texts if text)
# уберем короткие сообщения
letters = np.array([letter for letter in letters_raw if len(letter.split()) > 3])

In [8]:
letters.shape

(3798,)

# Посмотрим на биграммы

In [9]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(corpora)
finder.apply_freq_filter(3)


In [10]:
finder.nbest(bigram_measures.pmi, 10)

[('abdul', 'ghani'),
 ('begin_of_the_skype_highlighting', 'end_of_the_skype_highlighting'),
 ('hong', 'kong'),
 ('jassim', 'jabr'),
 ('marwan', 'muasher'),
 ('ardebil', 'isfahan'),
 ('ashfaq', 'parvez'),
 ('markezi', 'hamedan'),
 ('drought', 'tolerant'),
 ('jabr', 'thani')]

In [11]:
Counter(zip(corpora, corpora[1:])).most_common(10)

[(('state', 'department'), 539),
 (('secretary', 'office'), 462),
 (('department', 'state'), 364),
 (('secretary', 'state'), 261),
 (('state', 'dept'), 224),
 (('redaction', 'foia'), 216),
 (('house', 'select'), 216),
 (('information', 'redaction'), 216),
 (('produced', 'house'), 216),
 (('agreement', 'information'), 216)]

# Сделаем самую простую кластеризацию Tf-idf + SVD + Kmeans

In [12]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=2, max_df=1000)
matrix = vectorizer.fit_transform(letters).toarray()

In [13]:
svd = TruncatedSVD(n_components=2000)
features = svd.fit_transform(matrix)

In [14]:
svd.explained_variance_ratio_.sum()

0.95584916536981757

In [15]:
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters)
preds = kmeans.fit_predict(features)

# Представим кластеры топом их слов по tf-idf

In [16]:
def cluster_representation(cluster, n):
    joined_text = " ".join(cluster)
    representation = vectorizer.transform([joined_text])
    feature_array = np.array(vectorizer.get_feature_names())
    
    tfidf_sorting = np.argsort(representation.toarray()).flatten()[::-1]

    top_n = feature_array[tfidf_sorting][:n]
    return top_n

In [17]:
for i in range(n_clusters):
    print("cluster size: ", letters[preds == i].shape[0])
    print(cluster_representation(letters[preds == i], n_clusters))

cluster size:  395
['said' 'president' 'state' 'party' 'year' 'government' 'obama' 'woman'
 'clinton' 'time']
cluster size:  252
['work' 'time' 'draft' 'make' 'thanks' 'week' 'need' 'schedule' 'thank'
 'year']
cluster size:  303
['call' 'sheet' 'talk' 'email' 'called' 'need' 'confirmed' 'oscar' 'secure'
 'asked']
cluster size:  128
['state' 'benghazi' 'redaction' 'foia' 'select' 'comm' 'dept' 'produced'
 'information' 'agreement']
cluster size:  77
['office' 'secretary' 'room' 'route' 'meeting' 'residence' 'department'
 'depart' 'state' 'conference']
cluster size:  1916
['state' 'meeting' 'talk' 'sent' 'make' 'going' 'week' 'speech' 'message'
 'need']
cluster size:  123
['sullivan' 'jacob' 'email' 'mill' 'cheryl' 'abedin' 'august' 'reines'
 'wednesday' 'anne']
cluster size:  167
['mill' 'cheryl' 'email' 'friday' 'april' 'saturday' 'tuesday' 'wednesday'
 'june' 'state']
cluster size:  363
['email' 'state' 'abedin' 'wednesday' 'monday' 'saturday' 'jilotylc'
 'print' 'sullivanjj' 'august'

In [18]:
n_clusters = 10
clustering = GaussianMixture(n_components=n_clusters)
clustering.fit(features)
preds = clustering.predict(features)

In [19]:
for i in range(n_clusters):
    print("cluster size: ", letters[preds == i].shape[0])
    print(cluster_representation(letters[preds == i], n_clusters))

cluster size:  198
['mill' 'cheryl' 'email' 'sullivan' 'jacob' 'april' 'friday' 'tuesday'
 'august' 'saturday']
cluster size:  463
['state' 'said' 'president' 'government' 'year' 'house' 'party' 'obama'
 'people' 'clinton']
cluster size:  191
['time' 'send' 'cent' 'work' 'morning' 'shift' 'week' 'talk' 'need'
 'schedule']
cluster size:  118
['thanks' 'hope' 'work' 'strobe' 'year' 'jake' 'help' 'working' 'talbott'
 'email']
cluster size:  143
['office' 'secretary' 'room' 'route' 'meeting' 'department' 'residence'
 'state' 'depart' 'conference']
cluster size:  96
['draft' 'edits' 'speech' 'comment' 'work' 'change' 'jake' 'getting'
 'version' 'send']
cluster size:  62
['hrod' 'jilotylc' 'clintonemail' 'state' 'print' 'email' 'monday' 'libya'
 'october' 'tuesday']
cluster size:  326
['call' 'sheet' 'talk' 'email' 'need' 'called' 'confirmed' 'point' 'work'
 'oscar']
cluster size:  364
['email' 'abedin' 'sullivan' 'jacob' 'state' 'wednesday' 'saturday'
 'august' 'monday' 'sullivanjj']
cluste

# Попробуем dbscan

In [20]:
clustering = DBSCAN()
preds = clustering.fit_predict(features)

In [21]:
clusters = list(np.unique(preds))
for i in clusters:
    print("cluster size: ", letters[preds == i].shape[0])
    print(cluster_representation(letters[preds == i], n_clusters))

cluster size:  3743
['state' 'said' 'president' 'time' 'secretary' 'office' 'year' 'house'
 'government' 'people']
cluster size:  10
['state' 'foia' 'redaction' 'select' 'benghazi' 'comm' 'dept' 'produced'
 'information' 'date']
cluster size:  7
['mail' 'mswashdcib' 'washdc' 'received' 'state' 'vance' 'message'
 'address' 'system' 'mapi']
cluster size:  9
['office' 'secretary' 'airport' 'meeting' 'route' 'depart' 'room'
 'residence' 'york' 'laguardia']
cluster size:  5
['laszczychj' 'toiv' 'toivnf' 'laszczych' 'joanne' 'traveling' 'state'
 'access' 'contact' 'reach']
cluster size:  11
['valmoro' 'lona' 'assistant' 'secretary' 'state' 'follow' 'course'
 'office' 'work' 'examine']
cluster size:  5
['laszczych' 'reach' 'please' 'joanne' 'emergency' 'traveling' 'toiv'
 'toivnf' 'operation' 'business']
cluster size:  8
['office' 'secretary' 'meeting' 'room' 'residence' 'department' 'route'
 'conference' 'outer' 'staff']


# Попробуем topic modeling

In [23]:
n_clusters = 10

vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2, max_df=1000)
matrix = vectorizer.fit_transform(letters).toarray()
clustering = LatentDirichletAllocation(n_topics=n_clusters, max_iter=30,
                                learning_offset=50.,
                                random_state=0)
preds = clustering.fit_transform(matrix)



Для tm будем оценивать интерпритируемость с помощью терминов для каждой темы

In [24]:
n_words = 10
for topic_idx, topic in enumerate(clustering.components_):
    print("topic num: ", topic_idx)
    print(" ".join([vectorizer.get_feature_names()[i] for i in topic.argsort()[-n_words:]]))

topic num:  0
jake monday talk sent sullivan state mill cheryl call email
topic num:  1
dept agreement case date benghazi secretary information house department state
topic num:  2
family need year health help working haiti people work woman
topic num:  3
source morale soccer mcgraw lot rebel troop gillibrand ambassador feedback
topic num:  4
depart time department residence conference route room meeting secretary office
topic num:  5
organization blackberry wing wireless wilson sent wilder district charles koch
topic num:  6
clinton policy obama party government year time state president said
topic num:  7
microcredit lois lamplighter quam strategist game move kyrgyzstan procurement egypt
topic num:  8
background flotilla palin death abortion book movement skousen image beck
topic num:  9
palin romney rating approval president democrat vote voter poll percent


# Оценка интерпритируемости кластеров

На мой взгляд, для оценки интерпретируемости кластеров нужно спрашивать про кластер интерпритируем ли он?
Потому что не понятно что такое интерпретируемость (с формальной точки зрения). Собственно для этого и нужны эксперты.

Неплохо также написать развернутый манифест, в котором будут примеры интерпритируемых и плохо интерпретируемых тем.

Пример манифеста:

Хорошо интерпретируемые кластеры:

слова по теме спорт: штанга, гриф, рывок, толчок.

плохо интерпретирумый: крокодил, галоша, сыр.


Непонятно как сравнивать кластеризации с разным числом кластеров, пусть это будет доля интерпретируемых кластеров к общему числу кластеров.

По результатам кластеризации видно, что большая часть писем скатывается в один большой кластер.

Привидем результаты работы ассесора:

kmeans: 0.6

em: 0.6

dbscan: 0.625

tm: 0.8