# Пример: кластеризация текстов простыми методами

In [149]:
import itertools

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

from IPython.display import Image, SVG

%matplotlib inline

## Датасет

In [150]:
from sklearn.datasets import fetch_20newsgroups

In [151]:
train_all = fetch_20newsgroups(subset='train')
print (train_all.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [152]:
simple_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'soc.religion.christian', 'rec.sport.hockey'])

Пример текста

In [153]:
print(simple_dataset.data[0])

From: erik@cheshire.oxy.edu (Erik Adams)
Subject: HELP!!  My Macintosh "luggable" has lines on its screen!
Organization: Occidental College, Los Angeles, CA 90041 USA.
Distribution: comp
Lines: 20

Okay, I don't use it very much, but I would like for it to keep working
correctly, at least as long as Apple continues to make System software
that will run on it, if slowly :-)

Here is the problem:  When the screen is tilted too far back, vertical
lines appear on the screen.  They are every 10 pixels or so, and seem
to be affected somewhat by opening windows and pulling down menus.
It looks to a semi-technical person like there is a loose connection
between the screen and the rest of the computer.

I am open to suggestions that do not involve buying a new computer,
or taking this one to the shop.  I would also like to not have
to buy one of Larry Pina's books.  I like Larry, but I'm not sure
I feel strongly enough about the computer to buy a service manual
for it.

On a related note:  what

### Признаки

In [154]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
matrix = vectorizer.fit_transform(simple_dataset.data)
matrix.shape

(1777, 3767)

## Агломеративная кластеризация

In [155]:
# Import from the public ``sklearn.cluster`` package: the private
# ``sklearn.cluster.hierarchical`` module path was deprecated and later removed
# from scikit-learn, so the old import fails on current releases.
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering on cosine distance, 3 clusters
# (one per newsgroup selected for ``simple_dataset``).
# NOTE(review): newer scikit-learn releases rename ``affinity`` to ``metric``;
# switch the keyword if the installed version rejects ``affinity``.
model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
# The TF-IDF matrix is converted to a dense array before fitting.
preds = model.fit_predict(matrix.toarray())

In [156]:
print(list(preds) [:10])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [157]:
simple_dataset.target

array([0, 0, 1, ..., 0, 1, 2])

In [158]:
preds

array([0, 0, 0, ..., 0, 2, 1])

In [159]:
# Assessment: cluster ids are arbitrary, so relabel them with one hand-picked
# cluster-id -> class-id mapping and compare the result with the true targets.
mapping = {2 : 1, 1: 2, 0: 0}
mapped_preds = [mapping[pred] for pred in preds]
# print (float(sum(mapped_preds != simple_dataset.target)) / len(simple_dataset.target))
print(accuracy_score(mapped_preds, simple_dataset.target))

0.3590320765334834


In [160]:
def validate_with_mappings(preds, target):
    """Return the best accuracy over all one-to-one cluster -> class relabelings.

    Cluster ids produced by a clustering algorithm are arbitrary, so they
    cannot be compared with the true labels directly. Every one-to-one
    assignment of cluster ids to class labels is tried and the highest
    resulting accuracy is returned.

    Any number of clusters and any label values (ints, strings, ...) are
    supported. For cluster ids and targets drawn from ``{0, 1, 2}`` the result
    equals the maximum accuracy over the six permutations of ``[0, 1, 2]``.

    Args:
        preds: 1-D sequence of predicted cluster ids.
        target: 1-D sequence of true class labels, same length as ``preds``.

    Returns:
        The best achievable accuracy, a float in ``[0, 1]``.

    Raises:
        ValueError: if the inputs differ in length or are empty.

    Note:
        The search is exhaustive, so its cost grows factorially with the
        number of clusters; it is meant for a handful of clusters only.
    """
    preds = np.ravel(np.asarray(preds))
    target = np.ravel(np.asarray(target))
    if preds.shape != target.shape:
        raise ValueError(
            "preds and target must have the same length, got "
            f"{preds.size} and {target.size}"
        )
    if preds.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    # Encode clusters and classes as 0..k-1 so they can index a count table.
    clusters, cluster_idx = np.unique(preds, return_inverse=True)
    classes, class_idx = np.unique(target, return_inverse=True)

    # counts[i, j] = number of samples in cluster i whose true class is j.
    # When there are more clusters than classes the table is padded with
    # all-zero columns: a cluster assigned to a padding column matches nothing.
    n_columns = max(len(clusters), len(classes))
    counts = np.zeros((len(clusters), n_columns), dtype=np.int64)
    np.add.at(counts, (cluster_idx, class_idx), 1)

    # Each assignment gives every cluster a distinct column; its number of
    # correctly labelled samples is read straight from the count table, so
    # the samples themselves are not relabelled for each assignment.
    rows = np.arange(len(clusters))
    best_hits = max(
        counts[rows, list(assignment)].sum()
        for assignment in itertools.permutations(range(n_columns), len(clusters))
    )
    return best_hits / preds.size
    
#validate_with_mappings(preds, simple_dataset.target)

## KMeans

In [161]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (simple_dataset.target)
validate_with_mappings(preds, simple_dataset.target)

[0 0 2 ... 0 2 1]
[0 0 1 ... 0 1 2]


0.9527293190770962

In [162]:
# Compare with a supervised baseline: logistic regression (a classifier, not
# linear regression) trained on the same TF-IDF features, reporting the mean
# cross-validation score.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
print (cross_val_score(clf, matrix, simple_dataset.target).mean())

0.9853603185880773




**Вопрос:** очень высокая точность кластеризации текстов, очень близкая к точности Supervised алгоритма. Почему?

## Более сложная выборка

In [180]:
noteasy_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'comp.graphics'])
matrix = vectorizer.fit_transform(noteasy_dataset.data)

In [181]:
for t, l in zip(noteasy_dataset.data[:5], noteasy_dataset.target[:5]):
    print(l)
    print(t)
    print('\n\n-------\n\n')

2
From: rogntorb@idt.unit.no (Torbj|rn Rognes)
Subject: Adding int. hard disk drive to IIcx
Keywords: Mac IIcx, internal, hard disk drive, SCSI
Reply-To: rogntorb@idt.unit.no (Torbj|rn Rognes)
Organization: Div. of CS & Telematics, Norwegian Institute of Technology
Lines: 32

I haven't seen much info about how to add an extra internal disk to a
mac. We would like to try it, and I wonder if someone had some good
advice.

We have a Mac IIcx with the original internal Quantum 40MB hard disk,
and an unusable floppy drive. We also have a new spare Connor 40MB
disk which we would like to use. The idea is to replace the broken
floppy drive with the new hard disk, but there seems to be some
problems:

The internal SCSI cable and power cable inside the cx has only
connectors for one single hard disk drive.

If I made a ribbon cable and a power cable with three connectors each
(1 for motherboard, 1 for each of the 2 disks), would it work?

Is the IIcx able to supply the extra power to the extra 

In [182]:
model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (noteasy_dataset.target)
validate_with_mappings(preds, noteasy_dataset.target)

[0 1 2 ... 0 2 0]
[2 1 1 ... 2 0 2]


0.753565316600114

In [183]:
clf = LogisticRegression()
print (cross_val_score(clf, matrix, noteasy_dataset.target).mean())

0.917279226713189




## SVD + KMeans

In [184]:
from sklearn.decomposition import TruncatedSVD

model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

0.793496862521392

In [206]:
model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=200, random_state=321)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

0.7347404449515117


Подозрительно хорошо. Есть идеи, почему?

## Как добиваться разбиения на нужные вам кластеры

In [186]:
# Seed vocabulary for the keyword classifier: lower-case word -> class id.
# NOTE(review): ids 0/1/2 presumably correspond to comp.graphics,
# comp.os.ms-windows.misc and comp.sys.mac.hardware in noteasy_dataset.target;
# verify against noteasy_dataset.target_names.
words_mapping = {
    word: label
    for label, words in (
        (0, ('graphics', 'pixel')),
        (1, ('win', 'windows')),
        (2, ('mac', 'ios', 'macintosh')),
    )
    for word in words
}

In [187]:
from collections import defaultdict
def dummy_clf(input_text):
    """Classify a text by counting seed-keyword hits per class.

    The text is lower-cased and split on whitespace; every token found in
    ``words_mapping`` adds one vote to the class it maps to.

    Args:
        input_text: raw text to classify.

    Returns:
        The class id with the most votes, or -1 when no token is a known
        keyword. On a tie, the class that received its first vote earliest
        in the text wins.
    """
    votes = defaultdict(int)
    for token in input_text.lower().split():
        label = words_mapping.get(token, -1)
        if label == -1:
            continue
        votes[label] += 1

    if votes:
        # Iterating the dict follows insertion order, so ``max`` resolves
        # ties in favour of the class that was voted for first.
        return max(votes, key=votes.get)
    return -1

In [188]:
dummy_clf('computer graphics pixel mac')

0

In [189]:
dummy_clf('pixel mac')

0

In [190]:
dummy_clf('')

-1

In [191]:
# Weakly label every document of ``noteasy_dataset`` with the keyword
# classifier. -1 marks documents without any keyword hit, which is also the
# "unlabeled" marker used by scikit-learn's semi-supervised estimators.
# The original sized the array with ``len(y)``, but ``y`` is not defined in
# any visible cell; the labels are now built straight from the dataset.
# Float dtype is kept to match the original ``np.full(..., -1.)`` array.
labels = np.array(
    [dummy_clf(text) for text in noteasy_dataset.data],
    dtype=float,
)

In [192]:
unique, counts = np.unique(labels, return_counts=True)
dict(zip(unique, counts))

{-1.0: 947, 0.0: 186, 1.0: 395, 2.0: 225}

In [193]:
unlabeled_indices = [k for k, l in enumerate(labels) if l == -1]

In [194]:
# Both names were used here without being imported in any visible cell.
from sklearn.metrics import confusion_matrix
from sklearn.semi_supervised import LabelPropagation

# Spread the keyword-derived labels to the unlabeled (-1) documents.
# NOTE(review): per the scikit-learn docs ``n_neighbors`` only applies to
# kernel='knn'; it is kept so the call matches the original.
lp_model = LabelPropagation(kernel='rbf', n_neighbors=10, max_iter=200)
lp_model.fit(matrix.toarray(), labels)

# Evaluate only on documents that had no keyword label, comparing the labels
# inferred for them with the true newsgroup ids. The original indexed ``y``,
# which is not defined in any visible cell; the dataset targets are used
# directly instead.
predicted_labels = lp_model.transduction_[unlabeled_indices]
true_labels = noteasy_dataset.target[unlabeled_indices]

cm = confusion_matrix(true_labels, predicted_labels,
                      labels=lp_model.classes_)



In [195]:
print(cm)

[[185 124  60]
 [ 18 180  13]
 [ 38  87 242]]


In [196]:
accuracy_score(true_labels, predicted_labels)

0.6409714889123548