# Data 
The data we used was a collection of tweets from NBC Health.

In [114]:
# Load the required packages
import numpy as np
import pandas as pd
import re
import csv
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, svm
from sklearn.cluster import KMeans
from sklearn.model_selection import (
    train_test_split, learning_curve, StratifiedShuffleSplit, GridSearchCV,
    cross_val_score)

# Improve the readability of figures
sns.set_context('notebook', font_scale=1.4)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [115]:
df = pd.read_table('bbchealth.txt', header=None)
df.head()

Unnamed: 0,0
0,585978391360221184|Thu Apr 09 01:31:50 +0000 2...
1,585947808772960257|Wed Apr 08 23:30:18 +0000 2...
2,585947807816650752|Wed Apr 08 23:30:18 +0000 2...
3,585866060991078401|Wed Apr 08 18:05:28 +0000 2...
4,585794106170839041|Wed Apr 08 13:19:33 +0000 2...


In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3929 entries, 0 to 3928
Data columns (total 1 columns):
0    3929 non-null object
dtypes: object(1)
memory usage: 30.8+ KB


We can see that we have 3929 tweets in the collection. 

However, there's metadata before each of the tweet messages and a link to a bbchealth article at the end. So we will have to preprocess the data. 

# Normalization

In [117]:
example = "583659491310219264|Thu Apr 02 15:57:21 +0000 2015|Unsafe food 'growing global threat' http://bbc.in/1BREQDJ"

In [118]:
#removing stopwords

stop_words = nltk.corpus.stopwords.words('english')

In [119]:
# Remove word stems using a Porter stemmer
porter = nltk.PorterStemmer()


### Preprocessing text method 

In [120]:
def preprocess_text(messy_string):
    assert(type(messy_string) == str)
    cleaned = messy_string
    cleaned = re.sub(r'\d+\|.+\|', '', messy_string)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', '', cleaned)
    cleaned = re.sub(r'VIDEO:', '', cleaned)
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
    return ' '.join(
        porter.stem(term) 
        for term in cleaned.split()
        if term not in set(stop_words)
    )

In [121]:
preprocess_text(example)

'unsaf food grow global threat'

In [122]:
raw_text = df[0]
processed = raw_text.apply(preprocess_text)

# Feature extraction and Vectorizing

## TfidVectorizer

In [123]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000,
                                 use_idf=True, ngram_range=(1,3))

X = tfidf_vectorizer.fit_transform(processed) 

print(tfidf_matrix.shape)

from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)


(3929, 1)


# Clustering Algorithms

## K Means Clustering

In [124]:
true_k = 15
kmeans = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)


In [125]:
kmeans.fit(X)

print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 care
 amp
 home
 care home
 hospit
Cluster 1:
 vaccin
 ebola vaccin
 ebola
 vaccin trial
 trial
Cluster 2:
 nh
 nh staff
 staff
 plan
 strike
Cluster 3:
 boost
 exercis
 success
 ivf
 pill
Cluster 4:
 audio
 audio nh
 cancer
 drink
 amp
Cluster 5:
 health
 mental health
 mental
 health servic
 servic
Cluster 6:
 death
 drug
 hospit
 babi
 babi death
Cluster 7:
 link
 alcohol
 cancer
 drug
 dementia
Cluster 8:
 gp
 help
 patient
 hour
 fail
Cluster 9:
 ebola
 uk
 ebola nurs
 liberia
 nurs
Cluster 10:
 hospit
 doctor
 patient
 babi
 risk
Cluster 11:
 obes
 child obes
 child
 link
 cost
Cluster 12:
 cancer
 breast cancer
 breast
 cancer drug
 drug
Cluster 13:
 new
 nh
 new nh
 case
 check
Cluster 14:
 rate
 high
 concern
 death rate
 death


In [126]:
terms

['000',
 '10',
 '100',
 '12',
 '19',
 '1bn',
 '20',
 '2014',
 '2015',
 '3d',
 '40',
 '50',
 '5m',
 '80',
 '999',
 'abort',
 'abus',
 'access',
 'act',
 'action',
 'activ',
 'ad',
 'addict',
 'admit',
 'adult',
 'advert',
 'advic',
 'affect',
 'africa',
 'age',
 'ahead',
 'aid',
 'aim',
 'air',
 'air pollut',
 'alcohol',
 'alert',
 'allergi',
 'allow',
 'alzheim',
 'ambul',
 'amp',
 'amp es',
 'amp pressur',
 'amp target',
 'amp unit',
 'amp wait',
 'amp wait time',
 'anger',
 'announc',
 'answer',
 'anti',
 'antibiot',
 'anxieti',
 'apolog',
 'app',
 'appeal',
 'appoint',
 'approv',
 'area',
 'arm',
 'arrest',
 'ashya',
 'ashya king',
 'aspirin',
 'assist',
 'assist die',
 'assist suicid',
 'asthma',
 'astrazeneca',
 'attack',
 'audio',
 'audio nh',
 'autism',
 'avoid',
 'awar',
 'award',
 'babi',
 'babi born',
 'babi brain',
 'babi death',
 'bad',
 'ban',
 'bank',
 'bar',
 'basic',
 'battl',
 'beat',
 'becom',
 'bed',
 'begin',
 'belfast',
 'benefit',
 'best',
 'better',
 'bid',
 'big

## DBSCAN clustering

In [141]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)


print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

clabels = np.unique(labels)


Estimated number of clusters: 2
Silhouette Coefficient: -0.274
