In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import glob
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import random

In [2]:
# Read every .txt file under './individual blog data/' into memory as raw text.
files = glob.glob('./individual blog data/*.txt')
file_list = []
for names in files:
    # 'rU' is universal-newline text mode; the context manager closes each
    # handle promptly, even if read() raises.
    with open(names, 'rU') as f:
        file_list.append(f.read())

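# Magical Lexicon

Hard-coded vocabulary of stemmed keywords. When the blog posts are cleaned below, only words whose stem appears in this list are kept.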

In [4]:
# Stemmed keywords that text_process keeps; every other token is discarded.
# A few entries occur twice (e.g. 'green', 'detect', 'survivor', 'smoke', 'tan');
# they are retained so the list is element-for-element unchanged.
lexicon = [
    'side', 'effect', 'breast', 'studi', 'chemotherapi', 'hormon', 'lymph', 'love', 'famili', 'friend',
    'treatment', 'trial', 'clinic', 'surgeri', 'therapi', 'research', 'eat', 'meat', 'diet', 'food', 'drink',
    'oil', 'alcohol', 'veget', 'tumor', 'immun', 'system', 'mice', 'kill', 'stem', 'cell', 'hair', 'dye',
    'blond', 'brown', 'bald', 'wig', 'shave', 'color', 'ovarian', 'bladder', 'detect', 'oral', 'pink', 'young', 'survivor',
    'mammogram', 'skin', 'sun', 'melanoma', 'sunscreen', 'tan', 'mole', 'vitamin', 'exposur', 'dermatologist',
    'children', 'lung', 'smoke', 'prevent', 'vaccin', 'fruit', 'salad', 'pepper', 'fresh', 'green', 'bodi', 'consumpt',
    'fat', 'red', 'sugar', 'fish', 'target', 'cervic', 'hpv', 'sexual', 'women', 'viru', 'mom', 'dad', 'life', 'fear',
    'prostat', 'men', 'surviv', 'attitud', 'radiat', 'exercis', 'tamoxifen', 'aromatas', 'pill', 'treat', 'water', 'tea', 'green',
    'juic', 'tomato', 'tablespoon', 'oliv', 'blood', 'head', 'hat', 'curli', 'detect', 'symptom', 'earli', 'diseas', 'colorect', 'risk',
    'survivor', 'gardasil', 'infect', 'girl', 'pap', 'protect', 'human', 'mandatori', 'precancer', 'lump', 'ultrasound', 'calori',
    'cook', 'live', 'awar', 'urin', 'kid', 'psa', 'gay', 'protein', 'jude', 'st', 'hospit', 'donat', 'childhood', 'organ', 'lifestyl',
    'smoke', 'smoker', 'drug', 'menopaus', 'transplant', 'marrow', 'bone', 'race', 'leukemia', 'zometa', 'vegetarian', 'node', 'tan',
    'ray', 'sunburn', 'damag', 'shade', 'uvb', 'herceptin', 'gallbladd', 'brain', 'pain', 'spf', 'biopsi', 'testicular', 'heal', 'level',
    'mouth', 'gum', 'death', 'tobacco', 'tongu', 'tooth', 'laryng', 'dentist', 'vessel', 'angiogenesi', 'glioblastoma',
]

In [5]:
from nltk.stem.porter import PorterStemmer

def text_process(s):
    """Reduce one raw document to a space-separated string of lexicon stems.

    Steps: delete ASCII digits and punctuation, lowercase, tokenize with
    ``nltk.word_tokenize``, Porter-stem every token and keep only the stems
    that occur in the module-level ``lexicon``.

    Parameters
    ----------
    s : str or bytes
        Raw document text. Byte strings (what ``open(...).read()`` yields on
        Python 2) are decoded as UTF-8, silently dropping undecodable bytes.

    Returns
    -------
    str
        The kept stems in document order (repeats preserved), joined by single
        spaces; an empty string when no stem is in the lexicon.
    """
    # Decode once up front so the rest of the pipeline operates on text under
    # both Python 2 and Python 3. The two-argument ``translate(None, chars)``
    # and per-token ``.decode()`` only exist for Python 2 byte strings.
    if isinstance(s, bytes):
        s = s.decode('utf-8', 'ignore')

    # Removing digits/punctuation does not depend on case, so a single
    # translate pass (ordinal -> None means "delete") covers both removals.
    drop_table = dict.fromkeys(map(ord, string.digits + string.punctuation))
    s = s.translate(drop_table).lower()

    stemmer = PorterStemmer()
    vocabulary = set(lexicon)  # O(1) membership instead of scanning the list per token
    stems = (stemmer.stem(tok) for tok in nltk.word_tokenize(s))
    return ' '.join(stem for stem in stems if stem in vocabulary)

In [6]:
# Run every raw document through text_process, preserving corpus order.
file_cleaned = [text_process(raw_doc) for raw_doc in file_list]

In [None]:
# using tfidf
# Build unigram TF-IDF features from the lexicon-filtered documents.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),  # single words only
    use_idf=True,
)
tfidf = tfidf_vectorizer.fit_transform(file_cleaned)

In [9]:
# Dimensions of the TF-IDF matrix produced by fit_transform above.
tfidf.shape

(1594, 169)

# Try Different Clusters
* 10 clusters
* 11 clusters
* 12 clusters

In [32]:
# Cluster the TF-IDF vectors and list the highest-weighted terms of each centroid.
N_CLUSTERS = 10   # single source of truth for the cluster count (change here to try 11 or 12)
TOP_TERMS = 30    # number of terms printed per cluster

# Fixed seed: the label ids produced here are stored in `frame` and reused
# further down, so the clustering has to be repeatable across runs.
km = KMeans(n_clusters=N_CLUSTERS, random_state=0)
km.fit(tfidf)

# For every centroid, feature indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(N_CLUSTERS):
    top = ['%s;' % terms[ind] for ind in order_centroids[i, :TOP_TERMS]]
    # One pre-joined string passed to print(...) yields the same output line
    # under both Python 2 and Python 3.
    print(' '.join(['Clusert %d' % i] + top))

Wall time: 1.18 s
Clusert 0 eat; diet; food; exercis; prostat; vitamin; fruit; bodi; drink; risk; prevent; water; studi; meat; veget; fat; red; sugar; oil; juic; calori; lifestyl; cook; breast; research; fresh; consumpt; men; smoke; tablespoon;
Clusert 1 vaccin; cervic; hpv; pap; women; viru; gardasil; girl; sexual; infect; immun; human; protect; prevent; precancer; tumor; drug; effect; cell; studi; mandatori; diseas; trial; oral; young; research; system; clinic; men; friend;
Clusert 2 treatment; chemotherapi; surgeri; surviv; radiat; hospit; treat; tumor; clinic; therapi; research; studi; node; trial; breast; diseas; lymph; effect; brain; risk; side; drug; cell; life; system; live; blood; bodi; survivor; women;
Clusert 3 cell; research; studi; lung; diseas; smoke; tumor; risk; blood; bodi; death; effect; treatment; system; level; men; detect; stem; bone; organ; breast; transplant; earli; ovarian; prevent; prostat; human; live; women; colorect;
Clusert 4 life; live; famili; love; frien

In [25]:
# Pair each cleaned document with its k-means label, then tally cluster sizes.
clusters = km.labels_.tolist()
blogs = dict(blog=file_cleaned, cluster=clusters)
frame = pd.DataFrame(data=blogs)
frame.cluster.value_counts()

2    354
3    346
0    219
5    191
8    140
4    109
9     92
1     60
7     49
6     34
Name: cluster, dtype: int64

In [57]:
# NOTE(review): the saved output below lists cluster_1..cluster_5 columns that
# no visible cell creates — presumably left over from cells that were run and
# later removed; confirm before relying on them.
frame.head()

Unnamed: 0,blog,cluster,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
0,fear lifestyl studi protect research live life...,3,4,0,8,0,9
1,bodi prostat breast death prevent lifestyl liv...,3,4,0,8,0,10
2,treatment awar children women breast sexual ov...,2,2,5,0,4,0
3,hospit life children love surviv hospit life c...,0,6,5,0,4,0
4,side effect side effect side effect life side ...,4,0,4,6,5,1





# Test with Classification

In [43]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
import random

In [40]:
# Hold out 30% of the documents for testing; the remainder is the training set.
n_docs = tfidf.shape[0]
n_test = int(0.3 * n_docs)   # 478 when there are 1594 documents, as in the saved run

# A private, seeded generator keeps the split reproducible without touching
# the global `random` state.
test_ind = random.Random(0).sample(range(n_docs), n_test)
held_out = set(test_ind)     # O(1) membership instead of scanning the list per row
train_ind = [i for i in range(n_docs) if i not in held_out]
train = tfidf[train_ind, :]
test = tfidf[test_ind, :]

In [45]:
# Check how well a supervised model can recover the k-means labels from the
# same TF-IDF features: fit on the training rows, evaluate on the held-out rows.
train_y = frame['cluster'].iloc[train_ind]
test_y = frame['cluster'].iloc[test_ind]

# Fixed seed so the confusion matrix is identical on every run.
# NOTE(review): class_weight='auto' was deprecated in scikit-learn 0.17 in
# favour of 'balanced' and removed in later releases; switch when upgrading.
tree = RandomForestClassifier(class_weight='auto', random_state=0)
model = tree.fit(train, train_y)   # fit() returns the fitted estimator itself
pred = model.predict(test)

# Rows are the true clusters, columns the predicted clusters.
sklearn.metrics.confusion_matrix(test_y, pred)

array([[49,  0,  8,  2,  0,  2,  0,  0,  0,  0],
       [ 2, 15,  0,  2,  0,  0,  0,  0,  1,  0],
       [ 9,  1, 90,  7,  0,  2,  0,  0,  0,  0],
       [ 3,  1, 14, 81,  0,  2,  0,  0,  1,  0],
       [ 1,  0,  4,  0, 36,  0,  0,  0,  0,  2],
       [ 0,  0,  1,  1,  0, 43,  0,  0,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  8,  0,  1,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  7,  0,  0],
       [ 0,  0,  4,  0,  3,  8,  0,  0, 30,  0],
       [ 1,  0,  5,  0,  2,  0,  0,  0,  0, 19]])

In [61]:
# Persist the frame (documents plus cluster labels) as CSV; index=False leaves
# the row index out of the file.
frame.to_csv('./frame_with_label.csv', index=False)