In [1]:
import pandas as pd
import numpy as np

import nltk
import os
import re

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

word_lm = WordNetLemmatizer()

#### Data: https://archive.ics.uci.edu/ml/datasets/Health+News+in+Twitter
#### using this file everydayhealth.txt

In [2]:
path = 'Health-Tweets'
col_names = ['tw_id', 'date', 'tweet']
df = pd.read_csv(os.path.join(path, 'everydayhealth.txt'), sep="|", names=col_names)
print len(df)
df.head()

3239


Unnamed: 0,tw_id,date,tweet
0,304596701757464576,Thu Feb 21 14:21:27 +0000 2013,#FastFood Makes Up 11 Percent of #Calories in ...
1,304595191329853441,Thu Feb 21 14:15:27 +0000 2013,"10 snacks to help you lose weight, burn fat, a..."
2,304587659018371072,Thu Feb 21 13:45:31 +0000 2013,10 foods that boost your skin AND slim your wa...
3,304580073380524032,Thu Feb 21 13:15:22 +0000 2013,What a heart attack feels like in women (it's ...
4,304572560270573569,Thu Feb 21 12:45:31 +0000 2013,#McDonalds oatmeal has almost 7 teaspoons of s...


In [3]:
# some preprocessing

def pre_processing(data_frame, txt_clm):
    
    df_txt = data_frame.copy()
    
    # lower case 
    df_txt['lower_txt'] = df_txt[txt_clm].str.lower()
    
    #replace RTs with ''
    df_txt['links_rm'] = df_txt['lower_txt'].apply(lambda v: v.replace('rt', ''))
    
    # remove punctuation
    df_txt['punc_rm'] = df_txt['lower_txt'].str.replace('[^\w\s]','')
    
    # replace hyperlinks with ''
    df_txt['links_rm'] = df_txt['punc_rm'].apply(lambda v: re.sub(r'http\S+', '', v))
    
    # remove stopwords
    df_txt['stopwords_rm'] = df_txt['links_rm'].apply(lambda x: " ".join(x for x in x.split() if x not in ENGLISH_STOP_WORDS))
    
     # lemmatize all the words
    df_txt['lemmatize_text'] = df_txt['stopwords_rm'].apply(lambda v: " ".join([word_lm.lemmatize(i) for i in v.split()]))
    
    df_txt['processed_text'] = df_txt['lemmatize_text'].apply(lambda x: word_tokenize(x))
    
    df_txt['raw_text'] = df_txt['processed_text'].apply(lambda x: " ".join(w for w in x if len(w)> 3))
    
    return df_txt[['tweet', 'raw_text']]

tweets_data = df.copy()
processed_df = pre_processing(tweets_data[['tweet']], 'tweet')
processed_df['raw_text'].tolist()[:10]

[u'fastfood make percent calorie diet',
 u'snack help lose weight burn build muscle',
 u'food boost skin slim waistline',
 u'heart attack feel like woman different gored',
 u'mcdonalds oatmeal teaspoon sugar healthy fast food isnt',
 u'food boost skin slim waistline',
 u'skipping kegels using talcum powder vaginal health mistake youre probably making',
 u'food boost skin slim waistline',
 u'today happier perform random kindness itll make feel better',
 u'depression isnt everybody different face disorder']

In [4]:
# try a groupby of raw text, since preprocessing now we are able to group them
group_text = processed_df.groupby(['raw_text'], as_index=False).count()
group_text = group_text.rename(columns={'tweet': 'count'})
group_text.sort_values('count', ascending=False).head(10)

Unnamed: 0,raw_text,count
725,everyday health daily digest,42
817,food boost skin slim waistline,24
824,food longer life,22
542,eating habit pack pound,14
1853,soda drinker listen reason kick soda habit,12
1353,meat surprising food thatll spike blood sugar,11
785,fiberrich food diet asap,11
247,calorie today,10
14,adhd characteristic actually huge plus choosin...,10
968,health symptom shouldnt ignore menshealth,9


In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(processed_df['raw_text'])
feature_names = vectorizer.get_feature_names() # num phrases 
feature_df = pd.DataFrame(X.toarray(), columns=feature_names)
feature_df.head(3)

Unnamed: 0,1000,1089,11am,11ampst,1200,12noon,13for13,13in2013,1520lbs,1pmet,...,youself,youth,youtube,youve,yummy,zero,zerocalorie,zocor,zucchini,zzzs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# get top n features
feature_array = np.array(feature_names)
tfidf_sorting = np.argsort(X.toarray()).flatten()[::-1]

n = 30
top_n = feature_array[tfidf_sorting][:n]
print top_n

[u'worry' u'sooner' u'highincome' u'sicker' u'country' u'american' u'zzzs'
 u'fightdepression' u'finger' u'finding' u'financial' u'finally'
 u'filtered' u'filling' u'figure' u'fighting' u'fiery' u'fight'
 u'fitandfabliving' u'field' u'fibromyalgia' u'fibrillation' u'fiberrich'
 u'fiber' u'fewer' u'fever' u'feng' u'fend' u'fence' u'fish']


In [7]:
# apply k means cluster, choosing k as 7
true_k = 7
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
clusters = model.labels_.tolist()
cluster_items = Counter(clusters)
for k in cluster_items:
    print "Cluster {}: {}".format(k, cluster_items[k])

Cluster 0: 1656
Cluster 1: 307
Cluster 2: 116
Cluster 3: 144
Cluster 4: 106
Cluster 5: 49
Cluster 6: 861


In [8]:
#sort cluster centers by proximity to centroid
cl_cols = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6']
cluster_df = pd.DataFrame(columns = cl_cols)

order_centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(true_k):
    join_words = []
    for ind in order_centroids[i, :20]:
        join_words.append(feature_names[ind])
    cluster_df["Cluster {}".format(i)] = join_words
    
cluster_df.head(10)

Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6
0,exercise,food,reason,best,really,pound,healthtalk
1,health,weight,soda,worst,sleep,pack,everydayhealth
2,jillianmichaels,lose,fish,food,work,habit,today
3,calorie,life,lifeenhancing,waistline,share,eating,health
4,better,longer,losing,skin,healthyliving,lose,eatsmartbd
5,healthy,loss,drinker,slim,weight,huge,join
6,help,diet,kick,boost,need,characteristic,everyday
7,secret,sugar,listen,protein,remedy,adhdfriendly,digest
8,diet,spike,chocolate,snack,home,career,daily
9,risk,blood,youre,treat,lossspiration,choosing,psoriasis


In [9]:
topic_cols = ['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6']
topics_df = pd.DataFrame(columns=topic_cols)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        
        top_words = []
        for i in topic.argsort()[:-no_top_words - 1:-1]:
            top_words.append(feature_names[i])
                             
        topics_df["Topic {}".format(topic_idx)] = top_words
    return topics_df
        
    
        
# Run NMF
no_topics = 7
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X)

no_top_words = 20
display_topics(nmf, feature_names, no_top_words).head(10)

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
0,digest,food,healthtalk,weight,calorie,boost,eating
1,everyday,life,join,lose,today,waistline,habit
2,daily,longer,everydayhealth,loss,burn,slim,pound
3,health,change,follow,help,chocolate,skin,pack
4,worst,worst,eatsmartbd,whats,easy,food,healthy
5,snack,sugar,heart,healthy,sneaky,best,soda
6,ignore,spike,chat,trying,just,blood,reason
7,shouldnt,blood,vegan,reason,reason,sugar,complexion
8,2013,asap,question,dieting,skip,plus,drinker
9,symptom,fiberrich,today,losing,tomorrow,metabolism,kick
