# Loading and Cleaning Twitter Data

<b> Import the necessary libraries </b>

In [1]:
import langdetect
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import regex
import sklearn
import warnings
warnings.filterwarnings('ignore')

<b> Load the LA Times health Twitter data </b>

In [2]:
df = pd.read_csv('datasets/latimeshealth.txt', sep='|', header=None)
df.head()

Unnamed: 0,0,1,2
0,576760256031682561,Sat Mar 14 15:02:15 +0000 2015,Five new running shoes that aim to go the extr...
1,576715414811471872,Sat Mar 14 12:04:04 +0000 2015,Gym Rat: Disq class at Crunch is intense worko...
2,576438353555365888,Fri Mar 13 17:43:07 +0000 2015,Noshing through thousands of ideas at Natural ...
3,576438347003908096,Fri Mar 13 17:43:06 +0000 2015,"Natural Products Expo also explores beauty, su..."
4,576413058177712128,Fri Mar 13 16:02:36 +0000 2015,Free Fitness Weekends in South Bay beach citie...


In [3]:
# name the columns
df.columns = ['id', 'datetime', 'tweet']

<b> Run a quick exploratory analysis to ascertain data size and structure </b>

In [4]:
# create a function to look the data structure
def dataframe_look(df, nrows):
    print(f'SHAPE:\n{df.shape}')
    print(f'COLUMN NAME:\n{df.columns}')
    print(f'HEAD:\n{df.head(nrows)}')

In [5]:
dataframe_look(df, 2)

SHAPE:
(4171, 3)
COLUMN NAME:
Index(['id', 'datetime', 'tweet'], dtype='object')
HEAD:
                   id                        datetime  \
0  576760256031682561  Sat Mar 14 15:02:15 +0000 2015   
1  576715414811471872  Sat Mar 14 12:04:04 +0000 2015   

                                               tweet  
0  Five new running shoes that aim to go the extr...  
1  Gym Rat: Disq class at Crunch is intense worko...  


<b> Extract the tweet text and convert it to a list object </b>

In [6]:
raw = df['tweet'].tolist()

In [7]:
print(f'HEADLINES:\n{raw[:5]}')
print(f'LENGTH:\n{len(raw)}')

HEADLINES:
['Five new running shoes that aim to go the extra mile http://lat.ms/1ELp3wU', 'Gym Rat: Disq class at Crunch is intense workout on pulley system http://lat.ms/1EKOFdr', 'Noshing through thousands of ideas at Natural Products Expo West http://lat.ms/1EHqywg', 'Natural Products Expo also explores beauty, supplements and more http://lat.ms/1EHqyfE', 'Free Fitness Weekends in South Bay beach cities aim to spark activity http://lat.ms/1EH3SMC']
LENGTH:
4171


<b> Write a function to perform language detection and tokenization on white spaces, and then replace the screen names and URLs with SCREENNAME and URL, respectively. The function should also remove punctuation, numbers, and the SCREENNAME and URL replacements. Convert everything to lowercase, except SCREENNAME and URL. It should remove all stop words, perform lemmatization, and keep words with five or more letters only </b>

In [8]:
# identify language
def lang_ident(txt):
    try:
       the_language = langdetect.detect(txt)
    except:
       the_language = 'none'
    return the_language

# lemmatization
def do_lemmatizing(wrd):
    out = nltk.corpus.wordnet.morphy(wrd)
    return (wrd if out is None else out)

In [9]:
def tweet_cleaning(txt):
    # identify language of tweet
    # return null if language not english
    lg = lang_ident(txt)
    if lg != 'en':
        return None
    
    # split the string on whitespace
    out = txt.split(' ')
    
    # identify screen names
    out = ['SCREENNAME' if i.startswith('@') else i for i in out]
    
    # identify urls
    out = [
        'URL' if bool(regex.search('http[s]?://', i)) 
        else i for i in out
    ]
    
    # remove all punctuation
    out = [regex.sub('[^\\w\\s]|\n', '', i) for i in out]
    
    # make all non-keywords lowercase
    keys = ['SCREENNAME', 'URL']
    out = [i.lower() if i not in keys else i for i in out]
    
    # remove keywords
    out = [i for i in out if i not in keys]
    
    # remove stopwords
    list_stop_words = nltk.corpus.stopwords.words('english')
    list_stop_words = [regex.sub('[^\\w\\s]', '', i) for i in list_stop_words]
    
    out = [i for i in out if i not in list_stop_words]
    
    # lemmatizing
    out = [do_lemmatizing(i) for i in out]
    
    # keep words 4 or more characters long
    out = [i for i in out if len(i) >= 5]
    
    return out

<b> Apply the function defined in Step 5 to every tweet </b>

In [10]:
clean = list(map(tweet_cleaning, raw))

<b> Remove elements of the output list equal to None </b>

In [11]:
clean = list(filter(None.__ne__, clean))

In [12]:
print(f'HEADLINES:\n{clean[:5]}\n')
print(f'LENGTH:\n{len(clean)}')

HEADLINES:
[['running', 'shoes', 'extra'], ['class', 'crunch', 'intense', 'workout', 'pulley', 'system'], ['thousand', 'natural', 'product'], ['natural', 'product', 'explore', 'beauty', 'supplement'], ['fitness', 'weekend', 'south', 'beach', 'spark', 'activity']]

LENGTH:
4084


<b> Turn the elements of each tweet back into a string. Concatenate using white space </b>

In [13]:
clean_sentences = [' '.join(i) for i in clean]
print(clean_sentences[:5])

['running shoes extra', 'class crunch intense workout pulley system', 'thousand natural product', 'natural product explore beauty supplement', 'fitness weekend south beach spark activity']


# LDA and Health Tweets

<b> Specify the number_words, number_docs, and number_features variables </b>

In [14]:
number_words = 10
number_docs = 10
number_features = 1000

<b> Create a bag-of-words model and assign the feature names to another variable for use later on </b>

In [15]:
vectorizer1 = sklearn.feature_extraction.text.CountVectorizer(analyzer="word",
                                                              max_df=0.95,
                                                              min_df=10,
                                                              max_features=number_features)

In [16]:
clean_vec1 = vectorizer1.fit_transform(clean_sentences)
print(clean_vec1[0]) 

  (0, 321)	1


In [17]:
feature_names_vec1 = vectorizer1.get_feature_names()

<b> Identify the optimal number of topics </b>

In [18]:
def perplexity_by_ntopic(data, ntopics):
    
    output_dict = {"Number Of Topics": [], "Perplexity Score": []}
    
    for t in ntopics:
        lda = sklearn.decomposition.LatentDirichletAllocation(n_components=t,
                                                              learning_method="online",
                                                              random_state=0)
        
        lda.fit(data)
        
        output_dict["Number Of Topics"].append(t)
        output_dict["Perplexity Score"].append(lda.perplexity(data))
        
    output_df = pd.DataFrame(output_dict)
    
    index_min_perplexity = output_df["Perplexity Score"].idxmin()
    output_num_topics = output_df.loc[
        index_min_perplexity,  # index
        "Number Of Topics"  # column
    ]
        
    return (output_df, output_num_topics)

In [19]:
df_perplexity, optimal_num_topics = perplexity_by_ntopic(clean_vec1,
                                                         ntopics=[i for i in range(1, 21) if i % 2 == 0])

In [20]:
print(df_perplexity)

   Number Of Topics  Perplexity Score
0                 2        351.847186
1                 4        399.638246
2                 6        436.261164
3                 8        458.095346
4                10        476.362744
5                12        492.366014
6                14        501.756317
7                16        514.864902
8                18        530.411569
9                20        528.501201


<b> Fit the LDA model using the optimal number of topics </b>

In [21]:
lda = sklearn.decomposition.LatentDirichletAllocation(n_components=optimal_num_topics,
                                                      learning_method="online",
                                                      random_state=0)

lda.fit(clean_vec1)

LatentDirichletAllocation(learning_method='online', n_components=2,
                          random_state=0)

<b> Create and print the word-topic table </b>

In [22]:
def get_topics(mod, vec, names, docs, ndocs, nwords):
    
    # word to topic matrix
    W = mod.components_
    W_norm = W / W.sum(axis=1)[:, np.newaxis]
    
    # topic to document matrix
    H = mod.transform(vec)
    
    W_dict = {}
    H_dict = {}
    
    for tpc_idx, tpc_val in enumerate(W_norm):
        topic = f"Topic{tpc_idx}"
        
        # formatting w
        W_indices = tpc_val.argsort()[::-1][:nwords]
        W_names_values = [
            (round(tpc_val[j], 4), names[j]) 
            for j in W_indices
        ]
        W_dict[topic] = W_names_values
        
        # formatting h
        H_indices = H[:, tpc_idx].argsort()[::-1][:ndocs]
        H_names_values = [
            (round(H[:, tpc_idx][j], 4), docs[j]) 
            for j in H_indices
        ]
        H_dict[topic] = H_names_values
        
    W_df = pd.DataFrame(
        W_dict, 
        index=["Word" + str(i) for i in range(nwords)]
    )
    H_df = pd.DataFrame(
        H_dict,
        index=["Doc" + str(i) for i in range(ndocs)]
    )
        
    return (W_df, H_df)

In [23]:
W_df, H_df = get_topics(mod=lda,
                        vec=clean_vec1,
                        names=feature_names_vec1,
                        docs=raw,
                        ndocs=number_docs,
                        nwords=number_words)

In [24]:
print(H_df)

                                                 Topic0  \
Doc0  (0.9374, RT @skgire: For my non-science friend...   
Doc1  (0.9355, Missionaries stricken with Ebola viru...   
Doc2  (0.9351, Flu shots reduce the risk of heart at...   
Doc3  (0.935, Kids got the biggest boost from this y...   
Doc4  (0.9331, Expert panel from @RANDCorporation, @...   
Doc5  (0.9318, Eight commonplace ingredients in make...   
Doc6          (0.9283, @itsmeyer Ha ha ha, yes indeed!)   
Doc7  (0.9283, You can't trust your drug dealer, res...   
Doc8                   (0.9277, @nancylayton4 You too!)   
Doc9  (0.9268, 11% of hospital patients got care the...   

                                                 Topic1  
Doc0                   (0.9399, @mycarecircles Thanks!)  
Doc1  (0.9385, 74% of doctors surveyed said they hav...  
Doc2  (0.9385, Boston Children's Hospital announces ...  
Doc3  (0.9373, Little innovations make a big differe...  
Doc4  (0.9371, From the Dept. of Happy Accidental Di...  
Do

<b> Print the document-topic table </b>

In [25]:
print(W_df)

                     Topic0                Topic1
Word0       (0.0519, study)       (0.039, latfit)
Word1       (0.0216, could)       (0.0328, study)
Word2       (0.0204, brain)      (0.0316, health)
Word3      (0.0169, report)      (0.0292, cancer)
Word4   (0.0157, scientist)      (0.0226, people)
Word5      (0.0155, weight)       (0.0167, woman)
Word6  (0.0151, california)  (0.0166, researcher)
Word7     (0.0133, medical)       (0.0164, death)
Word8    (0.0124, research)     (0.0164, patient)
Word9    (0.011, treatment)     (0.0161, obesity)


<b> Create a biplot visualization </b>

In [26]:
lda_plot = pyLDAvis.sklearn.prepare(lda, clean_vec1, vectorizer1, R=10)
pyLDAvis.display(lda_plot)

# Non-negative Matrix Factorization

<b> Create the appropriate bag-of-words model and output the feature names as another variable </b>

In [27]:
vectorizer2 = sklearn.feature_extraction.text.TfidfVectorizer(analyzer="word",
                                                              max_df=0.5,
                                                              min_df=20,
                                                              max_features=number_features,
                                                              smooth_idf=False)

In [28]:
clean_vec2 = vectorizer2.fit_transform(clean_sentences)
print(clean_vec2[0])




In [29]:
feature_names_vec2 = vectorizer2.get_feature_names()

<b> Define and fit the NMF algorithm using the number of topics </b>

In [30]:
nmf = sklearn.decomposition.NMF(n_components=optimal_num_topics,
                                init="nndsvda",
                                solver="mu",
                                beta_loss="frobenius",
                                random_state=0,
                                alpha=0.1,
                                l1_ratio=0.5)
nmf.fit(clean_vec2) 

NMF(alpha=0.1, init='nndsvda', l1_ratio=0.5, n_components=2, random_state=0,
    solver='mu')

<b> Get the topic-document and word-topic tables. Take a few minutes to explore the word groupings and try to define the abstract topics. Can you quantify the meanings of the word groupings? Do the word groupings make sense? Are the results similar to those produced using LDA? </b>

In [31]:
W_df, H_df = get_topics(mod=nmf,
                        vec=clean_vec2,
                        names=feature_names_vec2,
                        docs=raw,
                        ndocs=number_docs,
                        nwords=number_words)

In [32]:
print(W_df)

                  Topic0                Topic1
Word0    (0.3735, study)      (0.5947, latfit)
Word1    (0.026, cancer)       (0.0485, steps)
Word2   (0.0208, people)       (0.0444, today)
Word3   (0.0186, health)      (0.04, exercise)
Word4  (0.0185, obesity)  (0.0272, healthtips)
Word5    (0.0182, brain)     (0.0255, workout)
Word6  (0.0174, suggest)      (0.022, fitness)
Word7   (0.0168, weight)     (0.0202, getting)
Word8    (0.0154, woman)       (0.0142, great)
Word9    (0.0131, death)     (0.0131, morning)


In [33]:
print(H_df)

                                                 Topic0  \
Doc0  (0.2031, Move over FTO. Scientists now say the...   
Doc1  (0.2031, Researchers comb through measurements...   
Doc2  (0.2031, The world's oldest, most widely disse...   
Doc3  (0.2031, Life with #breastcancer, from @nbcpar...   
Doc4  (0.2031, Understanding healthcare reform: A pi...   
Doc5  (0.2031, When "free" actually means "really ex...   
Doc6  (0.2031, Docs' intensive 'get healthy' program...   
Doc7  (0.2031, FDA approves first drug treatment for...   
Doc8  (0.2031, Try This! A ballet-inspired 'booty li...   
Doc9  (0.2031, RT @latimesscience: Does your lettuce...   

                                                 Topic1  
Doc0  (0.2272, There are laws that protect kids from...  
Doc1  (0.2272, Farm subsidies from the government ar...  
Doc2  (0.2272, Did you know baked beans, soy sauce a...  
Doc3  (0.2272, Obama administration delays the healt...  
Doc4  (0.2272, Study finds chronic brain damage in r...  
Do