In [147]:
import re
import operator
import pandas as pd
import numpy as np
import json
import io
import datetime as dt
import string
import unicodedata
from collections import Counter


import nltk
import spacy
import en_core_web_sm
from spacy import displacy
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities

pd.set_option('display.max_rows', 5000)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, RelationsOptions, CategoriesOptions

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prachi.agrawal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/prachi.agrawal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Read file of test tweets

In [148]:
df = pd.read_csv('Tweets.csv')
print('Number of observations are: '+str(len(df)))

Number of observations are: 14640


In [149]:
#remove any rows that has no tweet text
df = df.text.dropna()
df = df.reset_index(drop=True)
print('Number of observations are: '+str(len(df)))

Number of observations are: 14640


## Cleaning of test tweets

In [150]:
def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def strip_mentions(text):
    entity_prefixes = ['@']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

def strip_hashtags(text):
    entity_prefixes = ['#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [151]:
for i in range(0,len(df)):
    df[i]=strip_links(df[i])
for i in range(0,len(df)):
    df[i]=strip_mentions(df[i])
for i in range(0,len(df)):
    df[i]=strip_hashtags(df[i])
for i in range(0,len(df)):
    df[i] = df[i].replace('RT', '')

In [152]:
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "youyou will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    text = c_re.sub(replace, text.lower())
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [154]:
for i in range(0,len(df)):
    df[i]=expandContractions(df[i])
for i in range(0,len(df)):
    df[i]=remove_special_characters(df[i], remove_digits=True)
# for i in range(0,len(df)):
#     df[i]=remove_stopwords(df[i])

df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
#df.to_csv("cleaned_tweets.csv", header = False, index=False)

## Get Categories and Segregate test tweets into 4 files of test tweets

In [160]:
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-03-16',
    iam_apikey='KN0q1fJW6X-YrL9xgmyqgl-TUohn3sWNYGnk7HJz-IPY',
    url='https://gateway.watsonplatform.net/natural-language-understanding/api'
)

health = []
entertainment = []
food = []
politics = [] 
sports = [] 

for i in range(0,len(df)):
    try:
        
        if len(df[i]) < 15 or len(df[i].split(" ")) < 3:
            continue

        response = natural_language_understanding.analyze(
            text= df[i] ,
            features=Features(categories=CategoriesOptions())
        ).get_result()

        if 'categories' not in response:
            continue

        if len(response["categories"]) == 0:
            continue

        data = response["categories"][0]['label']

        if len(data) >= 1:
            category = data.split("/")[1]

            if category == "health and fitness":
                health.append(df[i])
            elif category == "art and entertainment":
                entertainment.append(df[i])
            elif category == "food and drink":
                food.append(df[i])
            elif category == "law, govt and politics":
                politics.append(df[i])
            elif category == "sports":
                sports.append(df[i])
    except:
        print (df[i])
        continue

been solved they finally picked up the second time i called thanks for the response jh   to dulles works
need to work on your united flierfriendly program at the very least clue in your flight attendants
space doctors notes and everything and on my first trip they lost my first suitcase among other issues not pleased with united
s new flierfriendly is garbage just had to cry to get attendant to find a place for my medical supplies with limited overhead
epic fail no jetway been here  mins on tarmac
at the gate iad to rdu
thanks for the link now finally arrived in brussels  h after schedule
to operated b er from newark to zurich airport replacing b er between may sep instead till oct avgeek
nd flight in two weeks that you have lost my bag taking my k status to neveragain worstcustomerservice
hi what is the phone number for reservations in venezuela thanks
can i request a ticket change through twitter
ua flight was a nightmare from poor customer service having my confirmed seat given away

In [161]:
pd.DataFrame(entertainment).to_csv("entertainment.csv", header = ['text'])
pd.DataFrame(food).to_csv("food.csv", header = ['text'])
pd.DataFrame(politics).to_csv("politics.csv", header = ['text'])
pd.DataFrame(sports).to_csv("sports.csv", header = ['text'])
pd.DataFrame(health).to_csv("health.csv", header = ['text'])

## Get Popular Topics

In [162]:
#load Wiki-trained LDA model

base = "/Users/prachi.agrawal/Desktop/Fall2018Courses/599Project/wiki/"
from gensim.test.utils import datapath
from gensim import corpora, models
model_name = datapath(base+"model/test")
lda = models.ldamulticore.LdaMulticore.load(model_name)

In [163]:
# load dictionary and corpus from the wiki-trianed model

import gensim
base_md = "/Users/prachi.agrawal/Desktop/Fall2018Courses/599Project/wiki/m_vecs/"
dictionary = gensim.corpora.Dictionary.load_from_text(base_md+'_wordids.txt')
corpus = gensim.corpora.MmCorpus(base_md+'_tfidf.mm')

In [60]:
# print topics (1000) from LDA

for idx, topic in lda.print_topics(-1):
    print('Topic Number: {}\t \nWord Distribution: {}'.format(idx, topic))

Topic Number: 0	 
Word Distribution: 0.014*"berlin" + 0.007*"kensington" + 0.005*"atheism" + 0.005*"atheist" + 0.004*"jargon" + 0.004*"atheists" + 0.004*"fermat" + 0.004*"batavia" + 0.004*"platz" + 0.003*"bandung"
Topic Number: 1	 
Word Distribution: 0.022*"nikki" + 0.016*"lille" + 0.016*"kazan" + 0.014*"streisand" + 0.012*"kiki" + 0.010*"barbra" + 0.009*"wheaton" + 0.008*"inductive" + 0.008*"gorky" + 0.007*"downes"
Topic Number: 2	 
Word Distribution: 0.028*"worn" + 0.017*"dress" + 0.014*"clothing" + 0.014*"stevens" + 0.013*"fabric" + 0.011*"fashion" + 0.010*"silk" + 0.010*"leather" + 0.010*"hat" + 0.009*"garment"
Topic Number: 3	 
Word Distribution: 0.025*"lodge" + 0.024*"crow" + 0.023*"wilkinson" + 0.017*"hutchinson" + 0.015*"masonic" + 0.015*"lodges" + 0.011*"tick" + 0.010*"kurtz" + 0.010*"breaststroke" + 0.010*"valkyrie"
Topic Number: 4	 
Word Distribution: 0.026*"bbc" + 0.013*"presenter" + 0.012*"itv" + 0.008*"radio" + 0.008*"comedy" + 0.008*"programme" + 0.006*"tv" + 0.006*"cele

In [165]:
test_files = ["health.csv", "entertainment.csv", "food.csv", "politics.csv", "sports.csv"]

# for file in test_files:
df = pd.read_csv("health.csv", index_col = None)
test_df = df['text']
words_corpus_test = []

for i in range(0,len(test_df)):
    words_corpus_test.append([word for word in test_df[i].lower().split() if len(word)>3])

print(words_corpus_test)

[['know', 'that', 'suicide', 'second', 'leading', 'cause', 'death', 'among', 'teens'], ['down', 'with', 'yeah', 'know'], ['book', 'seats', 'your', 'flights', 'when', 'them', 'even', 'during', 'check', 'creates', 'much', 'anxiety', 'frustrated'], ['requested', 'window', 'seat', 'confirmed', 'window', 'stuck', 'middle', 'seat', 'good', 'treat', 'silver', 'member'], ['help', 'this', 'fund', 'needs', 'urgent', 'treatment', 'battling', 'cancer', 'could', 'help', 'with', 'flights', 'freyasfund'], ['digging', 'swanky', 'pink', 'mood', 'lighting', 'during', 'flight', 'from', 'just', 'needs', 'cabaret', 'singer', 'think', 'about'], ['help', 'with', 'flights', 'battling', 'cancer', 'needs', 'treatment', 'fund'], ['wondering', 'guys', 'recieved', 'able', 'potentially', 'respond', 'asap'], ['once', 'found', 'problem', 'avoided', 'like', 'plague', 'told', 'find', 'supervisor'], ['ours', 'july', 'have', 'zero', 'excuses', 'this', 'have', 'date', 'system', 'that', 'causes', 'problems', 'like', 'this'

In [166]:
corpus_bow = [dictionary.doc2bow(text) for text in words_corpus_test]

In [90]:
freq_dict = dict()
for i in range(0, len(corpus_bow)):
    print(i)
    count = 0
    for index, score in sorted(lda[corpus_bow[i]], key=lambda tup: -1*tup[1]):
        if index in freq_dict:
            freq_dict[index] += 1
        else:
            freq_dict[index] = 1
        print("\nScore: {}\t \nTopic Number: {}\t \nWord Distribution: {}".format(score, index, lda.print_topic(index, 10)))
        break
    print("\n\n\n")

0

Score: 0.47467878460884094	 
Topic Number: 18	 
Word Distribution: 0.003*"monster" + 0.003*"plot" + 0.003*"creature" + 0.003*"novel" + 0.002*"movie" + 0.002*"characters" + 0.002*"kill" + 0.002*"episode" + 0.002*"finds" + 0.002*"woman"




1

Score: 0.2217264324426651	 
Topic Number: 527	 
Word Distribution: 0.163*"airport" + 0.046*"terminal" + 0.034*"airlines" + 0.033*"flights" + 0.029*"runway" + 0.026*"airports" + 0.026*"passengers" + 0.026*"destinations" + 0.024*"aviation" + 0.023*"aircraft"




2

Score: 0.28327518701553345	 
Topic Number: 441	 
Word Distribution: 0.028*"network" + 0.025*"cable" + 0.022*"wireless" + 0.021*"networks" + 0.019*"internet" + 0.014*"broadband" + 0.013*"communications" + 0.013*"ethernet" + 0.012*"ieee" + 0.012*"telecom"




3

Score: 0.2502500116825104	 
Topic Number: 134	 
Word Distribution: 0.044*"burnett" + 0.030*"lucky" + 0.028*"acm" + 0.028*"waits" + 0.024*"luck" + 0.022*"krauss" + 0.021*"hobart" + 0.020*"fearless" + 0.019*"nellie" + 0.019*"mts"




In [167]:
for k, v in freq_dict.items():
    print("Topic:", k, " Freq:", v)

Topic: 18  Freq: 1
Topic: 527  Freq: 1
Topic: 441  Freq: 1
Topic: 134  Freq: 1
Topic: 277  Freq: 1
Topic: 173  Freq: 1
Topic: 21  Freq: 1
Topic: 540  Freq: 1
Topic: 14  Freq: 2
Topic: 147  Freq: 1
Topic: 198  Freq: 1
Topic: 555  Freq: 1
Topic: 674  Freq: 1
Topic: 469  Freq: 1
Topic: 440  Freq: 1
Topic: 904  Freq: 1


In [168]:
sorted_x = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse = True)

## Final words and phrases 

In [173]:
for i in range(0, 2):
    print("Topic Number: {}\t \nWord Distribution: {}".format(sorted_x[i][0], lda.print_topic(sorted_x[i][0], 10)))
    print("\n")

Topic Number: 14	 
Word Distribution: 0.054*"flight" + 0.031*"aircraft" + 0.027*"crash" + 0.023*"plane" + 0.021*"faa" + 0.020*"robin" + 0.019*"taxi" + 0.019*"aviation" + 0.019*"hood" + 0.018*"cargo"


Topic Number: 18	 
Word Distribution: 0.003*"monster" + 0.003*"plot" + 0.003*"creature" + 0.003*"novel" + 0.002*"movie" + 0.002*"characters" + 0.002*"kill" + 0.002*"episode" + 0.002*"finds" + 0.002*"woman"




## Get Popular Entities and Corresponding sentiment

In [95]:
# Get Entities of each tweet 
# Get overall popular 5 entities with their sentiment

In [117]:
test_df.shape

(17,)

In [171]:
# Named Entity Recognition and Sentiment Analysis

nlp = en_core_web_sm.load()
analyser = SentimentIntensityAnalyzer()

frequent = dict()
for i in range(0,len(test_df)):
    tweet = test_df[i]
    tagged_sent = nltk.pos_tag([word for word in nltk.word_tokenize(tweet)])
    normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
    processed_tweet = ' '.join(normalized_sent)
    doc = nlp(processed_tweet)
    scores = analyser.polarity_scores(tweet)
    for X in doc.ents:
        if X.text in frequent:
            if X.label_ in ["PERSON", "ORG", "GPE"]:
                
                data = (frequent[X.text][0] + 1, frequent[X.text][1] + scores['pos'], frequent[X.text][2] + scores['neg'])
                frequent[X.text] = data
        
        else:
            if X.label_ in ["PERSON", "ORG", "GPE"]:
                frequent[X.text] = (1, scores['pos'], scores['neg'])
  
print(frequent)
#clustering : cluster Ranbir and Ranbir Singh together

{'Suicide': (6, 0.0, 2.472), 'Cause of Death': (6, 0.0, 2.472), 'Anxiety': (2, 0.192, 0.39), 'Seat': (3, 0.282, 0.583), 'Fund': (1, 0.291, 0.263), 'Battling Cancer': (2, 0.425, 0.585), 'Nyc': (2, 0.229, 0.169), 'Problem': (1, 0.106, 0.216), 'Plague': (2, 0.264, 0.615), 'Supervisor': (1, 0.106, 0.216), 'Time': (6, 0.557, 0.42800000000000005), 'Home': (1, 0.088, 0.0), 'Vein Thrombosis': (1, 0.0, 0.0), 'Challenges': (1, 0.127, 0.185), 'Upgrades amp Charges in Order': (1, 0.127, 0.185), 'Dvt': (1, 0.127, 0.185), 'Addtl': (1, 0.127, 0.185), 'Gate Agents': (1, 0.0, 0.227), 'Norm': (1, 0.0, 0.227), 'Difference': (2, 0.384, 0.124), 'Friends': (1, 0.236, 0.124), 'Baggage Handlers': (1, 0.077, 0.222), 'Lungs': (1, 0.077, 0.222), 'Occur': (1, 0.06, 0.236), 'Bag': (5, 0.182, 0.516), 'Delay': (2, 0.06, 0.33099999999999996), 'Appreciate': (1, 0.431, 0.0), 'Wallet': (1, 0.0, 0.337), 'Weather': (1, 0.0, 0.134), 'Airline Maintenance': (1, 0.0, 0.134), 'the Weight Restriction': (1, 0.0, 0.104), 'Cause':

## Final Entities

In [172]:
sorted_frequent_entitiy_list = sorted(frequent.items(), key=operator.itemgetter(0), reverse = True)
for i in range(0, 5):
    print("Entity: {}\t Sentiment: {}".format(sorted_frequent_entitiy_list[i][0], "postive" if sorted_frequent_entitiy_list[i][1][1] > sorted_frequent_entitiy_list[i][1][2] else "negative" ))
    print("\n")

Entity: the Weight Restriction	 Sentiment: negative


Entity: the Phone Rep	 Sentiment: negative


Entity: the Pain Fly	 Sentiment: negative


Entity: s Sch	 Sentiment: postive


Entity: no Info Thismosaicnothappy	 Sentiment: negative


