# Import Relevant Libraries

In [1]:
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from math import pi
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pickle

# Fetch the NLTK resources this notebook needs; nltk reports
# "already up-to-date" when a package is present locally.
for nltk_resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from nintendo.trend_radar_functions import (json_to_df, 
                                            add_time_to_df,
                                            reset_index,
                                           )

from nintendo.data_cleaning import (select_relevant_cols,
                                    filter_lang,
                                    drop_duplicates,
                                    unique_hashtag_list,
                                    unique_link_list,
                                    unique_ats_list,
                                    remove_hash_link_at,
                                    strip_punctuation,
                                    remove_punctuation,
                                    make_lower_case,
                                    get_wordnet_pos,
                                    lemmatize_text,
                                    remove_stop_words,
                                    remove_just_hash,
                                    vader_sentiment,
                                   )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laurashummonmaass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laurashummonmaass/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/laurashummonmaass/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Load Tweet Data

#### Please see Readme for instructions on how to acquire this data.

In [2]:
# Read the line-delimited JSON dump: one tweet object per non-blank line.
# The context manager guarantees the file handle is closed, even if
# json.loads raises on a malformed line (the bare open() in a for-loop
# left the handle open until garbage collection).
with open('NintendoTweets.json', 'r') as tweet_file:
    # Skip blank / whitespace-only separator lines instead of passing them
    # to json.loads, which would raise JSONDecodeError.
    tweets = [json.loads(line) for line in tweet_file if line.strip()]
#tweets

In [3]:
len(tweets)

104695

# Data Cleaning

Flatten the JSON records (including their embedded dictionaries) and store the result as a DataFrame.

In [4]:
tweets_norm = json_to_df(tweets)

Filter for English only (also uses only relevant columns)  
Add a .time. column showing H:M:S    
Remove any duplicate rows.

In [5]:
# Pipeline, innermost call first: language filter -> add the '.time.' column
# -> drop duplicate rows.
df = drop_duplicates(add_time_to_df(filter_lang(tweets_norm)))

### Explore the data

In [6]:
len(df)

71378

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71378 entries, 0 to 104694
Data columns (total 6 columns):
user.id         71378 non-null float64
text            71378 non-null object
lang            71378 non-null object
created_at      71378 non-null object
timestamp_ms    71378 non-null object
.time.          71378 non-null object
dtypes: float64(1), object(5)
memory usage: 3.8+ MB


In [8]:
df.isna().sum()

user.id         0
text            0
lang            0
created_at      0
timestamp_ms    0
.time.          0
dtype: int64

In [9]:
# Show the earliest and latest '.time.' values, one per line.
print(df['.time.'].min(), df['.time.'].max(), sep='\n')

16:00:24
17:00:23


### Text Cleaning

Remove any words starting with: #, @, or http and put cleaned text into new 'text2' column.

In [10]:
# The return value is discarded, so this cell only has an effect if the helper
# modifies `df` in place (presumably by writing the 'text2' column — TODO
# confirm in nintendo.data_cleaning). Re-running the cell may therefore not be
# idempotent.
remove_hash_link_at(df)

Remove punctuation & stop words, make text all lower case, lemmatize all words.

In [11]:
remove_punctuation(df)

In [12]:
make_lower_case(df)

In [13]:
lemmatize_text(df)

In [14]:
remove_stop_words(df)

Remove http again (some links may have had a symbol in front of them and so were not removed the first time).

In [15]:
remove_just_hash(df)

Final DF:

In [16]:
df = reset_index(df)

#### Pickle the DF:

In [None]:
# Serialize the cleaned DataFrame to a binary pickle file in the working directory.
with open('cleaned_twitter_df2.pkl', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

For future reference... to import the pickled DF back:

In [None]:
# import pickle

# with open('cleaned_twitter_df2.pkl', 'rb') as f:
#     df = pickle.load(f)

# END Data Cleaning

# Vader NLP

In [18]:
vader_output = vader_sentiment(df)

In [19]:
vader_output[0:6] # view the first 6 entries (indices 0-5; the slice end is exclusive)

[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 0.9, 'pos': 0.1, 'compound': 0.1406},
 {'neg': 0.0, 'neu': 0.748, 'pos': 0.252, 'compound': 0.5229},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}]

#### Pickle Vader Output

In [None]:
# Write the VADER scores to a binary pickle file in the working directory.
# `pickle` is already imported in the notebook's first (imports) cell, so it
# is not re-imported here.
with open('vader_output.pkl', 'wb') as f:
    pickle.dump(vader_output, f)

For future reference... to import the pickled Vader Sentiment back:

In [None]:
# import pickle

# with open('vader_output.pkl', 'rb') as f:
#     vader_output = pickle.load(f)

# END Vader NLP

# EVERYTHING FROM HERE ON IS CLEANED UP AND LOCATED IN 'final_notebook_trends_and_radars.ipynb'

# IGNORE EVERYTHING BELOW HERE

# Create a Words DF

#### Create Matrix 

In [None]:
# total_words = []
# for i in df['text2']:
#     words = i.split()
#     for j in words:
#         total_words.append(j)

# unique_words = [] 
# for i in total_words:
#     if not i in unique_words:
#         unique_words.append(i)
# unique_words

In [None]:
# print(len(total_words))
# print(len(unique_words))

In [None]:
# vectorizer = CountVectorizer(vocabulary=unique_words)
# vectorized_words = vectorizer.transform(df['text2'])

#### View the matrix:

In [None]:
# print(vectorized_words.shape) # (tweet_count, unique_word_count)
# word_array = vectorized_words.toarray()
# word_array

#### Add matrix to a dataframe

In [None]:
# #vectorized_words is a matrix (71378, 14653)
# #unique_words is a list (14653)
# #df.index has len of (71378)

# matrix_df = pd.DataFrame(word_array, columns=unique_words, index=df.index) 
# matrix_df.head() 

Rename columns in df (text, lang, time) so that joining the matrix has no conflicts.

In [None]:
# df = df.rename(index=str, columns={"text": ".text.", "lang": ".lang.", "time": ".time."})
# df.head().T

#### Combine Matrix DF and Original DF  
* Need to remove original index and add new index column in both DF and Matrix_DF

In [None]:
# df = df.drop('index', 1)
# df = df.reset_index()
# df = df.rename(index=str, columns={'index': 'df_index'})
# df.head()

In [None]:
# matrix_df = matrix_df.reset_index()
# matrix_df = matrix_df.rename(index=str, columns={'index': 'matrix_df_index'})
# matrix_df.head()

Add words to original df.

In [None]:
# df_words = df.join(matrix_df)
# df_words.head()

# END Create Words DF

# Trend Lines
## Grouped by every 5 seconds

#### Finding the Positive & Negative (& Compound) Scores by Second (for time series)

In [None]:
# import time 
    
# df['.time.'] = df['timestamp_ms'].apply(lambda x: time.strftime('%H:%M:%S', time.gmtime(int(x)/1000)))
# df.head().T

In [None]:
# unique_seconds = []
# for times in df['.time.']:
#     all_times = []
#     all_times.append(times)
#     for i in all_times:
#         if not i in unique_seconds:
#             unique_seconds.append(i)
# unique_seconds

#### Add a column for each 5 second interval

In [None]:
# five_seconds = []
# for second in unique_seconds:
#     if len(five_seconds)==0:
#         five_seconds.append(1)
#     elif len(five_seconds)%5 != 0:
#         five_seconds.append(five_seconds[-1])
#     else:
#         five_seconds.append(five_seconds[-1]+1)

In [None]:
# seconds_dict = dict(zip(unique_seconds, five_seconds))

In [None]:
#df['five_seconds'] = df['.time.'].map(seconds_dict)

#### Trend for sum of 5 seconds

In [None]:
# five_sum_df = df.groupby('five_seconds').sum()
# five_sum_df = five_sum_df.reset_index()
# five_sum_df.head()

In [None]:
# plt.plot(five_sum_df['five_seconds'], five_sum_df['pos'], color='g')
# plt.plot(five_sum_df['five_seconds'], five_sum_df['neg'], color='orange')
# plt.xlabel('Every 5 Seconds')
# plt.ylabel('Sentiment')
# plt.title('Nintendo E3 Twitter Sentiments')
# plt.show()

#### Trend for mean of 5 seconds

In [None]:
# five_mean_df = df.groupby('five_seconds').mean()
# five_mean_df = five_mean_df.reset_index()
# five_mean_df.head()

In [None]:
# plt.plot(five_mean_df['five_seconds'], five_mean_df['pos'], color='g')
# plt.plot(five_mean_df['five_seconds'], five_mean_df['neg'], color='orange')
# plt.xlabel('Every 5 Seconds')
# plt.ylabel('Sentiment')
# plt.title('Nintendo E3 Twitter Sentiments')
# plt.show()

# END Trend Lines

# Major Announcements

### Find Major Announcement Points & the Specific Words that Occurred the Most
May end up not using and using only 5 second intervals instead... can label 5 second intervals as specific topics

In [None]:
#df.loc[df['five_seconds'] == 81] .head(3)

In [None]:
# mario_party = list(range(81,101))     #81 to 100
# smash_brothers = list(range(210,517)) #210 to 516
# end = list(range(517, 601))           #517 to 600
# #none for all others

# df['five_seconds'].max()

In [None]:
# all_five_seconds = list(range(1, 721))

In [None]:
# labels = []
# for i in all_five_seconds:
#     if i in mario_party:
#         labels.append('mario_party')
#     elif i in smash_brothers:
#         labels.append('smash_brothers')
#     elif i in end:
#         labels.append('end')
#     else:
#         labels.append('none')
# len(labels)

Dictionary that maps the labels to the appropriate 5 seconds.

In [None]:
#labels_dict = dict(zip(all_five_seconds, labels))

In [None]:
#df['.announcements.'] = df['five_seconds'].map(labels_dict)
#df.head()

# END Major Announcements

##### Note: May not use... may instead create a radar plot for each SECOND and create a general label for all seconds in the hour.

# Word Dictionary

In [None]:
# df_words.head(3)

In [None]:
# df_words['five_seconds'] = df_words['.time.'].map(seconds_dict)
# df_words['.announcements.'] = df_words['five_seconds'].map(labels_dict)

# (Disabled scratch code.) Purpose: for the tweets in one 'five_seconds' bucket
# of df_words, sum the VADER scores per word (each word counted once per tweet)
# and return the five words with the highest ('top') or lowest ('bottom')
# summed compound score as a {word: compound_sum} dict.
# def create_dictionary_for_specified_time (time=1, which_five='top'): # choose either 'top' or 'bottom'
#     df_filtered_by_seconds = df_words.loc[(df_words['five_seconds']== time)]  #| (df_words['five_seconds']== 2)]
#     dict_by_seconds = df_filtered_by_seconds.to_dict(orient='index')
    
#     # create a cleaned dictionary for each word labeled by tweet number
#     list_of_word_dicts = []
#     for key1, val in dict_by_seconds.items():
#         u_words = val['text2'].split(' ')
#         neg = val['neg']
#         compound = val['compound']
#         neu = val['neu']
#         pos = val['pos']
#         for key, value in val.items():
#             try:
#                 value = float(value)
#                 if (value > 0) & (key in u_words) :
#                     list_of_word_dicts.append({ 
#                             'tweet_no': key1,
#                             key:{'count': 1, 'compound_sum': compound, 'neg_sum': neg, 
#                                  'neu_sum': neu, 'pos_sum': pos},
#                                                 })
#             # NOTE(review): bare `except: pass` hides every error, not just the
#             # float() conversion failures it is meant to skip; narrow it to
#             # (TypeError, ValueError) if this code is re-enabled.
#             except:
#                 pass
    
#     # remove duplicate words that appear several times in one tweet
#     no_dupl_list_of_word_dicts = [i for n, i in enumerate(list_of_word_dicts) 
#                                   if i not in list_of_word_dicts[n + 1:]]
    
#     return_dict = {}
#     for i in no_dupl_list_of_word_dicts:
#         for key, val in i.items():
#             # NOTE(review): `is not` tests object identity, not string equality
#             # (SyntaxWarning on Python 3.8+); use `key != 'tweet_no'`.
#             if key is not 'tweet_no':
#                 if key not in return_dict.keys():
#                     return_dict.update({key : val})
#                 else:
#                     return_dict[key]['count'] += val['count']
#                     return_dict[key]['compound_sum'] += val['compound_sum']
#                     return_dict[key]['neg_sum'] += val['neg_sum']
#                     return_dict[key]['neu_sum'] += val['neu_sum']
#                     return_dict[key]['pos_sum'] += val['pos_sum']
                    
#     compound_dict = {}
#     for key, val in return_dict.items():
#         #print(key, val)
#         #compound_dict.update({key: val['compound_sum'] })
#         compound_dict[key] = val['compound_sum']
    
#     # ascending by summed compound score: lowest first, highest last
#     sorted_compound_dict = sorted(compound_dict.items(), key=lambda kv: kv[1])
    
#     if which_five == 'top':
#         #five_words = dict(sorted_compound_dict[0:5])
#         five_words = dict(sorted_compound_dict[-5:])
#     elif which_five == 'bottom': 
#         #five_words = dict(sorted_compound_dict[-5:])
#         five_words = dict(sorted_compound_dict[0:5])
#     else:
#         # NOTE(review): this bare string is a no-op expression, so five_words
#         # stays unassigned and the return below raises UnboundLocalError;
#         # raise ValueError with this message instead.
#         "Please choose either 'top' or 'bottom'."

#     return five_words

In [None]:
# testing = create_dictionary_for_specified_time(2)
# testing

In [None]:
# bottom_df = pd.Series(testing[0])
# bottom_df = pd.DataFrame(bottom_df)
# bottom_df = bottom_df.T
# bottom_df['group'] = 'A'

# top_df = pd.Series(testing[1])
# top_df = pd.DataFrame(top_df)
# top_df = top_df.T
# top_df['group'] = 'A'

# END Word Dictionary

# Radar Plots

In [None]:
# #NOT WORKING YET
# def radar_plot_creator():
#     bottom_df = pd.Series(testing[0])
#     bottom_df = pd.DataFrame(bottom_df)
#     bottom_df = bottom_df.T
#     bottom_df['group'] = 'A'
#     top_df = pd.Series(testing[1])
#     top_df = pd.DataFrame(top_df)
#     top_df = top_df.T
#     top_df['group'] = 'A'
    
#    # Set data
#     radar_df_test = bottom_df

#     # number of variable
#     categories=list(radar_df_test)[1:]
#     N = len(categories)

#     # We are going to plot the first line of the data frame.
#     # But we need to repeat the first value to close the circular graph:
#     values=radar_df_test.loc[0].drop('group').values.flatten().tolist()
#     values += values[:1]
#     values

#     # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
#     angles = [n / float(N) * 2 * pi for n in range(N)]
#     angles += angles[:1]

#     # Initialise the spider plot
#     ax = plt.subplot(111, polar=True)

#     # Draw one axe per variable + add labels labels yet
#     plt.xticks(angles[:-1], categories, color='grey', size=8)

#     # Draw ylabels
#     ax.set_rlabel_position(0)
#     plt.yticks([-3,-2,-1,0,1,2,3], ["","","", 0, "", "", ""], color="grey", size=7)
#     plt.ylim(-3,3)

#     # Plot data
#     ax.plot(angles, values, linewidth=1, linestyle='solid')

#     # Fill area
#     testing_radar = ax.fill(angles, values, 'b', alpha=0.1);  
    
#     return testing_radar
# radar_plot_creator

THIS ONE WORKS.... But not in a function:

In [None]:
# # Set data
# radar_df_test = bottom_df
 
# # number of variable
# categories=list(radar_df_test)[1:]
# N = len(categories)
 
# # We are going to plot the first line of the data frame.
# # But we need to repeat the first value to close the circular graph:
# values=radar_df_test.loc[0].drop('group').values.flatten().tolist()
# values += values[:1]
# values
 
# # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
# angles = [n / float(N) * 2 * pi for n in range(N)]
# angles += angles[:1]
 
# # Initialise the spider plot
# ax = plt.subplot(111, polar=True)
 
# # Draw one axe per variable + add labels labels yet
# plt.xticks(angles[:-1], categories, color='grey', size=8)
 
# # Draw ylabels
# ax.set_rlabel_position(0)
# plt.yticks([-3,-2,-1,0,1,2,3], ["","","", 0, "", "", ""], color="grey", size=7)
# plt.ylim(-3,3)
 
# # Plot data
# ax.plot(angles, values, linewidth=1, linestyle='solid')
 
# # Fill area
# ax.fill(angles, values, 'b', alpha=0.1);


# END Radar Plots