In [1]:
import os 
import json
import requests
import textblob
import time

import numpy as np
import pandas as pd
import keras.preprocessing.text as kpt
import matplotlib.pyplot as plt

from keras.models import load_model
from arcgis.gis import GIS
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor
from textblob import TextBlob

Using TensorFlow backend.


In [2]:
# Notebook configuration: the tweet CSV to analyze and the trained Keras sentiment model.
# NOTE(review): csv_path is an absolute path on one user's machine, so the notebook
# will not run elsewhere without editing it; the model path is relative to the
# working directory.
csv_path = r'C:\Users\jame9353\Documents\GitHub\NLP-Engine-Integration\Demo Data\Baltimore Riots Tweets\baltimore_twitter.csv'
model = load_model('models/Twitter_SA_Model.h5')

In [3]:
def netowlCurl(inFile, outPath, outExtension):
    """Send a text file to the NetOwl API and save the response body to disk.

    Parameters
    ----------
    inFile : str
        Path of the document to upload; it is read as raw bytes and posted
        with a ``text/plain`` content type.
    outPath : str
        Directory that receives the response file; created when missing.
    outExtension : str
        Suffix appended to the base name of ``inFile`` to form the output name.

    Returns
    -------
    None
        The response text is written, UTF-8 encoded, to
        ``outPath/<basename(inFile)><outExtension>``.
    """
    headers = {
        'accept': 'application/rdf+xml',
        # SECURITY: a credential is hard-coded as the fallback below. Set the
        # NETOWL_API_KEY environment variable to override it, and rotate/remove
        # the literal before sharing this notebook.
        'Authorization': 'netowl ' + os.environ.get('NETOWL_API_KEY', 'ff5e6185-5d63-459b-9765-4ebb905affc8'),
        'Content-Type': 'text/plain',
    }

    params = (
        ('language', 'english'),
    )

    # Context manager so the input handle is closed even if the read fails.
    with open(inFile, 'rb') as in_handle:
        data = in_handle.read()

    # NOTE(review): verify=False turns off TLS certificate verification; kept
    # as in the original, but confirm it is really required.
    response = requests.post('https://api.netowl.com/api/v2/_process', headers=headers, params=params, data=data, verify=False)

    # The output name derives from the input file (the original referenced an
    # undefined name `d` here, which raised NameError).
    fileName = os.path.split(inFile)[1]
    # makedirs + exist_ok also handles nested paths and an already-existing folder.
    os.makedirs(outPath, mode=0o777, exist_ok=True)
    outFile = os.path.join(outPath, fileName + outExtension)
    with open(outFile, "w", encoding="utf-8") as out_handle:
        out_handle.write(response.text)

In [4]:
# Load the word -> index mapping that convert_text_to_index_array looks words up in.
with open('dictionary/dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

In [5]:
# Tokenizer limited to 3000 words; it is never fit in this notebook and is only
# used by Sentiment() for sequences_to_matrix.
# NOTE(review): presumably 3000 matches the vocabulary size used in training - confirm.
tokenizer = Tokenizer(num_words=3000)

In [6]:
def convert_text_to_index_array(text):
    """Map raw text to the list of indices stored in the module-level ``dictionary``.

    The text is split with ``kpt.text_to_word_sequence``. Each resulting word
    that is a key of ``dictionary`` contributes its mapped value, in order of
    appearance; words that are not in ``dictionary`` are silently skipped.
    """
    tokens = kpt.text_to_word_sequence(text)
    return [dictionary[token] for token in tokens if token in dictionary]

In [7]:
def Sentiment(tweet_text):
    """Classify a piece of text with the loaded Keras model.

    Returns a ``(label, score)`` tuple where ``label`` is ``'positive'`` or
    ``'negative'`` (picked by the arg-max of the model output) and ``score``
    is the winning model output multiplied by 100. Callers refer to the score
    as "accuracy".
    """
    class_names = ['positive', 'negative']
    word_ids = convert_text_to_index_array(tweet_text)
    encoded = tokenizer.sequences_to_matrix([word_ids], mode='binary')
    # NOTE(review): the binary matrix row is cut/padded to 86 entries here;
    # presumably this mirrors the preprocessing used at training time - confirm.
    encoded = pad_sequences(encoded, maxlen=86, dtype='int32', padding='post', truncating='post', value=0)
    prediction = model.predict(encoded)
    # Single prediction row, so the flat arg-max is also the class index.
    best = np.argmax(prediction)
    return class_names[best], prediction[0][best] * 100

In [8]:
def calculate_sentiment(object_id, text_to_analyze):
    """Score one text with TextBlob and with the Keras classifier.

    Returns a six-item list:
    ``[object_id, subjectivity, polarity, label, score, flag]`` where
    subjectivity and polarity come from TextBlob, ``label`` and ``score`` come
    from ``Sentiment()``, and ``flag`` is 1 when the label is ``'positive'``
    and 0 otherwise.
    """
    blob_sentiment = TextBlob(text_to_analyze).sentiment
    label, score = Sentiment(text_to_analyze)
    flag = 1 if label == 'positive' else 0
    return [object_id, blob_sentiment.subjectivity, blob_sentiment.polarity, label, score, flag]

In [10]:
# Load the tweet table and drop every row that contains a missing value.
# dropna() returns a new frame, so its result has to be kept (a bare
# `df.dropna()` statement changes nothing).
# NOTE(review): this drops a row when ANY column is missing; pass subset=[...]
# if only some columns should be required.
df = pd.read_csv(csv_path).dropna()
# Only the id and the tweet text are needed for sentiment scoring.
text_df = df[['OBJECTID', 'text']]

In [11]:
# Preview the first rows of the loaded tweet table.
df.head()

Unnamed: 0,OBJECTID,lat,long,dtg,user_name,user_id,text
0,1,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,WTFFFFFF https://t.co/2DT5PxqOc2
1,2,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,Pretty Rick been everywhere and ain't been ain...
2,3,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,I'm filing exempt tomorrow
3,4,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,I got endless videos
4,5,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,Omg they mace the man


In [None]:
# Score every tweet with calculate_sentiment and collect one result row per tweet.
sentiment_list = []

start_time = time.time()
print(start_time)

for row in text_df.iterrows():
    # text_df only carries the 'OBJECTID' and 'text' columns; there is no 'oid'.
    oid = row[1]['OBJECTID']
    text = row[1]['text']
    # Missing CSV cells arrive as NaN rather than None, so use pandas' null test.
    if pd.notna(text):
        try:
            classified_text = calculate_sentiment(oid, text)
            sentiment_list.append(classified_text)
        except Exception as exc:
            # Best effort: report the failing tweet and its cause, then move on.
            # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
            print("Error on oid " + str(oid) + ": " + repr(exc))

end_time = time.time()
# Elapsed wall-clock seconds for the whole scoring pass.
print(end_time - start_time)

In [None]:
# Column names follow the order of the list returned by calculate_sentiment:
# id, TextBlob subjectivity, TextBlob polarity, model label, model score, 1/0 flag.
sentiment_columns = ['oid', 'Subjectivity', 'Polarity', 'Sentiment', 'Accuracy', 'Classification']
sentiment_df = pd.DataFrame(sentiment_list, columns=sentiment_columns)
sentiment_df.head()

In [None]:
# Attach the sentiment scores to the tweet table. The tweet table identifies
# rows by 'OBJECTID' while sentiment_df calls the same id 'oid', so the key is
# named separately for each side (both key columns are kept in the result).
merged_df = pd.merge(df, sentiment_df, left_on='OBJECTID', right_on='oid')
merged_df.head()

In [None]:
import re

# Collect [author, mentioned handle] pairs for every @-mention found in a tweet.
# Case-insensitivity is passed as a compile flag: an inline "(?i)" that is not at
# the very start of a pattern is deprecated and rejected by Python 3.11+.
mention_pattern = re.compile(r'@[a-z0-9_]+', re.IGNORECASE)

mentions = []

for row in merged_df.iterrows():
    text = row[1]['text']
    user = row[1]['user_name']
    # findall returns the full match, so each handle already starts with '@'.
    for handle in mention_pattern.findall(text):
        mentions.append(["@" + user, handle])

In [None]:
# Number of (author, mentioned handle) pairs collected.
len(mentions)

In [None]:
# One row per mention: the tweeting user and the handle they mentioned.
mentions_columns = ['User', 'Mentioned']
mentions_df = pd.DataFrame(mentions, columns=mentions_columns)
mentions_df.head()

In [None]:
# Persist both tables. The target folder is created first so the export does not
# fail when 'output/' is missing.
os.makedirs('output', exist_ok=True)
merged_df.to_csv('output/SentimentData.csv')
mentions_df.to_csv('output/TwitterMentions.csv')

In [None]:
# Connect to the ArcGIS organization as the named user (no password is passed
# here) and create a map widget for "Baltimore".
# NOTE(review): `map` shadows Python's built-in map(); a later cell uses this
# name, so renaming has to be done in both places together.
gis = GIS("https://esridistributor.maps.arcgis.com", "james_jones_ngse")
map = gis.map("Baltimore")

In [None]:
from arcgis.features import SpatialDataFrame

# Build point geometries from the coordinate columns (WKID 4326).
# The longitude column of the tweet table is named 'long' (see the df.head()
# output), not 'lon'.
spatial_df = SpatialDataFrame.from_xy(merged_df, 'long', 'lat', sr=4326)

In [None]:
# TextBlob polarity vs. subjectivity, colored by the model's 1/0 classification.
# All three series are taken from merged_df so they are guaranteed to align.
plt.scatter(merged_df.Polarity, merged_df.Subjectivity, s=100, c=merged_df.Classification)
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.title('Tweet polarity vs. subjectivity');

In [None]:
# Read the neighborhood shapefile (nhood_2010.shp) into a spatial data frame.
# NOTE(review): absolute path on one user's machine; `neighborhoods` is not
# referenced by any other cell visible here.
neighborhoods = SpatialDataFrame.from_featureclass(filename = r"C:\Users\jame9353\Documents\GitHub\NLP-Engine-Integration\Demo Data\Baltimore_neighborhoods\nhood_2010.shp")

In [None]:
# Fetch two hosted items by id and draw both on the map widget created earlier.
# NOTE(review): presumably these are the tweets and neighborhood layers the
# variable names suggest - confirm the item ids.
tweets_fc = gis.content.get('8dc29960489a4d4ea98972e3d0c53e4d')
neighborhoods_fc = gis.content.get('225e71a84e554d319224baae9708e1ee')
map.add_layer(neighborhoods_fc)
map.add_layer(tweets_fc)
map

In [None]:
import arcgis

# Aggregate the tweet points into the neighborhood areas.
# NOTE(review): keep_boundaries_with_no_points=True presumably keeps neighborhoods
# that contain no tweets - confirm against the arcgis docs.
# The spelling `aggregrate` is kept because later cells use this exact name.
aggregrate_neighborhoods = arcgis.features.analysis.aggregate_points(tweets_fc, neighborhoods_fc, 
                                                                     keep_boundaries_with_no_points=True)

In [None]:
# Convert the aggregation result into a spatial data frame and display it.
# NOTE(review): the bare name displays the whole frame; .head() would keep the
# output short.
aggr_neigh_df = SpatialDataFrame.from_layer(aggregrate_neighborhoods)
aggr_neigh_df

In [None]:
# Second map widget: the aggregated neighborhoods drawn with a
# ClassedSizeRenderer on the "Count" field.
map1 = gis.map("Baltimore")
map1.add_layer(aggregrate_neighborhoods, { "renderer": "ClassedSizeRenderer", "field_name":"Count"})
map1

In [None]:
# Run hot-spot analysis on the tweets item using 'Polarity' as the analysis field.
# NOTE(review): assumes the hosted tweets layer has a 'Polarity' attribute - confirm.
# `hot_spots` is not used by any other cell visible here.
hot_spots = arcgis.features.analyze_patterns.find_hot_spots(tweets_fc, analysis_field='Polarity')

In [None]:
# Third map widget.
# NOTE(review): this adds the aggregated neighborhoods again (without a renderer),
# while `hot_spots` from the previous cell is never displayed - confirm whether
# the hot-spot result was meant to be shown here.
map2 = gis.map("Baltimore")
map2.add_layer(aggregrate_neighborhoods)
map2

In [None]:
# Scratch comparison on a single text: NaiveBayes sentiment plus noun phrases.
# NOTE: `text` is not defined in this cell; it is whatever the most recently
# executed loop left bound, so the result depends on execution order.
sa = TextBlob(text, analyzer=NaiveBayesAnalyzer())
# Look the sentiment up once and read its three fields by position.
nb_sentiment = sa.sentiment
classification_nba = nb_sentiment[0]
p_pos = nb_sentiment[1]
p_neg = nb_sentiment[2]

extractor = ConllExtractor()
np_extract = TextBlob(text, np_extractor=extractor)
# Iterating a TextBlob walks the characters of its string; the phrases found
# by the configured extractor are exposed through .noun_phrases.
np_list = list(np_extract.noun_phrases)

In [None]:
    oid = row[1]['oid']
    lat = row[1]['lat']
    lon = row[1]['lon']
    date = row[1]['dtg']
    user = row[1]['user_name']
    text = row[1]['text']