# Analyze Baltimore Riot Twitter data

This process takes a subset of Twitter data related to the Baltimore Riots and conducts analysis on the tweet text to determine if the areas most affected by the Riots can be recreated.  This analysis uses a combination of a trained Long Short Term Memory (LSTM) Recurrent Neural Network (RNN) and Natural Language Processing Techniques.  

The first step is to import the necessary Python Packages.  

In [1]:
import os 
import json
import requests
import textblob
import time
import tweepy
import re

import numpy as np
import pandas as pd
import keras.preprocessing.text as kpt
import matplotlib.pyplot as plt

from keras.models import load_model
from arcgis.gis import GIS
from arcgis.features import SpatialDataFrame
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor
from textblob import TextBlob
from collections import Counter
from operator import itemgetter

Using TensorFlow backend.


In [2]:
consumer_key = '6gokUJ7gZ3ixFNkIRUjxL7Xwf'
consumer_secret = 'Us8FgUedJ610MDlH4ZK0wga1AtY4wNALQdHf50g3pj7Lm3IWAG'
access_token ='542866005-43KxBY08C7knS8VYNAycISB3AyqYmONzt2IhWk3m'
access_token_secret = 'YZ8oJWoyR3SgbIfo1I6PJV5wFbMJSVjIukCwiNPryjXzM'

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [4]:
csv_path = r'Baltimore Riots Tweets/baltimore_twitter.csv'
model = load_model('models/Twitter_SA_Model.h5')

In [5]:
with open('dictionary/dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

In [6]:
tokenizer = Tokenizer(num_words=20000)

In [7]:
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            pass
    return wordIndices

In [8]:
def Sentiment(tweet_text):
    labels = ['positive', 'negative']
    testArr = convert_text_to_index_array(tweet_text)
    twt = tokenizer.sequences_to_matrix([testArr], mode='binary')
    twt = pad_sequences(twt, maxlen=86, dtype='int32', padding='post', truncating='post', value=0)
    sentiment = model.predict(twt)
    accuracy = sentiment[0][np.argmax(sentiment)] * 100
    tweetSent = labels[np.argmax(sentiment)]
    return tweetSent, accuracy

In [9]:
def calculate_sentiment(object_id, text_to_analyze):
    sent_sp = TextBlob(text_to_analyze)
    #nba_sent = TextBlob(text_to_analyze, analyzer=NaiveBayesAnalyzer())
    subjectivity = sent_sp.sentiment.subjectivity
    polarity = sent_sp.sentiment.polarity
    sentiment_tf = Sentiment(text_to_analyze)
    classification_tf = sentiment_tf[0]
    if classification_tf == 'positive':
        classification_num = 1
    else:
        classification_num = 0
    accuracy_tf = sentiment_tf[1]
    nba_sentiment = ""
    nouns = []
    verbs = []
    for part in sent_sp.tags:
        if part[1].startswith("V"):
            verbs.append(part[0])
        elif part[1].startswith("N"):
            nouns.append(part[0])
        
    for noun in sent_sp.noun_phrases:
        if noun not in nouns:
            nouns.append(noun)
    
    return object_id, subjectivity, polarity, classification_tf, accuracy_tf, classification_num, nouns, verbs, nba_sentiment

In [10]:
df = pd.read_csv(csv_path)
df.dropna()
text_df = df[['OBJECTID', 'text']]

In [11]:
df.head()

Unnamed: 0,OBJECTID,lat,long,dtg,user_name,user_id,text
0,1,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,WTFFFFFF https://t.co/2DT5PxqOc2
1,2,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,Pretty Rick been everywhere and ain't been ain...
2,3,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,I'm filing exempt tomorrow
3,4,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,I got endless videos
4,5,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,Omg they mace the man


In [12]:
mentions = []
mentions_dict = {}
hashtags = {}

for row in df.iterrows():
    oid = row[1]['OBJECTID']
    text = row[1]['text']
    user = row[1]['user_name']
    match = re.findall(r'@(?i)[a-z0-9_]+', text)
    if len(match) > 0:
        mentions_dict[oid] = match
        for handle in match:
            mentions.append(["@" + user, handle])
    hash_match = re.findall(r'#(?i)[a-z0-9_]+', text)
    if len(hash_match) > 0:
        hashtags[oid] = hash_match

  if __name__ == '__main__':
  


In [13]:
print(len(mentions))
print(len(hashtags))

7016
4338


In [14]:
mentions_columns = ['User', 'Mentioned']
mentions_df = pd.DataFrame(mentions, columns=mentions_columns)
mentions_df.head()

Unnamed: 0,User,Mentioned
0,@lizbreaux,@EternalWeather1
1,@latisha_92,@TrinaBraxton
2,@HotBoy_Gotti,@im_taedoe_bitch
3,@latisha_92,@towandabraxton
4,@HousingWatchMD,@mbta535


In [15]:
mentions_dict

{'6': ['@EternalWeather1'],
 '10': ['@TrinaBraxton'],
 '11': ['@im_taedoe_bitch'],
 '12': ['@towandabraxton'],
 '17': ['@mbta535',
  '@JacobGaffney',
  '@mike_ciklin',
  '@Fanniegate101',
  '@aspit'],
 '24': ['@mbta535',
  '@JacobGaffney',
  '@mike_ciklin',
  '@Fanniegate101',
  '@aspit'],
 '27': ['@mike_ciklin',
  '@mbta535',
  '@JacobGaffney',
  '@Fanniegate101',
  '@aspit'],
 '31': ['@GeoffLRamsey'],
 '33': ['@wbaltv11'],
 '41': ['@_MoneyteamBlack'],
 '43': ['@suckafree_jazz'],
 '44': ['@NeddraASmith',
  '@SChurch00',
  '@MayorSRB',
  '@audiojoe13',
  '@HeathSandell'],
 '48': ['@NeddraASmith',
  '@SChurch00',
  '@MayorSRB',
  '@audiojoe13',
  '@HeathSandell'],
 '50': ['@DanEwald'],
 '52': ['@mbta535',
  '@JacobGaffney',
  '@mike_ciklin',
  '@Fanniegate101',
  '@aspit'],
 '59': ['@aminah_brown', '@_waynelee__'],
 '68': ['@DanEwald'],
 '69': ['@Bmore_jolley33', '@microsoft42'],
 '72': ['@wbaltv11'],
 '85': ['@BreweryOmmegang'],
 '91': ['@Ganja_ExcitesMe'],
 '94': ['@microsoft42'],
 '1

In [16]:
mentions_df.to_csv('output/TwitterMentions.csv')

In [17]:
df['Mentions'] = df['OBJECTID'].map(mentions_dict)

In [18]:
df['Hashtags'] = df['OBJECTID'].map(hashtags)

In [19]:
df.head(10)

Unnamed: 0,OBJECTID,lat,long,dtg,user_name,user_id,text,Mentions,Hashtags
0,1,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,WTFFFFFF https://t.co/2DT5PxqOc2,,
1,2,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,Pretty Rick been everywhere and ain't been ain...,,
2,3,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,I'm filing exempt tomorrow,,
3,4,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,I got endless videos,,
4,5,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,Omg they mace the man,,
5,6,39.281348,-76.622841,Mon Apr 27 23:02:06 +0000 2015,lizbreaux,32687260.0,"@EternalWeather1 yes, thanks for checking. Jus...",[@EternalWeather1],
6,7,39.317959,-76.597527,Mon Apr 27 23:02:20 +0000 2015,LaySoSilly_,497954400.0,Wish I had someone to text or talk to,,
7,8,39.283331,-76.683991,Mon Apr 27 23:02:23 +0000 2015,1rockstarjay,133151800.0,T ' !,,
8,9,39.273615,-76.689659,Mon Apr 27 23:02:32 +0000 2015,mmccxxii,1636984000.0,why county schools getting out early ?,,
9,10,39.321848,-76.595459,Mon Apr 27 23:02:42 +0000 2015,latisha_92,477367100.0,@TrinaBraxton Please Pray For Baltimore,[@TrinaBraxton],


In [20]:
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [21]:
df.head()

Unnamed: 0,OBJECTID,lat,long,dtg,user_name,user_id,text,Mentions,Hashtags
0,1,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,wtffffff httpstco2dt5pxqoc2,,
1,2,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,pretty rick been everywhere and aint been aint...,,
2,3,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,im filing exempt tomorrow,,
3,4,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,i got endless videos,,
4,5,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,omg they mace the man,,


In [22]:
df_shape = df.shape
print("Total Rows:  " + repr(df_shape[0]))
print("Total Columns:  " + repr(df_shape[1]))

Total Rows:  23137
Total Columns:  9


In [23]:
subj_dict = {}
pol_dict = {}
class_tf_dict = {}
acc_tf_dict = {}
class_num_dict = {}
nouns_dict = {}
verbs_dict = {}
nba_dict = {}

start_time = time.time()
print(start_time)

errors = 0
count = 0

for row in text_df.iterrows():
    oid = row[1]['OBJECTID']
    text = row[1]['text']
    if text != None:
        try:
            oid, subj, pol, class_tf, acc_tf, class_num, nouns, verbs, nba = calculate_sentiment(oid, text)
            subj_dict[oid] = subj
            pol_dict[oid] = pol
            class_tf_dict[oid] = class_tf
            acc_tf_dict[oid] = acc_tf
            class_num_dict[oid] = class_num
            nouns_dict[oid] = nouns
            verbs_dict[oid] = verbs
            nba_dict[oid] = nba
            if count % 100 == 0:
                print(oid, subj, pol, class_tf, acc_tf, class_num, nouns, verbs, nba)
        except:
            errors +=1
            print("Error on oid " + str(oid))
        count +=1
        
        
    
end_time = time.time()
print(end_time - start_time)
print("Process completed with an error percentage of " + repr((errors/df_shape[0]) * 100))

1543959379.4113915
1 0.0 0.0 negative 61.18459701538086 0 ['WTFFFFFF', 'https', '//t.co/2DT5PxqOc2', 'wtffffff'] [] 
101 0.5 -0.375 negative 61.185431480407715 0 ['bout', 'da', 'bookins', 'tonight', 'talkin bout', 'da bookins rey'] ['rey', 'be'] 
201 0.0 0.0 negative 63.83033990859985 0 ['Baltimore', 'home', 'BaltimoreRescueBus', 'location', 'ride', 'home', 'baltimore', 'tweet #', 'baltimorerescuebus', 'ride home'] ['are', 'stuck', 'trying', 'get'] 
301 0.0 0.0 negative 61.17587685585022 0 ['Dis', 'Bitch', 'Wan', 'Speak', 'Her', 'Lor', 'Ass', 'Was', 'M.I.A', 'dis bitch wan', 'speak', 'lor ass was m.i.a'] [] 
401 0.0 0.0 negative 61.173540353775024 0 ['@', 'qveen__lee', '@ qveen__lee'] ['follow'] 
501 0.0 0.0 negative 61.18459701538086 0 ['Phone', 'phone'] [] 
601 0.0 0.0 negative 61.18062138557434 0 ['MustBE_maya', 'girl', 'house', 'mustbe_maya'] ['getting'] 
701 0.0 0.0 negative 61.18459701538086 0 ['peace', 'National', 'Aquarium', 'https', '//t.co/G65gxmztgb', 'peace # prayingforbalt

5301 0.125 0.0 negative 61.19639873504639 0 ['team', 'baltimore', 'ourcity', '@', 'Baltimore', 'https', '//t.co/YV6hnOswPO', 'team # baltimore', '# smalltimore # ourcity # ourpeople # onelove @'] ["'re", 'is'] 
5401 0.3 -0.2 negative 61.183249950408936 0 ['Graffiti', 'Removal', 'request', 'Android', 'S', 'President', 'St', 'Baltimore', 'http', '//t.co/FC1yeGmZj2', 'Graffiti', 'b', 'stickers', 'city', 'sign', 'opened graffiti removal', 'android', 's president', 'st baltimore', 'graffiti', 'b stickers', 'green city'] ['Opened'] 
5501 0.0 0.0 negative 61.19234561920166 0 ['hour', 'nap', 'hour nap'] ['need'] 
5600 0.8 -0.2 negative 61.18472218513489 0 ['@', 'DCDouchbag', 'BMorehonest', 'dcdouchbag', 'talks shit', 'bmorehonest'] ['talks', 'do', 'understand'] 
5700 0.25 0.390625 negative 64.76348638534546 0 ['spirit', 'Oriole', 'Park', 'Camden', 'Yards', '@', 'mlb', 'Baltimore', 'MD', 'https', '//t.co/JhPwAirK2O', 'ca', 'oriole', 'camden', '@ mlb', 'baltimore', 'md'] ['Ca', 'be', 'supporting

10100 0.15000000000000002 -0.2 negative 61.183249950408936 0 ['Complaint', 'request', 'Graves', 'St', 'http', '//t.co/S8X0huYpKW', 'Gone', 'arrival', 'closed parking complaint', 'graves st', 'gone'] ['Closed', 'Parking'] 
10200 0.40750000000000003 0.21000000000000002 negative 61.18001341819763 0 ['Perfect', 'beer', 'summertime', 'Duckpin', 'Pale', 'Ale', '@', 'Aloha', 'Liquors', 'http', '//t.co/38zk0Map5s', 'perfect', 'summertime ...', 'drinking', 'duckpin pale ale', 'aloha liquors'] ['is', 'starting', 'Drinking'] 
10300 0.7833333333333333 0.6333333333333334 negative 61.21233105659485 0 ['Derrick', 'Rose', 'Always', 'Going', 'B', 'player', 'Win', 'Lose', 'TeamBulls', 'derrick rose', 'going', 'favorite player', 'win', 'lose', 'teambulls'] [] 
10400 0.8500000000000001 0.95367431640625 negative 61.18379831314087 0 ['Im', 'im'] [] 
10500 0.0 0.0 negative 61.18486523628235 0 ['Goodnight', 'republicans', 'goodnight', 'progressive republicans'] ['progressive'] 
10600 0.6000000000000001 -0.35 

15400 0.0 0.0 negative 61.18459701538086 0 ['Gallardo', 'Spyder', '+', 'Z06', 'Stingray', 'https', '//t.co/wQhJNk3vlm', 'gallardo spyder', 'z06 stingray'] [] 
15500 0.0 0.0 negative 61.18308901786804 0 ['Automotive', 'Job', 'Baltimore', 'MD', 'MileOne', 'Automotive', 'http', 'Dealership', 'Cars', 'Car', 'VeteranJob', 'automotive', 'job', 'baltimore', 'md', 'mileone automotive', '//t.co/5zq5ajjl0n #', 'dealership', 'cars', 'car', 'veteranjob'] [] 
15600 1.0 -0.6 negative 61.175328493118286 0 ['iPhone', 'W', 'Cold', 'Spring', 'Ln', 'http', '//t.co/kx0f44a0cc', 'cleaning', 'streets', 'opened', 'cold', 'ln'] ['Opened'] 
15700 0.0 0.0 negative 61.18459701538086 0 ['ProjectMgmt', 'Job', 'alert', 'Java', 'Developer/Application', 'Developer', '|', 'CGI', '|', 'Baltimore', 'MD', 'http', 'Jobs', 'projectmgmt', 'job', 'java developer/application developer', 'cgi', '| #', 'baltimore', 'md', '//t.co/nnxlryh5z8 #', 'jobs'] ['Hiring'] 
15800 0.4 0.08333333333333334 negative 61.184996366500854 0 ['Nee

19500 1.0 0.25 negative 61.18468642234802 0 ['pretty liiike'] ["'m"] 
19600 0.0 0.0 negative 60.97674369812012 0 ['rents', 'hubcap', '@', 'Nacho', 'Mamas', 'Canton', 'MD', 'https', '//t.co/R28bjMU9s0', '# hubcap', '# nachomamas # margarita @', 'nacho mamas', 'canton', 'md'] ['Teaching', 'is'] 
19700 0.45 -0.35 negative 61.184149980545044 0 ['Dirty', 'Alley', 'Street', 'Dudley', 'Ave', 'Baltimore', 'http', '//t.co/EkZz0Qee7J', 'debris', 'closed dirty alley', 'street request', 'dudley ave baltimore', 'cleaned'] ['request', 'Cleaned', 'removed'] 
19800 0.4083333333333333 0.175 negative 61.18388772010803 0 ['Sweet', 'sunshine', 'Biere', 'Garde', 'Lips', 'Faith', 'Days', 'Inn', 'Inner', 'Harbor', 'Hotel', 'http', '//t.co/ZjbIieGZ28', 'sweet sunshine', 'drinking', 'biere', 'garde', 'lips', 'faith', 'inner', 'harbor hotel'] ['Drinking', '@'] 
19900 0.0 0.0 negative 61.24871373176575 0 ['Belmont', 'perf', 'Supra', 'premium', 'leather', 'premium', 'laces', 'https', '//t.co/aaaNJhr8BB', 'belmont

In [24]:
df['Subjectivity'] = df['OBJECTID'].map(subj_dict)
df['Polarity'] = df['OBJECTID'].map(pol_dict)
df['Classification (Tensorflow)'] = df['OBJECTID'].map(class_tf_dict)
df['Accuracy (Tensorflow)'] = df['OBJECTID'].map(acc_tf_dict)
df['Classification Number'] = df['OBJECTID'].map(class_num_dict)
df['Nouns'] = df['OBJECTID'].map(nouns_dict)
df['Verbs'] = df['OBJECTID'].map(verbs_dict)
df['NBA'] = df['OBJECTID'].map(nba_dict)

In [25]:
df.to_csv('output/SentimentData.csv')

In [26]:
df.head()

Unnamed: 0,OBJECTID,lat,long,dtg,user_name,user_id,text,Mentions,Hashtags,Subjectivity,Polarity,Classification (Tensorflow),Accuracy (Tensorflow),Classification Number,Nouns,Verbs,NBA
0,1,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,wtffffff httpstco2dt5pxqoc2,,,0.0,0.0,negative,61.184597,0,"[WTFFFFFF, https, //t.co/2DT5PxqOc2, wtffffff]",[],
1,2,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,pretty rick been everywhere and aint been aint...,,,1.0,0.390625,negative,61.184955,0,"[Pretty, Rick, rick, ai n't]","[been, ai, been, been, touched, been]",
2,3,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,im filing exempt tomorrow,,,0.0,0.0,negative,61.184597,0,"[exempt, tomorrow]","['m, filing]",
3,4,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,i got endless videos,,,0.75,-0.125,negative,61.245066,0,"[videos, endless videos]",[got],
4,5,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,omg they mace the man,,,0.0,0.0,negative,61.184657,0,"[man, omg]",[mace],


In [27]:
enriched_df = pd.read_csv('output/Twitter_locations_Neighborhoods.csv')
enriched_df.head()

Unnamed: 0,OBJECTID,Join_Count,TARGET_FID,Field1,oid_,lat,lon,dtg,user_name,user_id,...,Hashtags,Subjectivity,Polarity,Classification__Tensorflow_,Accuracy__Tensorflow_,Classification_Number,Nouns,Verbs,Long,LABEL
0,1,1,1,0,1.0,39.274819,-76.608696,Mon Apr 27 23:00:57 +0000 2015,PandaMc8,388008100.0,...,,0.0,0.0,negative,61.184597,0,"['WTFFFFFF', 'https', '//t.co/2DT5PxqOc2', 'wt...",[],-76.608696,Riverside
1,2,1,2,1,2.0,39.292146,-76.567825,Mon Apr 27 23:01:15 +0000 2015,okaykerra,351905900.0,...,,1.0,0.390625,negative,61.184955,0,"['Pretty', 'Rick', 'rick', ""ai n't""]","['been', 'ai', 'been', 'been', 'touched', 'been']",-76.567825,Baltimore Highlands
2,3,1,3,2,3.0,39.293876,-76.682365,Mon Apr 27 23:01:41 +0000 2015,letgoletkarma,44980940.0,...,,0.0,0.0,negative,61.184597,0,"['exempt', 'tomorrow']","[""'m"", 'filing']",-76.682365,Edmondson Village
3,4,1,4,3,4.0,39.309108,-76.666054,Mon Apr 27 23:01:47 +0000 2015,PrettyMoee,265631400.0,...,,0.75,-0.125,negative,61.245066,0,"['videos', 'endless videos']",['got'],-76.666054,Northwest Community Action
4,5,1,5,4,5.0,39.281066,-76.631622,Mon Apr 27 23:02:05 +0000 2015,khyona_,2157380000.0,...,,0.0,0.0,negative,61.184657,0,"['man', 'omg']",['mace'],-76.631622,Washington Village/Pigtown


In [28]:
neighborhood_nouns = {}
neighborhood_verbs = {}
neighborhood_hashtags = {}

In [29]:
for row in enriched_df.iterrows():
    neighborhood = row[1]['LABEL']
    if neighborhood not in neighborhood_nouns.keys():
        neighborhood_nouns[neighborhood] =[]
    if neighborhood not in neighborhood_verbs.keys():
        neighborhood_verbs[neighborhood] =[]
    if neighborhood not in neighborhood_hashtags.keys():
        neighborhood_hashtags[neighborhood] =[]

In [30]:
for row in enriched_df.iterrows():
    neighborhood = row[1]['LABEL']
    noun_list = row[1]['Nouns']
    verb_list = row[1]['Verbs']
    hashtag_list = row[1]['Hashtags']
    nouns = noun_list[1:-1].split(',')
    verbs = verb_list[1:-1].split(',')
    
    
    for noun in nouns:
        neighborhood_nouns[neighborhood].append(noun)
    for verb in verbs:
        neighborhood_verbs[neighborhood].append(verb)
    if type(hashtag_list) != float:
        hashtags = hashtag_list[1:-1].split(',')
        for hashtag in hashtags:
            neighborhood_hashtags[neighborhood].append(hashtag)

In [31]:
top_nouns_neighborhood = {}
top_verbs_neighborhood = {}
top_hashtags_neighborhood = {}

In [32]:
for key, value in neighborhood_nouns.items():
    noun_counts = dict(Counter(value))
    sorted_noun_counts = sorted(noun_counts.items(), key=itemgetter(1))
    top_nouns_neighborhood[key] = sorted_noun_counts[len(sorted_noun_counts)-5:]
    print(key, sorted_noun_counts[len(sorted_noun_counts)-5:])

Riverside [("'Complaint'", 25), (" 'Baltimore'", 30), (" 'St'", 32), (" 'request'", 40), (" 'http'", 65)]
Baltimore Highlands [(" 'street request'", 35), (" 'E'", 37), (" 'Baltimore'", 42), (" 'St'", 52), (" 'http'", 95)]
Edmondson Village [(" 'request'", 6), ("'@'", 7), (" 'http'", 8), (" 'breezyblack2202'", 8), (" '@'", 30)]
Northwest Community Action [(" 'niyell_'", 9), (" 'take_kierame'", 11), ("'@'", 15), ("'Shai_toReal'", 18), (" 'shai_toreal'", 19)]
Washington Village/Pigtown [(" 'Baltimore'", 27), ("'@'", 27), (" 'St'", 33), (" '@'", 51), (" 'http'", 67)]
Stadium Area [(" 'https'", 86), (" 'SportsRoadhouse'", 147), (" 'sportsroadhouse'", 147), ("'Orioles'", 149), (" 'http'", 175)]
Coldstream Homestead Montebello [(" 'Alley'", 31), (" 'Street'", 31), (" 'street request'", 31), (" '@'", 42), (" 'http'", 57)]
Irvington [(" 'Friday'", 13), (" 'figured'", 13), (" '@'", 15), (" 'http'", 19), (" 'https'", 43)]
Yale Heights [(" 'lucia ave'", 3), (" 'melo'", 4), (" 'request'", 4), (" 'd

In [33]:
for key, value in neighborhood_hashtags.items():
    ht_counts = dict(Counter(value))
    sorted_ht_counts = sorted(ht_counts.items(), key=itemgetter(1))
    top_hashtags_neighborhood[key] = sorted_ht_counts[len(sorted_ht_counts)-5:]
    print(key, sorted_ht_counts[len(sorted_ht_counts)-5:])

Riverside [(" '#Baltimore'", 2), ("'#photo'", 2), ("'#Baltimore'", 3), ("'#baltimore'", 4), ("'#Ijaa'", 6)]
Baltimore Highlands [("'#FTL'", 1)]
Edmondson Village [("'#Dboywalk'", 1), ("'#DonLemonReporting'", 2), ("'#breezyblack'", 2), (" '#DMV'", 2), ("'#Baltimore'", 4)]
Northwest Community Action [("'#Lrt'", 1)]
Washington Village/Pigtown [("'#DayOutWithThomas'", 1), ("'#photo'", 1), ("'#Baltimore'", 2), ("'#BeatsByDre'", 2), (" '#BeatsByDre'", 2)]
Stadium Area [(" '#Baltimore'", 5), (" '#Orioles'", 6), ("'#Baltimore'", 8), (" '#SportsRoadhouse'", 147), ("'#Orioles'", 152)]
Coldstream Homestead Montebello [(" '#insider'", 1), (" '#slikklornigga'", 1), ("'#oomf'", 1), (" '#baltimore'", 2), ("'#BaltimoreUprising'", 2)]
Irvington [(" '#fullfigured'", 11), (" '#biggirls'", 12), ("'#fullfiguredfriday'", 13), (" '#bbw'", 13), (" '#curvy'", 26)]
Yale Heights [("'#bb'", 1)]
Hamilton Hills [("'#referee'", 1), (" '#Bmore'", 2), ("'#NYC'", 2), (" '#iTunesRadio'", 2), ("'#DB1stImpression'", 3)]
W

In [34]:
for key, value in neighborhood_verbs.items():
    verbs_counts = dict(Counter(value))
    sorted_verb_counts = sorted(verbs_counts.items(), key=itemgetter(1))
    top_verbs_neighborhood[key] = sorted_verb_counts[len(sorted_verb_counts)-5:]
    print(key, sorted_verb_counts[len(sorted_verb_counts)-5:])

Riverside [("'Drinking'", 6), ("'Closed'", 12), ('', 18), ("'Opened'", 24), (" 'Parking'", 25)]
Baltimore Highlands [(" 'Cleaned'", 12), (" 'removed'", 12), ("'request'", 16), (" 'request'", 37), ("'Opened'", 53)]
Edmondson Village [(" 'did'", 3), ("'breezyblack2202'", 3), ("'Opened'", 3), ("'Weeds'", 4), ('', 6)]
Northwest Community Action [(" 'get'", 6), ("'need'", 6), (" 'got'", 7), (" 'be'", 15), ('', 31)]
Washington Village/Pigtown [(' "\'s"', 9), (" 'be'", 9), (" 'is'", 16), ("'Opened'", 19), ('', 44)]
Stadium Area [("'be'", 5), (' "\'s"', 6), (" 'be'", 6), (" 'is'", 7), ('', 79)]
Coldstream Homestead Montebello [(' "\'m"', 23), (" 'going'", 27), (" 'be'", 32), ('"\'m"', 33), ('', 70)]
Irvington [("'was'", 4), ("'is'", 5), (" '@'", 6), ('""\'s""', 22), ('', 38)]
Yale Heights [(" 'shit'", 2), (" 'was'", 2), ("'Opened'", 2), (" 'Parking'", 2), ('', 8)]
Hamilton Hills [("'is'", 16), ("'do'", 16), (" 'do'", 17), (' "\'m"', 22), ('', 138)]
Westport [(" 'removed'", 5), ("'request'", 9)

Chinquapin Park [(" 'go'", 2), ('""\'m""', 2), ("'thats'", 3), ("'is'", 4), ('', 23)]
Pleasant View Gardens [("'looks'", 3), ("'been'", 3), ("'be'", 4), ("'is'", 6), ('', 9)]
Pen Lucy [(' "\'m"', 1), (" 'clicked'", 1), ("'wants'", 1), (" 'is'", 1), ('', 2)]
Cross Keys [(" 'are'", 1), (" 'fucked'", 1), ("'Meeting'", 1), ('""\'m""', 4), ('', 9)]
Rosebank [("'welcome'", 1), ("'take'", 1), ("'Seeing'", 1), ('', 2), ("'Drinking'", 3)]
Lake Evesham [('"\'re"', 1), (" 'finding'", 1), ("'wan'", 1), (" '@'", 1), ('', 3)]
Richnor Springs [(" 'fuck'", 1), ("'Talking'", 1), (" 'bae'", 1), (" 'love'", 1), ("'looked'", 1)]
Parkside [(" 'go'", 3), (" 'do'", 3), ('"\'m"', 4), (" 'get'", 4), ('', 7)]
Middle Branch/Reedbird Parks [(" 'Work'", 1), (" 'Wanted'", 1), (" 'Moved'", 1), ("'blessed'", 1), ('', 1)]
Fairmont [(" 'tommrow'", 1), ("'swear'", 1), (" 'made'", 1), (" 'staying'", 1), ('', 4)]
Armistead Gardens [(" 'covered'", 1), ("'Spending'", 1), (" 'hunting'", 1), (" 'keeps'", 1), (" 'coming'", 1)]

In [35]:
sorted_noun_counts[1:]

[(" 'girl'", 1), (" 'man'", 1), (" 'one'", 1)]

In [36]:
gis = GIS("http://esrifederal.maps.arcgis.com", "james_jones_federal", "QWerty654321@!")

sdf = pd.DataFrame.spatial.from_featureclass("Baltimore_neighborhoods/nhood_2010.shp")
sdf.head()

Unnamed: 0,index,ACRES,COLOR_2,LABEL,NBRDESC,SHAPE,Shape_Area,Shape_Leng
0,0,46.710432,2,Abell,ABELL,"{""rings"": [[[1422345.3370833546, 603620.765450...",2034706.0,5892.827778
1,1,260.023864,2,Allendale,ALLENDALE,"{""rings"": [[[1404989.665027067, 592042.0498981...",11326640.0,14276.845363
2,2,144.678075,2,Arcadia,ARCADIA,"{""rings"": [[[1434376.8304087818, 608229.661088...",6302177.0,12268.078903
3,3,115.584689,5,Arlington,ARLINGTON,"{""rings"": [[[1401059.4859543592, 612450.588014...",5034869.0,9756.115594
4,4,302.585653,2,Armistead Gardens,ARMISTEAD GARDENS,"{""rings"": [[[1437179.4596460313, 597502.828539...",13180630.0,16915.744134


In [37]:
sdf['Most_Common_Nouns'] = sdf['LABEL'].map(top_nouns_neighborhood)
sdf['Most_Common_Verbs'] = sdf['LABEL'].map(top_verbs_neighborhood)
sdf['Most_Common_Hashtags'] = sdf['LABEL'].map(top_hashtags_neighborhood)

In [38]:
sdf.head()

Unnamed: 0,index,ACRES,COLOR_2,LABEL,NBRDESC,SHAPE,Shape_Area,Shape_Leng,Most_Common_Nouns,Most_Common_Verbs,Most_Common_Hashtags
0,0,46.710432,2,Abell,ABELL,"{""rings"": [[[1422345.3370833546, 603620.765450...",2034706.0,5892.827778,"[( 'baltimore', 7), ( '@', 8), ( 'i', 10), ('@...","[( 'do', 4), ( 'been', 6), ('is', 6), ( 'is', ...","[('#MingleMonday', 1), ( '#MingleMondays', 1),..."
1,1,260.023864,2,Allendale,ALLENDALE,"{""rings"": [[[1404989.665027067, 592042.0498981...",11326640.0,14276.845363,"[( 'Rodman', 3), ( 'High', 3), ( 'Baltimore', ...","[( 'running', 3), ( 'was', 3), ( 'be', 5), ( '...","[( '#430', 1), ('#MetGala', 1), ('#classicman'..."
2,2,144.678075,2,Arcadia,ARCADIA,"{""rings"": [[[1434376.8304087818, 608229.661088...",6302177.0,12268.078903,"[( 'request', 8), ( 'Eierman', 8), ( 'Ave', 9)...","[( 'Cleaned', 2), ( 'request', 3), ( 'removed'...","[('#Praise', 1), ( '#Worship', 1)]"
3,3,115.584689,5,Arlington,ARLINGTON,"{""rings"": [[[1401059.4859543592, 612450.588014...",5034869.0,9756.115594,"[( 'My', 7), ('i', 10), ( '@', 14), ( 'http', ...","[('was', 4), ( 'was', 5), ( 'know', 6), ('do',...","[('#Kingjames', 1), ('#Pimlico', 4)]"
4,4,302.585653,2,Armistead Gardens,ARMISTEAD GARDENS,"{""rings"": [[[1437179.4596460313, 597502.828539...",13180630.0,16915.744134,"[('planning', 1), ( 'period', 1), ( 'mouse', 1...","[( 'covered', 1), ('Spending', 1), ( 'hunting'...","[('#eubieBee', 1), ( '#IECA15', 1), ( '#colleg..."


In [39]:
sdf_out = sdf[['LABEL', "Most_Common_Nouns", "Most_Common_Verbs", "Most_Common_Hashtags"]]
sdf_out.to_csv('output/sdf_neighborhoods.csv')
sdf_out.head()

Unnamed: 0,LABEL,Most_Common_Nouns,Most_Common_Verbs,Most_Common_Hashtags
0,Abell,"[( 'baltimore', 7), ( '@', 8), ( 'i', 10), ('@...","[( 'do', 4), ( 'been', 6), ('is', 6), ( 'is', ...","[('#MingleMonday', 1), ( '#MingleMondays', 1),..."
1,Allendale,"[( 'Rodman', 3), ( 'High', 3), ( 'Baltimore', ...","[( 'running', 3), ( 'was', 3), ( 'be', 5), ( '...","[( '#430', 1), ('#MetGala', 1), ('#classicman'..."
2,Arcadia,"[( 'request', 8), ( 'Eierman', 8), ( 'Ave', 9)...","[( 'Cleaned', 2), ( 'request', 3), ( 'removed'...","[('#Praise', 1), ( '#Worship', 1)]"
3,Arlington,"[( 'My', 7), ('i', 10), ( '@', 14), ( 'http', ...","[('was', 4), ( 'was', 5), ( 'know', 6), ('do',...","[('#Kingjames', 1), ('#Pimlico', 4)]"
4,Armistead Gardens,"[('planning', 1), ( 'period', 1), ( 'mouse', 1...","[( 'covered', 1), ('Spending', 1), ( 'hunting'...","[('#eubieBee', 1), ( '#IECA15', 1), ( '#colleg..."
