In [156]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [157]:
# reading pickle files into pandas dataframes
BK_tweets = pd.read_pickle("./BK_tweets.pkl")
SI_tweets = pd.read_pickle("./SI_tweets.pkl")

### CountVectorizer analysis - Brooklyn

In [96]:
from nltk.corpus import stopwords

BK_stopwords = ['www','https','https twitter','https twitter com','twitter',
                'pic twitter','pic twitter com','twitter com',
                'https www', 'https www instagram','instagram','instagram com',
                'www instagram','www instagram com','at the','in the','of the',
                'the protest','if you','this is','on the','to the','you re']

# filtering out tweets that got fewer than 10 likes to make processing easier
BK_tweets = BK_tweets[BK_tweets.nlikes>=10]

# countvectorizing BK tweets
vectorizer = CountVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(BK_tweets['tweet'])

user_ids = list(BK_tweets['user_id'])

countvec_BK = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

# build df with BK user_ids and countvectorized tweets
countvec_BK.insert(0, "user_id", user_ids, False) 

# import list of general stopwords
stopwords = stopwords.words()

# create list of stopwords found in the BK tweets
stopwords = list(set(countvec_BK.columns) & set(stopwords))+BK_stopwords

# remove stopwords from countvectorized BK tweets
countvec_BK = countvec_BK.drop(stopwords,axis=1)

In [97]:
print(countvec_BK.shape)
countvec_BK.head()

(1326, 65345)


Unnamed: 0,user_id,00,00 is,00 is that,00 pm,00 pm at,00 pm reward,000,000 dead,000 dead and,...,ztjnoeyzbv,ztk8egctts,zuck,zudujdsan6,zuotnvqwwr,zy3adslzgq,zy3adslzgq at,zy3adslzgq at prospect,zyhfawyzcb,zzqbhlczhs
0,1411902361,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,87384843,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,398136617,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,516918524,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,416506824,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
pd.options.display.max_colwidth = 300

BK_tweets[BK_tweets['tweet'].str.contains("officer")].tweet.shape

(64,)

In [127]:
pd.options.display.max_rows = 4000

print('Number of BK tweets: '+str(countvec_BK.shape[0]))
countvec_BK.iloc[:,1:].sum(axis=0).sort_values(ascending=False)[:100]

Number of BK tweets: 1326


police                763
protest               391
black                 307
the police            235
status                222
people                198
floyd                 170
george                161
george floyd          154
brooklyn              139
lives                 137
blacklivesmatter      128
blm                   128
matter                121
black lives           110
today                 102
peaceful              101
lives matter           99
white                  95
nyc                    94
nypd                   93
like                   91
black lives matter     91
us                     85
brutality              82
police brutality       79
protests               76
barclays               72
right                  72
cops                   69
get                    68
go                     67
justice                61
fuck                   60
protesters             60
violence               60
center                 59
curfew                 58
need        

### CountVectorizer analysis - Staten Island

In [161]:
from nltk.corpus import stopwords

SI_stopwords = ['twitter','twitter com','https','https twitter','https twitter com',
                'pic twitter','pic twitter com','in the','if you','you re','of the','petition http',
                'petition http chng','http chng','http chng it']

# countvectorizing SI tweets
vectorizer = CountVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(SI_tweets['tweet'])

user_ids = list(SI_tweets['user_id'])

countvec_SI = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

# build df with SI user_ids and countvectorized tweets
countvec_SI.insert(0, "user_id", user_ids, False) 

# import list of general stopwords
stopwords = stopwords.words()

# create list of stopwords found in the SI tweets
stopwords = list(set(countvec_SI.columns) & set(stopwords))+SI_stopwords

# remove stopwords from countvectorized SI tweets
countvec_SI = countvec_SI.drop(stopwords,axis=1)

In [162]:
print(countvec_SI.shape)
countvec_SI.head()

(592, 30220)


Unnamed: 0,user_id,01,01 us,01 us george,02,02 george,02 george floyd,06,06 01,06 01 us,...,zh2sz2fx via,zh2sz2fx via change,ziguinchor,ziguinchor region,ziguinchor region senegal,zrd5y8k4,zrd5y8k4 via,zrd5y8k4 via change,zrvq3aesi6,zypog0iptu
0,524076913,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,99202763,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,524076913,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,524076913,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,506497130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
pd.options.display.max_rows = 4000

print('Number of SI tweets: '+str(countvec_SI.shape[0]))
countvec_SI.iloc[:,1:].sum(axis=0).sort_values(ascending=False)[:100]

Number of SI tweets: 592


police                310
protest               174
status                129
people                111
floyd                 109
george                104
george floyd           93
black                  82
the police             64
change                 49
lives                  49
matter                 47
justice                47
peaceful               45
get                    45
officers               43
blm                    43
would                  40
lives matter           39
like                   39
go                     36
island                 36
via                    36
think                  36
stop                   36
death                  35
black lives            35
sign                   35
right                  33
see                    33
going                  33
justice for            33
staten island          32
petition               32
http                   32
staten                 32
us                     31
police officers        30
black lives 

### TF-IDF vectorizer analysis - Brooklyn

In [85]:
from nltk.corpus import stopwords

BK_stopwords = ['www','https','https twitter','https twitter com','twitter',
                'pic twitter','pic twitter com','twitter com',
                'https www', 'https www instagram','instagram','instagram com',
                'www instagram','www instagram com','at the','in the','of the',
                'the protest','if you','this is','on the','to the']

# filtering out tweets that got fewer than 10 likes to make processing easier
BK_tweets = BK_tweets[BK_tweets.nlikes>=10]

# tfidf vectorizing BK tweets
vectorizer = TfidfVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(BK_tweets['tweet'])

user_ids = list(BK_tweets['user_id'])

tfidf_BK = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

# build df with BK user_ids and countvectorized tweets
tfidf_BK.insert(0, "user_id", user_ids, False) 

# import list of general stopwords
stopwords = stopwords.words()

# create list of stopwords found in the BK tweets
stopwords = list(set(tfidf_BK.columns) & set(stopwords))+BK_stopwords

# remove stopwords from countvectorized BK tweets
tfidf_BK = tfidf_BK.drop(stopwords,axis=1)

In [86]:
tfidf_BK.head()

Unnamed: 0,user_id,00,00 is,00 is that,00 pm,00 pm at,00 pm reward,000,000 dead,000 dead and,...,ztjnoeyzbv,ztk8egctts,zuck,zudujdsan6,zuotnvqwwr,zy3adslzgq,zy3adslzgq at,zy3adslzgq at prospect,zyhfawyzcb,zzqbhlczhs
0,1411902361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,87384843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,398136617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,516918524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,416506824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
pd.options.display.max_rows = 4000

print('Number of BK tweets: '+str(tfidf_BK.shape[0]))

# sorting keyword terms by average tfidf score (including zeros)
tfidf_BK.iloc[:,1:].mean(axis=0).sort_values(ascending=False)[:100]

Number of BK tweets: 1326


police                0.016858
protest               0.013188
black                 0.011142
the police            0.009513
status                0.008449
lives                 0.007848
matter                0.007478
people                0.007160
floyd                 0.007149
black lives           0.006989
george                0.006757
lives matter          0.006725
brooklyn              0.006629
george floyd          0.006583
black lives matter    0.006487
blm                   0.005687
blacklivesmatter      0.005602
peaceful              0.004866
today                 0.004694
nyc                   0.004305
barclays              0.004279
nypd                  0.004119
white                 0.004010
like                  0.003907
brutality             0.003880
police brutality      0.003797
fuck                  0.003768
center                0.003671
us                    0.003631
barclays center       0.003515
protests              0.003496
cops                  0.003389
right   

### TF-IDF vectorizer analysis - Staten Island

In [88]:
from nltk.corpus import stopwords

SI_stopwords = ['twitter','twitter com','https','https twitter','https twitter com',
                'pic twitter','pic twitter com','in the','if you','you re','of the']

# tfidf vectorizing BK tweets
vectorizer = TfidfVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(SI_tweets['tweet'])

user_ids = list(SI_tweets['user_id'])

tfidf_SI = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

# build df with BK user_ids and countvectorized tweets
tfidf_SI.insert(0, "user_id", user_ids, False) 

# import list of general stopwords
stopwords = stopwords.words()

# create list of stopwords found in the BK tweets
stopwords = list(set(tfidf_SI.columns) & set(stopwords))+SI_stopwords

# remove stopwords from countvectorized BK tweets
tfidf_SI = tfidf_SI.drop(stopwords,axis=1)

In [89]:
tfidf_SI.head()

Unnamed: 0,user_id,01,01 us,01 us george,02,02 george,02 george floyd,06,06 01,06 01 us,...,zh2sz2fx via,zh2sz2fx via change,ziguinchor,ziguinchor region,ziguinchor region senegal,zrd5y8k4,zrd5y8k4 via,zrd5y8k4 via change,zrvq3aesi6,zypog0iptu
0,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99202763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,506497130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
pd.options.display.max_rows = 4000

print('Number of SI tweets: '+str(tfidf_SI.shape[0]))

# sorting keyword terms by average tfidf score (including zeros)
tfidf_SI.iloc[:,1:].mean(axis=0).sort_values(ascending=False)[:100]

Number of SI tweets: 592


police                0.018258
protest               0.014505
floyd                 0.011940
george                0.011887
status                0.011270
george floyd          0.011112
black                 0.008843
people                0.008689
justice               0.007314
lives                 0.007246
matter                0.007167
change                0.007019
the police            0.006666
lives matter          0.006596
via                   0.006418
sign                  0.006376
petition              0.006246
justice for           0.006156
http                  0.006061
black lives           0.006003
sign the petition     0.005900
chng                  0.005900
chng it               0.005900
petition http         0.005900
petition http chng    0.005900
via change            0.005900
http chng             0.005900
http chng it          0.005900
sign the              0.005900
the petition http     0.005900
the petition          0.005900
black lives matter    0.005668
peaceful