In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [2]:
# Load the Brooklyn (BK) and Staten Island (SI) tweet DataFrames from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
BK_tweets, SI_tweets = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("./BK_tweets.pkl", "./SI_tweets.pkl")
)

### CountVectorizer analysis - Brooklyn

In [4]:
from nltk.corpus import stopwords

# Hand-picked terms to drop from the Brooklyn vocabulary: URL fragments
# (twitter/instagram links) and filler n-grams.
BK_stopwords = ['www','https','https twitter','https twitter com','twitter',
                'pic twitter','pic twitter com','twitter com',
                'https www', 'https www instagram','instagram','instagram com',
                'www instagram','www instagram com','at the','in the','of the',
                'the protest','if you','this is','on the','to the','you re']

# keep only tweets that received at least one like to make processing easier
# (the threshold applied here is 1, not 10)
BK_tweets = BK_tweets[BK_tweets.nlikes>=1]

# countvectorizing BK tweets (unigrams, bigrams and trigrams)
vectorizer = CountVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(BK_tweets['tweet'])

user_ids = list(BK_tweets['user_id'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() densifies the sparse count matrix, which is memory hungry
# for a vocabulary of this size.
BK_countvec = pd.DataFrame(X.toarray(), columns=feature_names)

# build df with BK user_ids and countvectorized tweets
BK_countvec.insert(0, "user_id", user_ids, False)

# import list of general stopwords; bound to a new name so the imported
# `stopwords` corpus reader is not overwritten by a plain list.
# NOTE(review): words() is called without a language, which appears to pull the
# stopword lists of every language NLTK ships - confirm that is intended
# rather than words('english').
general_stopwords = stopwords.words()

# create list of stopwords found in the BK tweets, plus the custom terms
BK_drop_terms = list(set(BK_countvec.columns) & set(general_stopwords)) + BK_stopwords

# remove stopwords from countvectorized BK tweets; errors="ignore" keeps the
# cell from raising KeyError when a custom n-gram is absent from the vocabulary
BK_countvec = BK_countvec.drop(BK_drop_terms, axis=1, errors="ignore")

In [5]:
# Print the (rows, columns) dimensions of the Brooklyn count matrix,
# then display its first five rows.
print(str(BK_countvec.shape))
BK_countvec.head(n=5)

(6320, 246132)


Unnamed: 0,user_id,00,00 am,00 am no,00 is,00 is that,00 ll,00 ll be,00 pm,00 pm at,...,𝐍𝐎𝗪 here are,𝐒𝐏𝐄𝐀𝐊,𝐒𝐏𝐄𝐀𝐊 𝐔𝐏,𝐒𝐏𝐄𝐀𝐊 𝐔𝐏 and,𝐔𝐏,𝐔𝐏 and,𝐔𝐏 and 𝐇𝐄𝐋𝐏,𝐭𝐨𝐠𝐞𝐭𝐡𝐞𝐫,𝐭𝐨𝐠𝐞𝐭𝐡𝐞𝐫 blacklivesmatter,𝐭𝐨𝐠𝐞𝐭𝐡𝐞𝐫 blacklivesmatter blm
0,33146847,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,318991844,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38797346,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1332887239,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1104668994781487106,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Let pandas print long Series in full (up to 4000 rows).
pd.set_option('display.max_rows', 4000)

print('Number of BK tweets: ' + str(len(BK_countvec.index)))

# Total count of every term (all columns after user_id), 100 largest first.
(
    BK_countvec
    .iloc[:, 1:]
    .sum(axis=0)
    .sort_values(ascending=False)
    .iloc[:100]
)

Number of BK tweets: 6320


police                3690
protest               1672
black                 1233
the police            1142
status                1108
people                 930
blm                    732
floyd                  636
lives                  627
george                 598
brooklyn               597
george floyd           573
matter                 565
blacklivesmatter       563
black lives            491
lives matter           479
like                   475
black lives matter     435
peaceful               413
white                  390
brutality              386
police brutality       367
us                     347
nyc                    343
cops                   342
get                    330
today                  323
nypd                   320
protests               308
justice                299
right                  298
need                   289
new                    286
officers               272
go                     272
to be                  270
see                    266
i

### CountVectorizer analysis - Staten Island

In [17]:
from nltk.corpus import stopwords

# Hand-picked terms to drop from the Staten Island vocabulary: URL fragments
# (twitter/instagram/petition links) and filler n-grams.
SI_stopwords = ['twitter','twitter com','https','https twitter','https twitter com',
                'pic twitter','pic twitter com','in the','if you','you re','of the','petition http',
                'petition http chng','http chng','http chng it','www','https www','instagram',
                'https www instagram','www instagram','www instagram com','http','instagram com']

# countvectorizing SI tweets (unigrams, bigrams and trigrams)
vectorizer = CountVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(SI_tweets['tweet'])

user_ids = list(SI_tweets['user_id'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() densifies the sparse count matrix.
SI_countvec = pd.DataFrame(X.toarray(), columns=feature_names)

# build df with SI user_ids and countvectorized tweets
SI_countvec.insert(0, "user_id", user_ids, False)

# import list of general stopwords; bound to a new name so the imported
# `stopwords` corpus reader is not overwritten by a plain list.
# NOTE(review): words() is called without a language, which appears to pull the
# stopword lists of every language NLTK ships - confirm that is intended
# rather than words('english').
general_stopwords = stopwords.words()

# create list of stopwords found in the SI tweets, plus the custom terms
SI_drop_terms = list(set(SI_countvec.columns) & set(general_stopwords)) + SI_stopwords

# remove stopwords from countvectorized SI tweets; errors="ignore" keeps the
# cell from raising KeyError when a custom n-gram is absent from the vocabulary
SI_countvec = SI_countvec.drop(SI_drop_terms, axis=1, errors="ignore")

In [18]:
# Print the (rows, columns) dimensions of the Staten Island count matrix,
# then display its first five rows.
print(str(SI_countvec.shape))
SI_countvec.head(n=5)

(835, 40585)


Unnamed: 0,user_id,000,000 000,000 000 plastic,000 000 tear,000 blm,000 blm tees,000 deaths,000 deaths 306,000 face,...,zones to,zones to police,zq7f8n4d,zq7f8n4d via,zq7f8n4d via change,zrd5y8k4,zrd5y8k4 via,zrd5y8k4 via change,zrvq3aesi6,zypog0iptu
0,780957816257441792,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,314176322,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,99202763,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,285961054,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,548761315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Let pandas print long Series in full (up to 4000 rows).
pd.set_option('display.max_rows', 4000)

print('Number of SI tweets: ' + str(len(SI_countvec.index)))

# Total count of every term (all columns after user_id), 100 largest first.
(
    SI_countvec
    .iloc[:, 1:]
    .sum(axis=0)
    .sort_values(ascending=False)
    .iloc[:100]
)

Number of SI tweets: 835


police                434
protest               250
status                155
people                138
floyd                 133
black                 132
george                129
george floyd          115
the police             95
blm                    91
lives                  74
peaceful               69
matter                 67
change                 64
justice                62
lives matter           59
officers               59
get                    59
black lives            57
like                   55
see                    50
island                 50
black lives matter     49
staten                 47
blacklivesmatter       47
police officers        45
staten island          45
stop                   43
us                     43
think                  43
would                  43
go                     43
right                  43
via                    42
sign                   42
new                    42
today                  41
justice for            40
petition    

In [20]:
pd.options.display.max_colwidth = 300


def _count_tweets_mentioning(tweets, *needles):
    """Return how many rows of ``tweets`` contain at least one of ``needles``.

    Each needle is matched as a literal, case-sensitive substring of the
    ``tweet`` column. A row that matches several needles is counted once,
    which is what the previous |A| + |B| - |A and B| arithmetic computed.
    Missing tweet text is treated as "no match".
    """
    text = tweets['tweet']
    hit = np.zeros(len(tweets), dtype=bool)
    for needle in needles:
        # Positional boolean arrays sidestep index alignment entirely, so no
        # "Boolean Series key will be reindexed" warning is raised.
        hit |= text.str.contains(needle, regex=False, na=False).to_numpy(dtype=bool)
    return int(hit.sum())


# NOTE: the printed labels say "number of times", but the figures are numbers
# of tweets containing the term at least once.
# NOTE(review): 'cop' is matched only when surrounded by spaces (so it is missed
# at the start/end of a tweet or next to punctuation), whereas 'officer' is a
# plain substring match that also catches "officers" - confirm the asymmetry
# is intended before comparing the two figures.
print('BK tweets')
print('number of times \'cop\' appears: '+str(_count_tweets_mentioning(BK_tweets, " cop ", " cops ")))
print('number of times \'officer\' appears: '+str(_count_tweets_mentioning(BK_tweets, "officer")))

print('SI tweets')
print('number of times \'cop\' appears: '+str(_count_tweets_mentioning(SI_tweets, " cop ", " cops ")))
print('number of times \'officer\' appears: '+str(_count_tweets_mentioning(SI_tweets, "officer")))

BK tweets
number of times 'cop' appears: 458
number of times 'officer' appears: 589
SI tweets
number of times 'cop' appears: 34
number of times 'officer' appears: 72


  
  if sys.path[0] == '':


### TF-IDF vectorizer analysis - Brooklyn

In [85]:
from nltk.corpus import stopwords

# Hand-picked terms to drop from the Brooklyn vocabulary: URL fragments
# (twitter/instagram links) and filler n-grams.
BK_stopwords = ['www','https','https twitter','https twitter com','twitter',
                'pic twitter','pic twitter com','twitter com',
                'https www', 'https www instagram','instagram','instagram com',
                'www instagram','www instagram com','at the','in the','of the',
                'the protest','if you','this is','on the','to the']

# filtering out tweets that got fewer than 10 likes to make processing easier
BK_tweets = BK_tweets[BK_tweets.nlikes>=10]

# tfidf vectorizing BK tweets (unigrams, bigrams and trigrams)
vectorizer = TfidfVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(BK_tweets['tweet'])

user_ids = list(BK_tweets['user_id'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() densifies the sparse tf-idf matrix.
tfidf_BK = pd.DataFrame(X.toarray(), columns=feature_names)

# build df with BK user_ids and tf-idf vectorized tweets
tfidf_BK.insert(0, "user_id", user_ids, False)

# import list of general stopwords; bound to a new name so the imported
# `stopwords` corpus reader is not overwritten by a plain list.
# NOTE(review): words() is called without a language, which appears to pull the
# stopword lists of every language NLTK ships - confirm that is intended
# rather than words('english').
general_stopwords = stopwords.words()

# create list of stopwords found in the BK tweets, plus the custom terms
BK_drop_terms = list(set(tfidf_BK.columns) & set(general_stopwords)) + BK_stopwords

# remove stopwords from tf-idf vectorized BK tweets; errors="ignore" keeps the
# cell from raising KeyError when a custom n-gram is absent from the vocabulary
tfidf_BK = tfidf_BK.drop(BK_drop_terms, axis=1, errors="ignore")

In [86]:
# Display the first five rows of the Brooklyn tf-idf frame.
tfidf_BK.head(n=5)

Unnamed: 0,user_id,00,00 is,00 is that,00 pm,00 pm at,00 pm reward,000,000 dead,000 dead and,...,ztjnoeyzbv,ztk8egctts,zuck,zudujdsan6,zuotnvqwwr,zy3adslzgq,zy3adslzgq at,zy3adslzgq at prospect,zyhfawyzcb,zzqbhlczhs
0,1411902361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,87384843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,398136617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,516918524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,416506824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Let pandas print long Series in full (up to 4000 rows).
pd.set_option('display.max_rows', 4000)

print('Number of BK tweets: ' + str(len(tfidf_BK.index)))

# Mean tf-idf score of every term across all rows (zeros included),
# 100 largest first; the leading user_id column is skipped.
(
    tfidf_BK
    .iloc[:, 1:]
    .mean(axis=0)
    .sort_values(ascending=False)
    .iloc[:100]
)

Number of BK tweets: 1326


police                0.016858
protest               0.013188
black                 0.011142
the police            0.009513
status                0.008449
lives                 0.007848
matter                0.007478
people                0.007160
floyd                 0.007149
black lives           0.006989
george                0.006757
lives matter          0.006725
brooklyn              0.006629
george floyd          0.006583
black lives matter    0.006487
blm                   0.005687
blacklivesmatter      0.005602
peaceful              0.004866
today                 0.004694
nyc                   0.004305
barclays              0.004279
nypd                  0.004119
white                 0.004010
like                  0.003907
brutality             0.003880
police brutality      0.003797
fuck                  0.003768
center                0.003671
us                    0.003631
barclays center       0.003515
protests              0.003496
cops                  0.003389
right   

### TF-IDF vectorizer analysis - Staten Island

In [88]:
from nltk.corpus import stopwords

# Hand-picked terms to drop from the Staten Island vocabulary: twitter URL
# fragments and filler n-grams.
SI_stopwords = ['twitter','twitter com','https','https twitter','https twitter com',
                'pic twitter','pic twitter com','in the','if you','you re','of the']

# tfidf vectorizing SI tweets (unigrams, bigrams and trigrams)
vectorizer = TfidfVectorizer(ngram_range=(1,3))

X = vectorizer.fit_transform(SI_tweets['tweet'])

user_ids = list(SI_tweets['user_id'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() densifies the sparse tf-idf matrix.
tfidf_SI = pd.DataFrame(X.toarray(), columns=feature_names)

# build df with SI user_ids and tf-idf vectorized tweets
tfidf_SI.insert(0, "user_id", user_ids, False)

# import list of general stopwords; bound to a new name so the imported
# `stopwords` corpus reader is not overwritten by a plain list.
# NOTE(review): words() is called without a language, which appears to pull the
# stopword lists of every language NLTK ships - confirm that is intended
# rather than words('english').
general_stopwords = stopwords.words()

# create list of stopwords found in the SI tweets, plus the custom terms
SI_drop_terms = list(set(tfidf_SI.columns) & set(general_stopwords)) + SI_stopwords

# remove stopwords from tf-idf vectorized SI tweets; errors="ignore" keeps the
# cell from raising KeyError when a custom n-gram is absent from the vocabulary
tfidf_SI = tfidf_SI.drop(SI_drop_terms, axis=1, errors="ignore")

In [89]:
# Display the first five rows of the Staten Island tf-idf frame.
tfidf_SI.head(n=5)

Unnamed: 0,user_id,01,01 us,01 us george,02,02 george,02 george floyd,06,06 01,06 01 us,...,zh2sz2fx via,zh2sz2fx via change,ziguinchor,ziguinchor region,ziguinchor region senegal,zrd5y8k4,zrd5y8k4 via,zrd5y8k4 via change,zrvq3aesi6,zypog0iptu
0,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99202763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,524076913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,506497130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
# Let pandas print long Series in full (up to 4000 rows).
pd.set_option('display.max_rows', 4000)

print('Number of SI tweets: ' + str(len(tfidf_SI.index)))

# Mean tf-idf score of every term across all rows (zeros included),
# 100 largest first; the leading user_id column is skipped.
(
    tfidf_SI
    .iloc[:, 1:]
    .mean(axis=0)
    .sort_values(ascending=False)
    .iloc[:100]
)

Number of SI tweets: 592


police                0.018258
protest               0.014505
floyd                 0.011940
george                0.011887
status                0.011270
george floyd          0.011112
black                 0.008843
people                0.008689
justice               0.007314
lives                 0.007246
matter                0.007167
change                0.007019
the police            0.006666
lives matter          0.006596
via                   0.006418
sign                  0.006376
petition              0.006246
justice for           0.006156
http                  0.006061
black lives           0.006003
sign the petition     0.005900
chng                  0.005900
chng it               0.005900
petition http         0.005900
petition http chng    0.005900
via change            0.005900
http chng             0.005900
http chng it          0.005900
sign the              0.005900
the petition http     0.005900
the petition          0.005900
black lives matter    0.005668
peaceful