In [1]:
# Install PRAW

! pip install praw



In [2]:
import praw
import pandas as pd

# Reddit credentials, password stored in .env
# PRAW setup and login goes here, omitted for privacy

reddit = praw.Reddit(client_id=client_id_here, client_secret=cient_secret_here, password=password_here, user_agent=user_agent_here, username=username_here)

In [3]:
# Retrieve tags list from 2020pb GitHub repo

all_locs = pd.read_json('https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations-v2.json')
all_locs = pd.json_normalize(all_locs['data'])
all_locs = all_locs.drop(columns=['edit_at','id'])
def cleanlinks(json):
    links_out = []
    for link in json:
        links_out.append(link['url'])
    return links_out
all_locs['links'] = all_locs['links'].apply(cleanlinks)
all_locs['date'] = pd.to_datetime(all_locs['date'],format='%Y-%m-%d')
all_tags = all_locs['tags'].copy()
tags = set()
for taglist in all_tags:
  for tag in taglist:
    if tag not in tags:
      tags.add(tag)

print(tags)

{'', 'rubber-bullet', 'drive', 'shove', 'headlock', 'threaten', 'celebrity', 'zip-tie', 'non-protest', 'grab', 'legal-observer', 'death', 'homeless', 'pepper-ball', 'knee-on-neck', 'gun', 'hide-badge', 'protester', 'shield', 'spray', 'property-destruction', 'tear-gas', 'lgbtq+', 'abuse-of-power', 'pepper-spray', 'sexual-assault', 'knee', 'horse', 'tear-gas-canister', 'inhumane-treatment', 'tackle', 'bike', 'incitement', 'stun-grenade', 'journalist', 'dog', 'conceal', 'shoot', 'explosive', 'marking-round', 'bystander', 'lrad', 'arrest', 'less-lethal', 'pregnant', 'projectile', 'punch', 'kick', 'gas', 'elderly', 'person-with-disability', 'vehicle', 'beat', 'choke', 'push', 'mace', 'medic', 'bean-bag', 'taser', 'foam-bullet', 'strike', 'throw', 'politician', 'baton', 'body-cam', 'tase', 'wooden-bullet', 'child', 'racial-profiling', 'live-round'}


In [4]:
# Grabbing 1000 hottest posts on Reddit at the moment under the subreddit "2020policebrutality"
# Other subreddits to explore: policebrutality, publicfreakout, allcopnodonut

data = []

for submission in reddit.subreddit("2020policebrutality").hot(limit=1000):
  data.append([submission.id, submission.title, submission.score, submission.subreddit, submission.url, 
               submission.num_comments, submission.selftext, submission.created])

col_names = ['id', 'title', 'score', 'subreddit', 'url', 
             'num_comments', 'text', 'created']
df = pd.DataFrame(data, columns=col_names)

df.head()

Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created
0,jawckp,"Reddit, I have our best 2020PB update so far!",20,2020PoliceBrutality,https://www.reddit.com/r/2020PoliceBrutality/c...,3,Just wanted to give you guys the good news... ...,1602690000.0
1,je5555,I-Team: Evans Police have history of brutality...,485,2020PoliceBrutality,https://www.youtube.com/watch?v=CLk7F-50n28&fe...,14,,1603154000.0
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0
3,je5fzv,Here’s what police body cameras don’t show you,43,2020PoliceBrutality,https://www.youtube.com/watch?v=CVGYqjnXFKE&fe...,1,,1603155000.0
4,jdmd8m,Police in multiple cities have been documented...,152,2020PoliceBrutality,https://twitter.com/zdroberts/status/131710134...,3,,1603078000.0


In [5]:
# Install newspaper3k for article parser

! pip3 install newspaper3k



In [6]:
# Imports

import pandas as pd
import praw
import os
import requests
from bs4 import BeautifulSoup
import re
import pickle
from newspaper import Article
import spacy
from collections import Counter
from datetime import datetime

In [7]:
# NLP: Retrieve lemmas from the reddit titles

nlp = spacy.load('en_core_web_sm')

def get_lemmas(text):
    lemmas = []
    doc = nlp(text)
    for token in doc: 
        tokens = re.sub('[^a-zA-Z 0-9] -', '', text)
        tokens = tokens.lower()
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_.lower())
    return lemmas

df['lemmas'] = df['title'].apply(get_lemmas)

df.head()

Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created,lemmas
0,jawckp,"Reddit, I have our best 2020PB update so far!",20,2020PoliceBrutality,https://www.reddit.com/r/2020PoliceBrutality/c...,3,Just wanted to give you guys the good news... ...,1602690000.0,"[reddit, good, 2020pb, update, far]"
1,je5555,I-Team: Evans Police have history of brutality...,485,2020PoliceBrutality,https://www.youtube.com/watch?v=CLk7F-50n28&fe...,14,,1603154000.0,"[team, evans, police, history, brutality, alle..."
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0,"[rapid, city, police, invade, lakota, ceremony..."
3,je5fzv,Here’s what police body cameras don’t show you,43,2020PoliceBrutality,https://www.youtube.com/watch?v=CVGYqjnXFKE&fe...,1,,1603155000.0,"[police, body, camera]"
4,jdmd8m,Police in multiple cities have been documented...,152,2020PoliceBrutality,https://twitter.com/zdroberts/status/131710134...,3,,1603078000.0,"[police, multiple, city, document, have, close..."


In [9]:
# Check lemmas list

for i in range(len(df)):
  print(df["lemmas"][i])

['reddit', 'good', '2020pb', 'update', 'far']
['team', 'evans', 'police', 'history', 'brutality', 'allegation', 'settlement']
['rapid', 'city', 'police', 'invade', 'lakota', 'ceremony', 'tear', 'encampment', 'houseless', 'arrest', 'people']
['police', 'body', 'camera']
['police', 'multiple', 'city', 'document', 'have', 'close', 'relationship', 'proud', 'boys', 'far', 'right', 'group', 'america', 'deeply', 'worried', '|', 'zach', 'd', 'roberts', 'twitter']
['instance', 'aggression', 'press', '|', 'u.s.', 'press', 'freedom', 'tracker']
['summer', 'black', 'lives', 'matter', 'protester', 'overwhelmingly', 'peaceful', 'research', 'find', 'short', 'datum', 'suggest', '96.3', 'percent', 'event', 'involve', 'property', 'damage', 'police', 'injury', '97.7', 'percent', 'event', 'injury', 'report', 'participant', 'bystander', 'police']
['domestic', 'violence', 'shelter', 'org', 'say', 'take', 'black', 'life', 'matter', 'sign', 'police', 'wisconsin', 'cut', 'tie']
['teargas', '|', 'maranie', 'sta

In [10]:
# Create a tags-only lemmas list from the lemmas we created by comparing each 
#   lemma to the tags list we retrieved from the 2020pb GitHub repository

lemma_list = []
for i in range(len(df)):
  lemma_instances = []
  if len(df['lemmas'][i]) != 0:
    for word in df["lemmas"][i]:
      if word in tags:
        lemma_instances.append(word)
      else: 
        continue
    lemma_list.append(lemma_instances)
  else:
    lemma_list.append([])

lemma_list

[[],
 [],
 ['arrest'],
 [],
 [],
 [],
 ['protester', 'bystander'],
 [],
 [],
 ['push'],
 ['throw', 'protester'],
 ['gas'],
 [],
 [],
 [],
 ['beat'],
 ['arrest'],
 ['child', 'shoot'],
 [],
 [],
 ['throw'],
 [],
 ['choke', 'beat', 'gun'],
 [],
 [],
 ['dog'],
 [],
 [],
 [],
 ['protester', 'threaten'],
 [],
 ['dog', 'death'],
 ['arrest'],
 [],
 ['push'],
 ['arrest'],
 [],
 ['push'],
 [],
 [],
 [],
 [],
 ['gas', 'protester'],
 [],
 [],
 ['gun'],
 ['journalist', 'arrest'],
 [],
 [],
 ['arrest', 'shoot'],
 [],
 ['medic', 'projectile'],
 [],
 [],
 [],
 ['arrest'],
 ['arrest'],
 [],
 ['shoot', 'knee'],
 ['protester', 'pregnant'],
 ['shoot'],
 [],
 [],
 [],
 [],
 ['arrest', 'gas'],
 ['beat', 'death', 'baton'],
 [],
 [],
 ['shoot'],
 [],
 [],
 [],
 ['baton'],
 ['death', 'dog'],
 [],
 ['drive', 'protester'],
 ['tase'],
 ['protester'],
 ['shoot'],
 [],
 [],
 [],
 ['death'],
 ['shoot'],
 ['death'],
 ['choke'],
 [],
 [],
 [],
 ['conceal', 'death'],
 [],
 ['throw'],
 ['drive'],
 ['arrest'],
 ['shield'

In [13]:
# Append the list of tags to the dataframe 

df["tags"] = lemma_list
df.head()

Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created,lemmas,tags
0,jawckp,"Reddit, I have our best 2020PB update so far!",20,2020PoliceBrutality,https://www.reddit.com/r/2020PoliceBrutality/c...,3,Just wanted to give you guys the good news... ...,1602690000.0,"[reddit, good, 2020pb, update, far]",[]
1,je5555,I-Team: Evans Police have history of brutality...,485,2020PoliceBrutality,https://www.youtube.com/watch?v=CLk7F-50n28&fe...,14,,1603154000.0,"[team, evans, police, history, brutality, alle...",[]
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0,"[rapid, city, police, invade, lakota, ceremony...",[arrest]
3,je5fzv,Here’s what police body cameras don’t show you,43,2020PoliceBrutality,https://www.youtube.com/watch?v=CVGYqjnXFKE&fe...,1,,1603155000.0,"[police, body, camera]",[]
4,jdmd8m,Police in multiple cities have been documented...,152,2020PoliceBrutality,https://twitter.com/zdroberts/status/131710134...,3,,1603078000.0,"[police, multiple, city, document, have, close...",[]


In [14]:
# Classification: Classify articles based on whether tags were found and whether 
# the lemmas contained mentions of police, officer, or cops

df["police_brutality"] = 0

for i in range(len(df)):
  if (len(df["tags"][i]) > 0) and (("police" in df["lemmas"][i]) or ("officer" in df["lemmas"][i]) or ("cop" in df["lemmas"][i])):
    df["police_brutality"][i] = 1

df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created,lemmas,tags,police_brutality
0,jawckp,"Reddit, I have our best 2020PB update so far!",20,2020PoliceBrutality,https://www.reddit.com/r/2020PoliceBrutality/c...,3,Just wanted to give you guys the good news... ...,1602690000.0,"[reddit, good, 2020pb, update, far]",[],0
1,je5555,I-Team: Evans Police have history of brutality...,485,2020PoliceBrutality,https://www.youtube.com/watch?v=CLk7F-50n28&fe...,14,,1603154000.0,"[team, evans, police, history, brutality, alle...",[],0
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0,"[rapid, city, police, invade, lakota, ceremony...",[arrest],1
3,je5fzv,Here’s what police body cameras don’t show you,43,2020PoliceBrutality,https://www.youtube.com/watch?v=CVGYqjnXFKE&fe...,1,,1603155000.0,"[police, body, camera]",[],0
4,jdmd8m,Police in multiple cities have been documented...,152,2020PoliceBrutality,https://twitter.com/zdroberts/status/131710134...,3,,1603078000.0,"[police, multiple, city, document, have, close...",[],0
5,jealt6,Instances of aggression against the press | U....,1,2020PoliceBrutality,https://twitter.com/uspresstracker/status/1318...,1,,1603170000.0,"[instance, aggression, press, |, u.s., press, ...",[],0
6,jd47jd,This summer’s Black Lives Matter protesters we...,2997,2020PoliceBrutality,https://www.washingtonpost.com/politics/2020/1...,95,,1603001000.0,"[summer, black, lives, matter, protester, over...","[protester, bystander]",1
7,jdlfgp,Domestic Violence Shelter Org. Says All It Too...,40,2020PoliceBrutality,https://www.theroot.com/domestic-violence-shel...,6,,1603074000.0,"[domestic, violence, shelter, org, say, take, ...",[],0
8,jdmksu,"""so. much. teargas."" | Maranie Staab on Twitter",11,2020PoliceBrutality,https://twitter.com/MaranieRae/status/13177171...,1,,1603078000.0,"[teargas, |, maranie, staab, twitter]",[],0
9,jdknfg,"Federal officers executed several bullrushes, ...",15,2020PoliceBrutality,https://twitter.com/PDocumentarians/status/131...,1,,1603072000.0,"[federal, officer, execute, bullrush, push, cr...",[push],1


In [15]:
# Convert "created" column to actual dates

import datetime
def get_date(time):
	return datetime.date.fromtimestamp(time)
df['date'] = df['created'].apply(get_date)

df.head(20)

Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created,lemmas,tags,police_brutality,date
0,jawckp,"Reddit, I have our best 2020PB update so far!",20,2020PoliceBrutality,https://www.reddit.com/r/2020PoliceBrutality/c...,3,Just wanted to give you guys the good news... ...,1602690000.0,"[reddit, good, 2020pb, update, far]",[],0,2020-10-14
1,je5555,I-Team: Evans Police have history of brutality...,485,2020PoliceBrutality,https://www.youtube.com/watch?v=CLk7F-50n28&fe...,14,,1603154000.0,"[team, evans, police, history, brutality, alle...",[],0,2020-10-20
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0,"[rapid, city, police, invade, lakota, ceremony...",[arrest],1,2020-10-20
3,je5fzv,Here’s what police body cameras don’t show you,43,2020PoliceBrutality,https://www.youtube.com/watch?v=CVGYqjnXFKE&fe...,1,,1603155000.0,"[police, body, camera]",[],0,2020-10-20
4,jdmd8m,Police in multiple cities have been documented...,152,2020PoliceBrutality,https://twitter.com/zdroberts/status/131710134...,3,,1603078000.0,"[police, multiple, city, document, have, close...",[],0,2020-10-19
5,jealt6,Instances of aggression against the press | U....,1,2020PoliceBrutality,https://twitter.com/uspresstracker/status/1318...,1,,1603170000.0,"[instance, aggression, press, |, u.s., press, ...",[],0,2020-10-20
6,jd47jd,This summer’s Black Lives Matter protesters we...,2997,2020PoliceBrutality,https://www.washingtonpost.com/politics/2020/1...,95,,1603001000.0,"[summer, black, lives, matter, protester, over...","[protester, bystander]",1,2020-10-18
7,jdlfgp,Domestic Violence Shelter Org. Says All It Too...,40,2020PoliceBrutality,https://www.theroot.com/domestic-violence-shel...,6,,1603074000.0,"[domestic, violence, shelter, org, say, take, ...",[],0,2020-10-19
8,jdmksu,"""so. much. teargas."" | Maranie Staab on Twitter",11,2020PoliceBrutality,https://twitter.com/MaranieRae/status/13177171...,1,,1603078000.0,"[teargas, |, maranie, staab, twitter]",[],0,2020-10-19
9,jdknfg,"Federal officers executed several bullrushes, ...",15,2020PoliceBrutality,https://twitter.com/PDocumentarians/status/131...,1,,1603072000.0,"[federal, officer, execute, bullrush, push, cr...",[push],1,2020-10-19


In [16]:
# Create dataframe with only incidents that were marked as 1: where police brutality 
# is likely to have occurred 

df_pb = df.loc[df['police_brutality'] == 1].copy() 

print(df_pb.shape)

df_pb.head(10)

(275, 12)


Unnamed: 0,id,title,score,subreddit,url,num_comments,text,created,lemmas,tags,police_brutality,date
2,je8s6z,"Rapid City police invade Lakota ceremony, tear...",26,2020PoliceBrutality,https://bsnorrell.blogspot.com/2020/10/rapid-c...,4,,1603165000.0,"[rapid, city, police, invade, lakota, ceremony...",[arrest],1,2020-10-20
6,jd47jd,This summer’s Black Lives Matter protesters we...,2997,2020PoliceBrutality,https://www.washingtonpost.com/politics/2020/1...,95,,1603001000.0,"[summer, black, lives, matter, protester, over...","[protester, bystander]",1,2020-10-18
9,jdknfg,"Federal officers executed several bullrushes, ...",15,2020PoliceBrutality,https://twitter.com/PDocumentarians/status/131...,1,,1603072000.0,"[federal, officer, execute, bullrush, push, cr...",[push],1,2020-10-19
15,jcdap5,These Invisible Wounds: If an officer beats yo...,1529,2020PoliceBrutality,https://laurajedeed.medium.com/these-invisible...,13,,1602895000.0,"[invisible, wound, officer, beat, middle, port...",[beat],1,2020-10-17
17,jcp4er,13 year old autistic child SHOT BY POLICE whil...,57,2020PoliceBrutality,https://www.youtube.com/watch?v=xJWaqumSu_0&fe...,0,,1602939000.0,"[13, year, old, autistic, child, shoot, police...","[child, shoot]",1,2020-10-17
22,jc0xl7,Family members viewed long-secret body-camera ...,250,2020PoliceBrutality,https://apnews.com/article/john-bel-edwards-ah...,4,,1602842000.0,"[family, member, view, long, secret, body, cam...","[choke, beat, gun]",1,2020-10-16
25,jb5scw,The city where someone was bitten by a police ...,1080,2020PoliceBrutality,https://apple.news/AipBdDhyqQhqavBeSGtvJSg,17,,1602727000.0,"[city, bite, police, dog, 5, day, marshall, pr...",[dog],1,2020-10-15
29,jbgkxq,Pro-Fascism Protesters Threaten and Assult Cou...,30,2020PoliceBrutality,https://youtu.be/foko8KQLiZc,12,,1602764000.0,"[pro, fascism, protester, threaten, assult, co...","[protester, threaten]",1,2020-10-15
31,jagsc4,Alabama's Ugly Secret: Police Dog Attacks. Law...,1820,2020PoliceBrutality,https://www.themarshallproject.org/2020/10/13/...,57,,1602635000.0,"[alabama, ugly, secret, police, dog, attack, l...","[dog, death]",1,2020-10-14
34,jamwxm,"October 12, A LAPD officer pushes a young man ...",36,2020PoliceBrutality,https://twitter.com/ShotOn35mm/status/13155499...,2,,1602653000.0,"[october, 12, lapd, officer, push, young, man,...",[push],1,2020-10-14


In [19]:
# Explore visualization methods using Plotly

# import plotly.express as px

# fig = px.histogram(df_pb, x ="date", color = 'tags', hover_data=df_pb.columns)

# fig.show()

In [18]:
# Compare titles with those in the 2020pb database to ensure no duplicates were found

all_locs.sort_values(by='date', ascending=False).head(30)

Unnamed: 0,links,state,city,description,tags,geolocation,name,date,date_text
1201,[https://twitter.com/DrewCMarine/status/131768...,Oregon,Portland,While dispersing protesters using tear gas and...,"[journalist, less-lethal, pepper-ball, project...","45.4927916, -122.6726079",Federal agents shoot journalist with impact mu...,2020-10-17,October 17th
720,[https://twitter.com/R3volutionDaddy/status/13...,Washington,Seattle,Seattle police officers on bikes charge protes...,"[arrest, bike, protester, push, shove]","47.6141198, -122.3189719",Police tackle protesters without apparent just...,2020-10-17,October 17th
1199,[https://twitter.com/PDocumentarians/status/13...,Oregon,Portland,Protesters marched to an ICE facility in Portl...,"[less-lethal, pepper-ball, pepper-spray, proje...","45.4927916, -122.6726079",Federal agents pepper spray protesters carryin...,2020-10-17,October 17th
1200,[https://twitter.com/DannyJPeterson/status/131...,Oregon,Portland,After pushing protesters away from the ICE fac...,"[less-lethal, pepper-ball, projectile, protest...","45.4927916, -122.6726079",Federal agents deploy tear gas and impact muni...,2020-10-17,October 17th
1203,[https://twitter.com/BaghdadBrian/status/13177...,Oregon,Portland,A journalist filming near the DHS riot line be...,"[journalist, property-destruction, push, shove]","45.4927916, -122.6726079",Journalist shoved over by Federal agents,2020-10-17,October 17th
1202,[https://twitter.com/gravemorgan/status/131771...,Oregon,Portland,"After being dispersing around 10 PM PDT, prote...","[less-lethal, pepper-ball, projectile, protest...","45.4927916, -122.6726079",Federal agents deploy second round of tear gas...,2020-10-17,October 17th
338,[https://twitter.com/thugpatriot/status/131621...,Wisconsin,Wauwatosa,A small group of protesters were marching thro...,"[arrest, protester, tackle]","43.0625506, -87.9957363",Police tackle a protester on a bike,2020-10-13,October 13th
774,[https://twitter.com/NoBayouBridge/status/1315...,Arizona,Ajo,Members of the O'odham Anti Border Collective ...,"[less-lethal, projectile, protester, rubber-bu...","32.2739671, -112.7419162",Indigenous protesters shot and tear gassed,2020-10-12,October 12th
1198,[https://twitter.com/Claudio_Report/status/131...,Oregon,Portland,15 minutes into a protest near the Blazers Boy...,"[arrest, journalist, protester, push, shove]","45.5618069, -122.661302",Police make violent mass arrests,2020-10-10,October 10th
337,[https://twitter.com/TMJ4Stephanie/status/1314...,Wisconsin,Wauwatosa,Police deployed tear gas against protesters ga...,"[less-lethal, projectile, protester, shoot, te...","43.0608482, -88.007371",Police tear gas protesters,2020-10-09,October 9th
