## Set up imports

In [1]:
# install PRAW and newspaper3k
!pip install praw
!pip3 install newspaper3k

Collecting praw
[?25l  Downloading https://files.pythonhosted.org/packages/2c/15/4bcc44271afce0316c73cd2ed35f951f1363a07d4d5d5440ae5eb2baad78/praw-7.1.0-py3-none-any.whl (152kB)
[K     |████████████████████████████████| 153kB 1.7MB/s 
[?25hCollecting websocket-client>=0.54.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/5f/f61b420143ed1c8dc69f9eaec5ff1ac36109d52c80de49d66e0c36c3dfdf/websocket_client-0.57.0-py2.py3-none-any.whl (200kB)
[K     |████████████████████████████████| 204kB 5.1MB/s 
[?25hCollecting update-checker>=0.17
  Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl
Collecting prawcore<2.0,>=1.3.0
  Downloading https://files.pythonhosted.org/packages/1d/40/b741437ce4c7b64f928513817b29c0a615efb66ab5e5e01f66fe92d2d95b/prawcore-1.5.0-py3-none-any.whl
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully install

In [2]:
import pandas as pd
import praw
import os
import requests
from bs4 import BeautifulSoup
import re
import pickle
from newspaper import Article
import spacy
from collections import Counter
from datetime import datetime

## Set up various objects

In [4]:
def lowerify(text):
    """Return ``text`` converted to lower case.

    Applied to the city and state name columns of the locations dataframe.
    """
    lowered = text.lower()
    return lowered

# set up cities/states locations dataframe
locs_path = 'https://raw.githubusercontent.com/Lambda-School-Labs/Labs25-Human_Rights_First-TeamB-DS/main/project/cities_states.csv'
locs_df = pd.read_csv(locs_path)
locs_df = locs_df.drop(columns=['Unnamed: 0', 'country'])
# lower-case the names so they can be compared against lower-cased tokens
locs_df['city_ascii'] = locs_df['city_ascii'].apply(lowerify)
locs_df['admin_name'] = locs_df['admin_name'].apply(lowerify)

# state to city lookup table: {state name: [city names in that state]}
# a single groupby pass replaces one full-frame scan per state; sort=False
# keeps the states in order of first appearance
states_map = {
    state: group['city_ascii'].to_list()
    for state, group in locs_df.groupby('admin_name', sort=False)
}

# police brutality identifying nlp
# make sure to import model.pkl
# NOTE: unpickling runs arbitrary code embedded in the file, so only load a
# model.pkl that comes from a trusted source
# the context manager closes the file even if unpickling fails
with open('model.pkl', 'rb') as model_file:
    pipeline = pickle.load(model_file)

# spacy nlp model
nlp = spacy.load('en_core_web_sm')

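# Set up PRAW
# PRAW credentials go here: create the `reddit` client that the next cell
# uses, e.g. reddit = praw.Reddit(client_id=..., client_secret=..., user_agent=...),
# loading the secrets from environment variables instead of hardcoding them.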

## Run the update and see what's returned

In [7]:
# Grab data from reddit: one [id, title, url] record per submission taken
# from the r/news "top of the week" listing (at most 500 submissions)
print('Pulling data from Reddit...')
top_posts = reddit.subreddit("news").top('week', limit=500)
data = [[post.id, post.title, post.url] for post in top_posts]

# construct a dataframe with the data
col_names = ['id', 'title', 'url']
df = pd.DataFrame(data, columns=col_names)
print(f'Number of entries initially pulled: {df.shape[0]}\n')

# pull the text from each article itself using newspaper3k
content_list = []
date_list = []
# go through each URL and use newspaper3k to extract data
print('Extracting data via newspaper3k...')
for id_url in df['url']:
    # use newspaper3k to extract text
    article = Article(id_url)
    try:
        # a failed download normally surfaces as an error in parse();
        # download() is kept inside the try so a failure there is handled
        # the same way instead of aborting the whole run
        article.download()
        article.parse()
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; add null values to show no connection
        content_list.append(None)
        date_list.append(None)
        continue
    content_list.append(article.text)
    # this will be null if newspaper3k can't find it
    date_list.append(article.publish_date)
df['text'] = content_list
df['date'] = date_list
print('Number of entries with missing data:')
print(df.isnull().sum(),'\n')

# drop any articles with missing data columns and renumber the rows
df = df.dropna().reset_index(drop=True)
print(f'Resulting entry count: {df.shape[0]}\n')

# convert date column to ISO 8601 strings (via pandas Timestamps)
def timestampify(date):
    """Return ``date`` as an ISO 8601 string.

    The value is wrapped in a ``pd.Timestamp`` first; ``unit='s'`` means a
    plain number is read as seconds since the Unix epoch.
    """
    stamp = pd.Timestamp(date, unit='s')
    return stamp.isoformat()
df['date'] = df['date'].apply(timestampify)

print('Filtering through police brutality filter...')
# use NLP model to filter posts: label every title, keep the rows labelled 1,
# then discard the temporary label column again
df = (
    df.assign(is_police_brutality=pipeline.predict(df['title']))
    .loc[lambda frame: frame['is_police_brutality'] == 1]
    .drop(columns='is_police_brutality')
)
print(f'Number of entries determined to be about police brutality: {df.shape[0]}')

# use spaCy to extract location tokens: for every article keep the
# lower-cased text of each entity that spaCy labels 'GPE'
print('Tokenizing through spaCy...')
tokens_list = [
    [ent.text.lower() for ent in nlp(text).ents if ent.label_ == 'GPE']
    for text in df['text']
]
df['tokens'] = tokens_list

# figure out which city and state the article takes place in
def _pick_city_state(tokens, states_map):
    """Choose the most-mentioned (city, state) pair from location tokens.

    Args:
        tokens: iterable of lower-cased location strings.
        states_map: dict mapping a state name to the list of its city names.

    Returns:
        A ``(city, state)`` tuple, or ``None`` when no state from
        ``states_map`` occurs in ``tokens`` or none of the cities of the
        most-mentioned state(s) do.
    """
    counts = Counter(tokens)

    # count which states come back the most, if any
    state_counts = {
        state: counts[state] for state in states_map if counts[state] > 0
    }
    if not state_counts:
        return None
    top_state_count = max(state_counts.values())

    # Search the cities of every state tied for the top count. The running
    # maximum is shared across all tied states, so a rarely mentioned city in
    # a later state can no longer replace a more frequent city found in an
    # earlier one. Ties keep the first city encountered.
    best_pair = None
    best_city_count = 0
    for state, state_count in state_counts.items():
        if state_count != top_state_count:
            continue
        for city in states_map[state]:
            if counts[city] > best_city_count:
                best_city_count = counts[city]
                best_pair = (city, state)
    return best_pair


city_list = []
state_list = []
geo_list = []
print('Compiling geolocation data...')
for tokens in df['tokens']:
    # set up geolocation dict for geo list; stays all-None if nothing is found
    geo_entry = {'lat': None, 'long': None}

    location = _pick_city_state(tokens, states_map)

    # if no state or no city is found
    if location is None:
        city_list.append(None)
        state_list.append(None)
        geo_list.append(geo_entry)
        continue

    # the city and state should be known now
    city, state = location
    city_list.append(city.title())
    state_list.append(state.title())

    # now get the geolocation data from the first matching locations row
    matches = locs_df[
        (locs_df['city_ascii'] == city) &
        (locs_df['admin_name'] == state)
    ]
    if not matches.empty:
        geo_entry['lat'] = matches['lat'].iloc[0]
        geo_entry['long'] = matches['lng'].iloc[0]
    geo_list.append(geo_entry)

# loop ends, add cities and states onto dataframe
df['city'] = city_list
df['state'] = state_list
df['geocoding'] = geo_list
print('Number of entries where geolocation data could not be found:')
print(df.isnull().sum(),'\n')

# drop any rows with null entries for location and renumber the rows
df = df.dropna().reset_index(drop=True)

# cleanup to match 846 api
def listify(text):
    """Wrap ``text`` in a new one-element list."""
    wrapped = [text]
    return wrapped
# each URL becomes a one-element list stored under 'links'
df['links'] = df['url'].apply(listify)
# the token lists are no longer needed; the article text is kept under the
# name 'description'
df = df.drop(columns=['tokens']).rename(columns={'text': 'description'})
# keep only the output columns, in their final order
final_columns = [
    'id', 'state', 'city',
    'date', 'title', 'description',
    'links', 'geocoding',
]
df = df[final_columns]

print(f'Final number of entries: {df.shape[0]}')
df.head()

Pulling data from Reddit...
Number of entries initially pulled: 500

Extracting data via newspaper3k...
Number of entries with missing data:
id         0
title      0
url        0
text      23
date     175
dtype: int64 

Resulting entry count: 325

Filtering through police brutality filter...
Number of entries determined to be about police brutality: 18
Tokenizing through spaCy...
Compiling geolocation data...
Number of entries where geolocation data could not be found:
id            0
title         0
url           0
text          0
date          0
tokens        0
city         13
state        13
geocoding     0
dtype: int64 

Final number of entries: 18


Unnamed: 0,id,state,city,date,title,description,links,geocoding,tokens
5,itf8aw,,,2020-09-15T00:00:00,'Kushner Village' tenants sue to block paying ...,Washington (CNN) A group of tenants filed a la...,[https://edition.cnn.com/2020/09/15/politics/k...,"{'lat': None, 'long': None}","[washington, new york city, kushner village, e..."
45,itwaan,,,2020-09-15T00:00:00,Documents Reveal How the Police Kept Daniel Pr...,"ROCHESTER, N.Y. — It was early June, days afte...",[https://www.nytimes.com/2020/09/15/nyregion/r...,"{'lat': None, 'long': None}","[rochester, n.y., rochester, rochester]"
48,iqixfl,California,San Francisco,2020-09-11T02:41:24+00:00,Tiny California Town Leveled By “Massive Wall ...,In a scene that brought back nightmares of the...,[https://deadline.com/2020/09/california-town-...,"{'lat': 37.7562, 'long': -122.443}","[oroville, concow, san francisco, berry creek,..."
76,isy5uf,,,2020-09-15T00:00:00,Police officer suspended over Melbourne head-s...,Victoria's anti-corruption watchdog will take ...,[https://www.abc.net.au/news/2020-09-15/police...,"{'lat': None, 'long': None}","[victoria, melbourne]"
95,itff55,,,2020-09-15T09:22:00-04:00,"U.S. drops tariffs on Canadian aluminum, avoid...",OTTAWA -- The federal government is celebratin...,[https://www.ctvnews.ca/politics/u-s-drops-tar...,"{'lat': None, 'long': None}","[the united states’, u.s., u.s., lighthizer, c..."
