In [1]:
# install PRAW and newspaper3k
!pip install praw
!pip3 install newspaper3k



In [2]:
# All imports
import pandas as pd
import praw
import os
import requests
from bs4 import BeautifulSoup
import re
import pickle
from newspaper import Article
import spacy
from collections import Counter
from datetime import datetime

In [3]:
# Normalise location names to lower case so they can be compared with tokens
def lowerify(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Set up cities/states locations dataframe:
# read the CSV, discard the columns that are not needed, and lower-case the
# city and state names.
locs_path = 'https://raw.githubusercontent.com/Lambda-School-Labs/Labs25-Human_Rights_First-TeamB-DS/main/project/cities_states.csv'
locs_df = (
    pd.read_csv(locs_path)
    .drop(columns=['Unnamed: 0', 'country'])
    .assign(
        city_ascii=lambda frame: frame['city_ascii'].apply(lowerify),
        admin_name=lambda frame: frame['admin_name'].apply(lowerify),
    )
)

# State to city lookup table:
# each state name (in order of first appearance in locs_df) maps to the list
# of city names recorded for that state.
states_map = {
    state: locs_df.loc[locs_df['admin_name'] == state, 'city_ascii'].to_list()
    for state in locs_df['admin_name'].unique()
}

# Police brutality identifying nlp
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# model.pkl must come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('model.pkl', 'rb') as model_file:
    pipeline = pickle.load(model_file)

# Spacy nlp model
nlp = spacy.load('en_core_web_sm')

# Set up PRAW
# Credentials are read from environment variables so that no secret is ever
# saved inside the notebook. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_PASSWORD, REDDIT_USER_AGENT and REDDIT_USERNAME before starting the
# kernel; a missing variable raises KeyError naming the variable.
CLIENT_ID = os.environ['REDDIT_CLIENT_ID']
CLIENT_SECRET = os.environ['REDDIT_CLIENT_SECRET']
PASSWORD = os.environ['REDDIT_PASSWORD']
USER_AGENT = os.environ['REDDIT_USER_AGENT']
USERNAME = os.environ['REDDIT_USERNAME']
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    password=PASSWORD,
    user_agent=USER_AGENT,
    username=USERNAME,
)

In [4]:
# Grab data from reddit: id, title and linked URL of the top posts of the week
print('Pulling data from Reddit...')
# time_filter is passed by keyword; newer PRAW releases deprecate passing it
# positionally.
data = [
    [submission.id, submission.title, submission.url]
    for submission in reddit.subreddit("news").top(time_filter='week', limit=500)
]

# Construct a dataframe with the data, print number of posts created
col_names = ['id', 'title', 'url']
df = pd.DataFrame(data, columns=col_names)
print(f'Number of entries initially pulled: {df.shape[0]}\n')

Pulling data from Reddit...
Number of entries initially pulled: 428



In [5]:
# Pull the text and publish date of each linked article using newspaper3k.
# content_list and date_list are filled in the same order as df['url'] so they
# can be attached as columns afterwards.
content_list = []
date_list = []

# Use newspaper3k to extract data
print('Extracting data via newspaper3k...')
for id_url in df['url']:
    article = Article(id_url)
    try:
        article.download()
        # If the article doesn't download, the error is thrown in parse()
        article.parse()
    except Exception:
        # Add null values to show no connection. Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit able to stop the loop.
        content_list.append(None)
        date_list.append(None)
        continue
    content_list.append(article.text)
    date_list.append(article.publish_date)
df['text'] = content_list
df['date'] = date_list
print('Number of entries with missing data:')
print(df.isnull().sum(),'\n')

Extracting data via newspaper3k...
Number of entries with missing data:
id         0
title      0
url        0
text      17
date     151
dtype: int64 



In [6]:
# Keep only the articles that have a value in every column, renumbering the
# rows from zero afterwards
df = df.dropna().reset_index(drop=True)
print(f'Resulting entry count: {len(df)}\n')

# Convert date values to ISO 8601 strings
def timestampify(date):
    """Build a ``pd.Timestamp`` from ``date`` (with ``unit='s'``) and return its ISO 8601 string."""
    stamp = pd.Timestamp(date, unit='s')
    return stamp.isoformat()
# Replace every publish date with its ISO 8601 string form
df = df.assign(date=lambda frame: frame['date'].apply(timestampify))

print('Filtering through police brutality filter...')

Resulting entry count: 277

Filtering through police brutality filter...


In [7]:
# Use NLP model to filter posts: predict on the titles, keep the rows
# predicted as 1, then discard the helper column again
df = (
    df.assign(is_police_brutality=pipeline.predict(df['title']))
    .loc[lambda frame: frame['is_police_brutality'] == 1]
    .drop(columns='is_police_brutality')
)
print(f'Number of entries determined to be about police brutality: {len(df)}')

Number of entries determined to be about police brutality: 17


In [8]:
# Use spaCy to extract location tokens: for every article, the lower-cased
# text of each entity labelled 'GPE'
print('Tokenizing through spaCy...')
tokens_list = [
    [ent.text.lower() for ent in nlp(text).ents if ent.label_ == 'GPE']
    for text in df['text']
]
df['tokens'] = tokens_list

Tokenizing through spaCy...


In [9]:
# Figure out which city and state the article takes place in.
# For every article this appends exactly one entry to each of city_list,
# state_list and geo_list, so the three lists stay aligned with df's rows.
city_list = []
state_list = []
geo_list = []
print('Compiling geolocation data...')
for tokens in df['tokens']:
    # How often each location token occurs in this article
    c = Counter(tokens)

    # Default geocoding entry; only filled in once a city/state pair is resolved
    geo_entry = {'lat': None, 'long': None}

    # Known states mentioned at least once, kept in states_map order
    state_counts = {
        state: c[state] for state in states_map if c[state] > 0
    }

    # If no state is found
    if not state_counts:
        city_list.append(None)
        state_list.append(None)
        geo_list.append(geo_entry)
        continue

    # Every state tied for the highest mention count stays a candidate
    top_count = max(state_counts.values())
    top_states = [
        state for state, count in state_counts.items() if count == top_count
    ]

    # Pick the most-mentioned city across all candidate states. The running
    # maximum is shared between the states so that a weaker city from a later
    # tied state cannot replace a stronger match from an earlier one; on equal
    # counts the first city encountered wins.
    max_city = None
    max_count = 0
    for state in top_states:
        for city in states_map[state]:
            if c[city] > max_count:
                max_count = c[city]
                max_city = (city, state)

    # If no city is found
    if max_city is None:
        city_list.append(None)
        state_list.append(None)
        geo_list.append(geo_entry)
        continue

    # Append city and state to geolocation data
    city_list.append(max_city[0].title())
    state_list.append(max_city[1].title())
    # Now get the geolocation data from the first matching row of locs_df
    match = locs_df[
        (locs_df['city_ascii'] == max_city[0]) &
        (locs_df['admin_name'] == max_city[1])
    ]
    if not match.empty:
        geo_entry['lat'] = match['lat'].iloc[0]
        geo_entry['long'] = match['lng'].iloc[0]
    geo_list.append(geo_entry)



Compiling geolocation data...


In [10]:
# Loop ends, attach the resolved cities, states and geocoding to the dataframe
df = df.assign(city=city_list, state=state_list, geocoding=geo_list)
print('Number of entries where geolocation data could not be found:')
print(df.isnull().sum(),'\n')

# Drop any rows with null entries for location and renumber from zero
df = df.dropna().reset_index(drop=True)

# Cleanup to match 846 api
def listify(text):
    """Return a new one-element list containing ``text``."""
    wrapped = [text]
    return wrapped
# Wrap each URL in a list as 'links', expose the article text as
# 'description', drop the working columns and fix the final column order
df = (
    df.assign(links=df['url'].apply(listify), description=df['text'])
    .drop(columns=['tokens', 'text'])
    .loc[:, [
        'id', 'state', 'city',
        'date', 'title', 'description',
        'links', 'geocoding'
    ]]
)

print(f'Final number of entries: {len(df)}')
df.head()

Number of entries where geolocation data could not be found:
id            0
title         0
url           0
text          0
date          0
tokens        0
city         13
state        13
geocoding     0
dtype: int64 

Final number of entries: 4


Unnamed: 0,id,state,city,date,title,description,links,geocoding
0,j3d4ia,Oregon,Portland,2020-10-01T00:00:00,US Justice Department cracks down on Portland ...,"At the end of August, in a green leafy park in...",[https://www.opb.org/article/2020/10/01/us-jus...,"{'lat': 45.5371, 'long': -122.65}"
1,j2c6rw,California,San Jose,2020-09-30T00:42:00+00:00,San Jose officer facing assault charge in viol...,"A San Jose, California, police officer is faci...",[https://www.nbcnews.com/news/us-news/san-jose...,"{'lat': 37.3021, 'long': -121.8489}"
2,j2qr2c,California,San Jose,2020-09-30T13:19:54+00:00,San Jose Officer Facing Assault Charge in Viol...,"A San Jose, California, police officer is faci...",[https://www.nbcboston.com/news/national-inter...,"{'lat': 37.3021, 'long': -121.8489}"
3,j4e5ys,California,San Francisco,2020-10-03T05:19:51+00:00,Better weather won't keep California from grim...,SAN FRANCISCO (AP) — Red flag warnings of extr...,[https://apnews.com/article/wildfires-fires-ca...,"{'lat': 37.7562, 'long': -122.443}"
