In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler

# Contents

- [Function Definitions](#fdef)
- [NLP Set-up](#NLP)
- [Data Processing](#data)

# Function Definitions
<a id = "fdef"></a>

---------

In [21]:
def get_docs(df, nlp):
    """Run an nlp callable over the 'Text' column of a dataframe.

    Each text is lower-cased and passed to ``nlp``; the results are stored
    in a new 'docs' column on a copy of ``df`` (``df`` itself is untouched).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column.
    nlp : callable
        Called as ``nlp(text.lower())`` for every entry of 'Text'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra 'docs' column. Entries that could not
        be processed (e.g. a missing text that has no ``.lower()``) hold an
        empty list ``[]`` as a placeholder.
    """
    new_df = df.copy()
    len_df = len(df)
    docs = []

    for count, text in enumerate(df['Text'], start=1):
        try:
            doc = nlp(text.lower())
        except Exception:
            # Best-effort: an unprocessable entry becomes the [] placeholder.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit through, so a long run can still be stopped.
            doc = []

        docs.append(doc)

        # Report progress every 1000 texts.
        if count % 1000 == 0:
            print(f'nlp progress: {round(100.0 * count / len_df, 1)}%')

    new_df['docs'] = docs

    return new_df

In [27]:
def find_matches(df):
    """Keep only the rows whose doc contains a street, highway or exit entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'docs' column. Each entry is either the placeholder
        ``[]`` (skipped and dropped) or an object exposing ``.ents``, whose
        items expose ``.label_``.

    Returns
    -------
    pandas.DataFrame
        The rows of a copy of ``df`` that have at least one entity labelled
        'street', 'highway' or 'exit'. All columns are preserved, including
        when ``df`` is empty.

    Prints a one-line summary of how many docs were inspected and how many
    rows matched.
    """
    # NOTE(review): 'marker' entities do not make a row match here, so a row
    # whose only road entity is a mile marker is dropped -- confirm intended.
    wanted_labels = {'street', 'highway', 'exit'}

    new_df = df.copy()

    to_keep = []

    # Docs actually inspected (placeholder [] entries are not counted).
    count = 0

    # Rows with at least one wanted entity.
    row_counter = 0

    for doc in new_df['docs']:

        if doc == []:
            to_keep.append(False)
            continue

        # any() stops at the first hit, so each row is counted at most once.
        keep = any(ent.label_ in wanted_labels for ent in doc.ents)
        if keep:
            row_counter += 1

        to_keep.append(keep)

        count += 1

    print(f"Tried to find matches in {count} documents, found {row_counter} rows with matches.")

    # An explicit boolean array is always treated as a row mask; an empty
    # Python list would be read as "select no columns" instead.
    mask = np.array(to_keep, dtype=bool)

    return new_df[mask]

In [23]:
def make_street_cols(df):
    """Add one list-valued column per custom entity type found in 'docs'.

    For every entry of ``df['docs']`` the texts of its entities are grouped
    by label into four new columns on a copy of ``df``:
    'streets', 'highways', 'exits' and 'markers'. Entities with any other
    label are ignored, and a placeholder ``[]`` doc yields four empty lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'docs' column whose entries are either ``[]`` or
        objects exposing ``.ents`` (items with ``.label_`` and ``.text``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four extra columns appended in the order
        listed above.
    """
    # Entity label -> name of the column that collects its texts.
    label_to_column = {
        'street': 'streets',
        'highway': 'highways',
        'exit': 'exits',
        'marker': 'markers',
    }

    collected = {column: [] for column in label_to_column.values()}

    for doc in df['docs']:
        per_doc = {column: [] for column in collected}

        entities = doc.ents if doc != [] else ()
        for entity in entities:
            column = label_to_column.get(entity.label_)
            if column is not None:
                per_doc[column].append(entity.text)

        for column, texts in per_doc.items():
            collected[column].append(texts)

    result = df.copy()
    for column, values in collected.items():
        result[column] = values

    return result

In [24]:
def extract_road_info(df, nlp):
    """Run the full road-extraction pipeline on a dataframe of texts.

    Steps, in order: drop duplicate rows, add the 'docs' column with
    ``get_docs``, filter rows with ``find_matches``, then add the per-entity
    list columns with ``make_street_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nlp : callable
        Forwarded to ``get_docs``.

    Returns
    -------
    pandas.DataFrame
        The result of ``make_street_cols`` on the filtered frame.
    """
    return (
        df.drop_duplicates()
        .copy()
        .pipe(get_docs, nlp)
        .pipe(find_matches)
        .pipe(make_street_cols)
    )

# NLP Set-up
<a id="NLP"></a>

------

In [8]:
# Load the 'en_core_web_sm' spaCy model; this `nlp` object is the pipeline
# the entity ruler is added to further down.
nlp = spacy.load('en_core_web_sm')

In [9]:
# Patterns for the entity ruler.
# Each pattern is a dict with a 'label' (the entity type to assign) and a
# 'pattern' (a list of per-token match specifications).

# Words that can end a street name, e.g. "waterloo rd", "fifth avenue".
street_suff = ['street', 'st', 'avenue', 'ave', 'boulevard', 'blvd', 'highway', 'hwy',
               'circle', 'drive', 'lane', 'road', 'rd', 'way', 'place', 'court',
               'parkway', 'pkwy', 'turnpike', 'tpke', 'trnpk', 'turnpk']

# matches a noun / proper noun / adjective followed by a street suffix
street_pattern = {
    'label': "street",
    'pattern': [
        {"POS": {"IN": ["NOUN", "PROPN", "ADJ"]}},
        {"LOWER": {"IN": street_suff}},
    ],
}

# matches tokens that look like interstates, ie. I-5, I-15, I-345
# The regex is a raw string (so `\d` is not an invalid string escape), is
# anchored at the start of the token and requires at least one digit, so a
# token that merely contains "i-" (e.g. "semi-truck") is not a highway.
# Anything after the digits is still allowed (e.g. "i-35w").
interstate_pattern = {
    'label': "highway",
    'pattern': [
        {"TEXT": {"REGEX": r"^[Ii]-\d+"}},
    ],
}

# matches tokens that look like US highways, ie. US-123
# Anchored for the same reason as above (so that e.g. "bus-5" is rejected).
us_hwy_pattern = {
    'label': "highway",
    'pattern': [
        {"TEXT": {"REGEX": r"^[Uu][Ss]-\d+"}},
    ],
}

# matches spans that look like other highway names, including 'US' without the dash
oth_hwy_pattern = {
    'label': "highway",
    'pattern': [
        {'LOWER': {"IN": ['us', 'highway', 'hwy', 'route', 'rt', 'rte']}},
        {"LIKE_NUM": True},
    ],
}

# matches spans that look like exit ramp numbers
exit_pattern = {
    'label': "exit",
    'pattern': [
        {"LOWER": "exit"},
        {"LIKE_NUM": True},
    ],
}

# matches a two-letter state abbreviation followed by a number, ie. LA 14
state_hwy_pattern = {
    'label': "highway",
    'pattern': [
        {"LOWER": {"IN": ['la', 'fl', 'al', 'ga', 'nc', 'sc', 'ms']}},
        {"LIKE_NUM": True},
    ],
}

# matches spans that look like mile markers, ie. "mile marker 12", "mile post 7"
marker_pattern = {
    'label': "marker",
    'pattern': [
        {"LOWER": "mile"},
        {"LOWER": {"IN": ["marker", "post"]}},
        {"LIKE_NUM": True},
    ],
}

patterns = [
    street_pattern,
    interstate_pattern,
    us_hwy_pattern,
    state_hwy_pattern,
    oth_hwy_pattern,
    exit_pattern,
    marker_pattern,
]

In [10]:
# Build an EntityRuler from the custom road patterns and insert it into the
# pipeline before the 'ner' component.
# If this cell has already been run, the pipeline still holds the previous
# 'entity_ruler'; remove it first so the cell can be re-run (e.g. after
# editing the patterns) instead of failing on a duplicate component name.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before = 'ner')

# Data Processing
<a id="data"></a>

-----

In [36]:
# Read the raw tweets from CSV (path is relative to the working directory).
tweets_df = pd.read_csv('./tweets_data/lake_ch_tweets.csv')

In [37]:
# Column names, non-null counts and dtypes of the raw tweets.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12749 entries, 0 to 12748
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   User       12749 non-null  object
 1   Text       12729 non-null  object
 2   Date       12749 non-null  object
 3   Favorites  12749 non-null  int64 
 4   Retweets   12749 non-null  int64 
 5   Mentions   1367 non-null   object
 6   HashTags   2835 non-null   object
dtypes: int64(2), object(5)
memory usage: 697.3+ KB


In [51]:
%%time
# Full pipeline: de-duplicate, run nlp, keep rows with road entities and add
# the per-entity list columns. (%%time must stay the first line of the cell.)
tweets_df_streets = extract_road_info(tweets_df, nlp)

nlp progress: 8.6%
nlp progress: 17.1%
nlp progress: 25.7%
nlp progress: 34.3%
nlp progress: 42.9%
nlp progress: 51.4%
nlp progress: 60.0%
nlp progress: 68.6%
nlp progress: 77.2%
nlp progress: 85.7%
nlp progress: 94.3%
Tried to find matches in 11637 documents, encountered 0 errors when assigning entities.
Wall time: 1min 52s


In [43]:
# Preview the first rows of the road-annotated tweets.
tweets_df_streets.head()

Unnamed: 0,User,Text,Date,Favorites,Retweets,Mentions,HashTags,docs,streets,highways,exits,markers
18,OKDOT,EDMOND: The northbound I-35 off-ramp to Waterl...,2020-09-03 23:30:16+00:00,0,1,,#trucking,"(edmond, :, the, northbound, i-35, off, -, ram...",[waterloo rd],[i-35],[],[]
34,183South,"THUR (9/3) 8 p.m. to 6 a.m., US 183 southbound...",2020-09-03 23:00:51+00:00,0,1,,#atxtraffic,"(thur, (, 9/3, ), 8, p.m., to, 6, a.m., ,, us,...",[],[us 183],[],[]
35,OKDOT,OKC: The following ramps will be closed Thursd...,2020-09-03 23:00:11+00:00,0,0,@BNSFRailway,,"(okc, :, the, following, ramps, will, be, clos...",[63rd st],"[i-235, i-44, i-44, i-44, us-77, i-235]",[],[]
49,myBIGinsurance,Did you know that texting for five seconds whi...,2020-09-03 22:30:00+00:00,0,0,,,"(did, you, know, that, texting, for, five, sec...",[national highway],[],[],[]
91,MBenoit52,"Unbelievable. Just like Trump and his minions,...",2020-09-03 21:11:40+00:00,0,0,,,"(unbelievable, ., just, like, trump, and, his,...",[fifth avenue],[],[],[]


In [52]:
# Column names, non-null counts and dtypes after filtering and annotation.
tweets_df_streets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1160 entries, 125 to 12732
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   User       1160 non-null   object
 1   Text       1160 non-null   object
 2   Date       1160 non-null   object
 3   Favorites  1160 non-null   int64 
 4   Retweets   1160 non-null   int64 
 5   Mentions   77 non-null     object
 6   HashTags   715 non-null    object
 7   docs       1160 non-null   object
 8   streets    1160 non-null   object
 9   highways   1160 non-null   object
 10  exits      1160 non-null   object
 11  markers    1160 non-null   object
dtypes: int64(2), object(10)
memory usage: 117.8+ KB


In [54]:
# Write the road-annotated tweets to CSV.
# NOTE(review): 'docs' holds the objects returned by nlp and the road columns
# hold Python lists; presumably they are written as their string
# representations -- confirm that is what downstream readers expect.
tweets_df_streets.to_csv('tweets_data/tweets_df_extended.csv')

In [11]:
# Show the pipeline components in order, to check where the entity ruler sits.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2be3dd8e808>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x2be199714c8>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x2be3baae2c8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2be3b763a08>)]

In [12]:
import pickle

In [13]:
# Persist the configured pipeline (including the entity ruler) with pickle.
# The `with` block guarantees the file is flushed and closed even if
# pickling fails part-way.
with open('./models/tweet_nlp.p', 'wb') as model_file:
    pickle.dump(nlp, model_file)

In [51]:
# Persist the `rf` object with pickle; the `with` block guarantees the file
# is closed.
# NOTE(review): `rf` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel -- define it above or remove the cell.
with open('./models/RoadFinder.p', 'wb') as model_file:
    pickle.dump(rf, model_file)