In [None]:
# import dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy
import en_core_web_sm
import pandas as pd

In [None]:
# Compact keyword dictionary: one entry per use-of-force rank.
# Each value is a list of phrases; TextMatcher joins a rank's phrases into a
# single training document. The generic police terms are repeated in every
# rank on purpose so that each rank's document mentions police.
ranked_reports = {
    "Rank 1 - Police Presence": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
    ],
    "Rank 2 - Empty-hand": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "pushed and shoved with shields",
        "grabs, holds and joint locks",
        "punch and kick",
    ],
    "Rank 3 - Blunt Force": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "rubber bullets",
        "riot rounds",
        "batons",
    ],
    "Rank 4 - Chemical & Electric": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "tear gas",
        "pepper spray",
        "flashbangs, stun grenade",
        "chemical sprays",
        # spelled "taser" (was "tazor") so the keyword can match real text
        "Conducted energy devices, CED or taser",
    ],
    "Rank 5 - Lethal Force": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "shoot and kill",
        "open fire",
        "deadly force",
        "fatal",
        "dies",
    ],
}

In [None]:
# Extended keyword dictionary: same five ranks as ranked_reports, but with
# many more phrases per rank. Each value is a list of phrases; TextMatcher
# joins a rank's phrases into a single training document.
ranked_reports1 = {
    "Rank 1 - Police Presence": [
        "policeman", "policewoman", "law enforcement",
        "police officer, cop, five-o, fuzz, DHS", 
        "protester", "FPS", "officer",
        "Federal Protective Services",
    ],
    "Rank 2 - Empty-hand": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz, DHS",
        "pushed and shoved with shields", "officer",
        "grabs, holds and joint locks",
        "punch and kick", "thrown to the ground", "hit",
        "charge a protester", "tackle to the ground", 
        "kneel on", "arrest", "protester",
        "FPS", "Federal Protective Services", "zip-ties",
        "police chase and attack", "kicking him", 
        "threw him to the ground", "handcuff him", 
        "kneeling on a protester", "pinning down", 
        "tackle", "shoved to the ground", "violent",
        "officer shove",
    ],
    "Rank 3 - Blunt Force": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "rubber bullets", "officer",
        "riot rounds",
        "batons", "blood", "hit", "arrest",
        "protester", "FPS", 
        "Federal Protective Services", 
        "strike with baton", "violent",
    ],
    "Rank 4 - Chemical & Electric": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "tear gas", "officer",
        "pepper spray",
        "flashbangs", "stun grenade",
        "chemical sprays",
        # spelled "taser" (was "tazor") so the keyword can match real text
        "Conducted energy devices, CED or taser",
        "blood", "arrest", "protester", "FPS", 
        "Federal Protective Services", "pepper balls",
        # "protesters" was misspelled "prosters"
        "using munitions on protesters", "struck by a round",
        "fire pepper balls and tear gas", 
        "struck in chest by projectile", "violent", 
        "munition", "firing a riot gun", "paintball gun",
        "shots are fired", "fire explosives", 
        "fire impact munitions",
    ],
    "Rank 5 - Lethal Force": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "shoot and kill", "protester",
        "open fire", "FPS", "officer",
        "Federal Protective Services",
        "deadly force", "fatal",
        "dies", 'kill', "arrest", "violent", 
        "shot and killed",
    ],
}

In [None]:
class TextMatcher:
    """ Generic NLP Text Matching Model

    Fits a TF-IDF vectorizer and a 1-nearest-neighbour (cosine distance)
    index on a dictionary of ``{label: [phrase, ...]}`` and then labels
    arbitrary text with the closest key. Instances are callable.
    """

    # Label returned when the input shares nothing with the training data.
    NO_MATCH_LABEL = 'Rank 0 - No Police Presence'

    class Tokenizer:
        """ Standard SpaCy Tokenizer

        Callable that returns the lower-cased lemmas of a text, skipping
        stop words and punctuation.
        """
        # Loaded once, when the class body executes; shared by all instances.
        nlp = spacy.load("en_core_web_sm")

        def __call__(self, text: str) -> list:
            """ Tokenize ``text`` into a list of lower-cased lemma strings """
            return [
                token.lemma_.lower() for token in self.nlp(text)
                if not token.is_stop and not token.is_punct
            ]

    def __init__(self, train_data: dict, ngram_range=(1, 3), max_features=8000):
        """ Model training on live data at init

        :param train_data: mapping of label -> list of keyword phrases; the
            phrases of each label are joined into one training document.
        :param ngram_range: n-gram range forwarded to ``TfidfVectorizer``.
        :param max_features: vocabulary cap forwarded to ``TfidfVectorizer``.
        """
        self.lookup = {k: ' '.join(v) for k, v in train_data.items()}
        # Position i in name_index is the label of training document i.
        self.name_index = list(self.lookup.keys())
        self.tfidf = TfidfVectorizer(
            ngram_range=ngram_range,
            tokenizer=self.Tokenizer(),
            # A custom tokenizer is supplied, so the regex pattern is unused;
            # None avoids scikit-learn's "token_pattern will not be used" warning.
            token_pattern=None,
            max_features=max_features,
        )
        # .toarray() yields a plain ndarray; .todense() yields np.matrix,
        # which recent scikit-learn releases refuse as estimator input.
        train_matrix = self.tfidf.fit_transform(list(self.lookup.values())).toarray()
        self.knn = NearestNeighbors(
            n_neighbors=1,
            leaf_size=15,  # only relevant to tree-based algorithms; kept as-is
            metric='cosine',
            algorithm='brute',
            n_jobs=-1,
        ).fit(train_matrix)
        # Distance reported for the empty string, i.e. for an input that
        # contributes no known vocabulary; used as the "no match" marker.
        self.baseline, _ = self._worker('')

    def _worker(self, user_input: str):
        """ Prediction worker method - internal only

        :return: ``(distance, index)`` of the single nearest training
            document for ``user_input``.
        """
        vec = self.tfidf.transform([user_input]).toarray()
        distances, indices = self.knn.kneighbors(vec)
        # One query row and n_neighbors=1, so each array holds one value.
        return distances[0][0], indices[0][0]

    def __call__(self, user_input: str) -> str:
        """ Callable object for making predictions

        :param user_input: text to classify. Non-string values (for example
            NaN from a pandas column) are treated as "no match".
        :return: the matching key of the training data, or
            ``'Rank 0 - No Police Presence'`` when nothing matches.
        """
        if not isinstance(user_input, str):
            return self.NO_MATCH_LABEL
        dist, idx = self._worker(user_input)
        if dist != self.baseline:
            return self.name_index[int(idx)]
        return self.NO_MATCH_LABEL

In [None]:
# fit one matcher per keyword dictionary:
# textmatcher  -> compact dictionary (ranked_reports)
# textmatcher1 -> extended dictionary (ranked_reports1)
textmatcher = TextMatcher(train_data=ranked_reports)
textmatcher1 = TextMatcher(train_data=ranked_reports1)

In [None]:
# sample incident description used to try out both models
# (the leading and trailing newline are part of the string)
text = (
    "\n"
    "During a protest at which both pro-police and Black Lives Matter "
    "protesters were present near West 7th and Lincoln, members of the two "
    "protests appear to exchange words in an intersection. Police charge a "
    "protester, a 14 year old girl according to the poster, tackle her to "
    "the ground, kneel on her and arrest her."
    "\n"
)

In [None]:
# classify the sample text with the model fitted on ranked_reports
textmatcher(text)

In [None]:
# classify the sample text with the model fitted on ranked_reports1
textmatcher1(text)

In [None]:
# load the Reddit data the model will be applied to;
# the first CSV column is used as the DataFrame index
df = pd.read_csv("reddit_data.csv", index_col=0)
df.head()

In [None]:
# label every description with the model fitted on ranked_reports.
# Missing descriptions are replaced by '' first so the tokenizer always
# receives a string; the matcher is callable, so no lambda wrapper is needed.
df["force_rank1"] = df['description'].fillna('').apply(textmatcher)

In [None]:
# number of rows per predicted category in the new force_rank1 column
df['force_rank1'].value_counts()

In [None]:
# number of rows per category in the force_rank column
# NOTE(review): 'force_rank' is not created anywhere in this notebook;
# presumably it already exists in reddit_data.csv - confirm, otherwise this
# cell raises KeyError on a fresh kernel.
df['force_rank'].value_counts()

In [None]:
# descriptions of the rows the model labelled as plain police presence
df.loc[df['force_rank1'] == "Rank 1 - Police Presence", 'description']

In [None]:
# remove the column-width limit so long descriptions are displayed in full
# (only affects DataFrames rendered after this cell runs)
pd.set_option('display.max_colwidth', None)

In [None]:
# display the DataFrame, including the added force_rank1 column
df