#  In this notebook we set rules in spacy model to detect profanity words and based on that predict the degree of profanity

In [35]:
import spacy
import pandas as pd
from spacy.language import Language
import spacy_transformers
from spacy.matcher import PhraseMatcher

In [36]:
# read the racial_word file
df = pd.read_csv("racial_word.csv")
df

Unnamed: 0,text,severity_description
0,abbie,Mild
1,abeed,Strong
2,aboe,Mild
3,beaner,Severe
4,beaners,Severe
...,...,...
170,wetbacks,Severe
171,wigger,Severe
172,wop,Strong
173,wophead,Strong


In [37]:
# get the list of all 3 level of profanity words
label = df.groupby('severity_description')
mild_list = label.get_group("Mild")['text'].to_list()
strong_list = label.get_group("Strong")['text'].to_list()
severe_list = label.get_group("Severe")['text'].to_list()

In [49]:
#!python -m spacy download en_core_web_sm

In [50]:
# load model and matcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [51]:
# define patterns 
mild_patterns = [nlp(text) for text in mild_list]
strong_patterns = [nlp(text) for text in strong_list]
severe_patterns = [nlp(text) for text in severe_list]

In [48]:
# Define the custom component to detect profanity in each sentence
@Language.component("profanity_component")
def profanity_component_function(doc):
    
    #inner function to get the degreee of profanity.
    def get_label(obj):
        matcher.add("Mild",mild_patterns)
        matcher.add("Strong",strong_patterns)
        matcher.add("Severe",severe_patterns)

        if any([t.text in severe_list for t in obj]):
            return "Severe"
        elif any([t.text in strong_list for t in obj]):
            return "Strong"
        elif any([t.text.lower() in mild_list for t in obj]):
            return "Mild"
        else:
            return "Neutral"
    
    # check in each of the sentence and assigne the custom component
    for sent in doc.sents:
        sent.set_extension("Profanity_level", getter=get_label, force = True)
    
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("profanity_component", name="profanity", after="ner")

ValueError: [E007] 'profanity' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner', 'profanity']

In [52]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x191a46cd880>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x191a46cdca0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x191a3ad8dd0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x191a59c8a80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x191a59c8940>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x191a3c50f20>),
 ('profanity', <function __main__.profanity_component_function(doc)>)]

In [63]:
# Add the component to the pipeline after the "ner" component
doc = nlp("you Aboe. you are zip in the wire.")

# Process the text and print the text and label for the doc.ents
print([(sent.text, sent._.Profanity_level) for sent in doc.sents])

[('you Aboe.', 'Mild'), ('you are zip in the wire.', 'Neutral')]


# Store that model 

In [55]:
model_name = "profaniity_model" 
nlp.to_disk(model_name)

In [62]:
!python -m spacy package profaniity_model prof_model --code prof_component.py

running sdist
running egg_info
creating en_core_web_sm.egg-info
writing en_core_web_sm.egg-info\PKG-INFO
writing dependency_links to en_core_web_sm.egg-info\dependency_links.txt
writing entry points to en_core_web_sm.egg-info\entry_points.txt
writing requirements to en_core_web_sm.egg-info\requires.txt
writing top-level names to en_core_web_sm.egg-info\top_level.txt
writing manifest file 'en_core_web_sm.egg-info\SOURCES.txt'
reading manifest file 'en_core_web_sm.egg-info\SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'en_core_web_sm.egg-info\SOURCES.txt'
running check
creating en_core_web_sm-3.2.0
creating en_core_web_sm-3.2.0\en_core_web_sm
creating en_core_web_sm-3.2.0\en_core_web_sm.egg-info
creating en_core_web_sm-3.2.0\en_core_web_sm\en_core_web_sm-3.2.0
creating en_core_web_sm-3.2.0\en_core_web_sm\en_core_web_sm-3.2.0\attribute_ruler
creating en_core_web_sm-3.2.0\en_core_web_sm\en_core_web_sm-3.2.0\lemmatizer
creating en_core_web_sm-3.2.0\en_core_web_s

2022-03-16 17:44:26.722073: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-03-16 17:44:26.722121: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# you have to import this model step by step for use

#  ............................................. End......................................................