

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_EN_CYBERBULLYING.ipynb)




# **Detect bullying in tweets**

## 1. Colab Setup

In [None]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

In [2]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

## 2. Start Spark Session

In [3]:
# Create the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the installed Spark NLP version, then display the session object.
nlp_version = sparknlp.version()
print("Spark NLP Version :", nlp_version)
spark

Spark NLP Version : 4.2.8


## 3. Select the DL model

In [4]:
# Name of the pretrained classifier loaded in the pipeline section below.
MODEL_NAME = "classifierdl_berturk_cyberbullying"

## 4. Sample examples

In [5]:
# Sample Turkish sentences that are fed to the classifier.
text_list = [
    "Gidişin olsun, dönüşün olmasın inşallah senin..",
    "Gidişin ile dönüşün çok sürmez inşallah senin.",
    "Geberesice sırtlan soyu seni.",
    "Sırtlanların çölde geberdiğini görünce üzülen bir insandım.",
    "Bu ne aptal bir adam böyle.",
]

## 5. Define Spark NLP pipeline

In [7]:
# Stage 1: read the "text" column and produce the "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: normalize the tokens (default Normalizer settings).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Stage 4: drop stop words, matching case-insensitively.
# NOTE(review): no stop-word list or language is set here, so the annotator's
# defaults apply although the input text is Turkish — confirm this matches the
# preprocessing the classifier was trained with.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Stage 5: lemmatize the remaining tokens.
# NOTE(review): 'lemma_antbnc' is requested without a language code, unlike the
# 'tr' embeddings and classifier below — confirm this is intentional.
lemma = (
    LemmatizerModel.pretrained("lemma_antbnc")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Stage 6: token-level BERT embeddings computed over the lemma column.
berturk_embeddings = (
    BertEmbeddings.pretrained("bert_base_turkish_uncased", "tr")
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
)

# Stage 7: pool the token embeddings into one vector per document by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Stage 8: pretrained classifier selected by MODEL_NAME; predictions are
# written to the "class_" column.
document_classifier = (
    ClassifierDLModel.pretrained(MODEL_NAME, "tr")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class_")
)

# Assemble the stages in the order each one consumes its predecessor's output.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        berturk_embeddings,
        embeddingsSentence,
        document_classifier,
    ]
)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_turkish_uncased download started this may take some time.
Approximate size to download 395.8 MB
[OK!]
classifierdl_berturk_cyberbullying download started this may take some time.
Approximate size to download 22.5 MB
[OK!]


## 6. Run the pipeline

In [8]:
# Build a single-column ("text") DataFrame from the sample sentences.
df = spark.createDataFrame(text_list, StringType()).toDF("text")

# Fit the pipeline on the samples, then annotate those same rows.
fitted_pipeline = nlpPipeline.fit(df)
result = fitted_pipeline.transform(df)

## 7. Visualize results

In [9]:
# Zip each document's text with its predicted label and emit one row per pair.
zipped_cols = F.arrays_zip(result.document.result, result.class_.result)
exploded = result.select(F.explode(zipped_cols).alias("cols"))

# The zipped struct exposes its fields as '0' (document text) and '1' (label).
document_col = F.expr("cols['0']").alias("document")
class_col = F.expr("cols['1']").alias("class")

exploded.select(document_col, class_col).show(truncate=False)

+-----------------------------------------------------------+-------+
|document                                                   |class  |
+-----------------------------------------------------------+-------+
|Gidişin olsun, dönüşün olmasın inşallah senin..            |Pozitif|
|Gidişin ile dönüşün çok sürmez inşallah senin.             |Pozitif|
|Geberesice sırtlan soyu seni.                              |Negatif|
|Sırtlanların çölde geberdiğini görünce üzülen bir insandım.|Pozitif|
|Bu ne aptal bir adam böyle.                                |Negatif|
+-----------------------------------------------------------+-------+

