

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_EN_CYBERBULLYING.ipynb)




# **Detect cyberbullying in Turkish tweets**

## 1. Colab Setup

In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash
# !bash colab.sh
# -p is for pyspark
# -s is for spark-nlp
# !bash colab.sh -p 3.1.1 -s 3.0.1
# by default they are set to the latest

--2021-08-13 11:47:48--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-08-13 11:47:48--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-08-13 11:47:49 (27.7 MB/s) - written to stdout [1608/1608]

setup Colab for PySpark 3.1.2 and Spark NLP 3.2.1
Get:1 http://security.ubuntu.com/ubuntu 

In [None]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [None]:
# Create the Spark session through Spark NLP's start() helper; later cells use
# `spark` to build the input DataFrames.
spark = sparknlp.start()

## 3. Select the DL model

In [None]:
# Name of the pretrained classifier loaded by the pipeline definition below.
MODEL_NAME = "classifierdl_berturk_cyberbullying"

## 4. Sample texts

In [None]:
# Turkish sample sentences fed to the classifier (contents unchanged).
text_list = [
    "Gidişin olsun, dönüşün olmasın inşallah senin..",
    "Gidişin ile dönüşün çok sürmez inşallah senin.",
    "Geberesice sırtlan soyu seni.",
    "Sırtlanların çölde geberdiğini görünce üzülen bir insandım.",
    "Bu ne aptal bir adam böyle.",
]

## 5. Define Spark NLP pipeline

In [None]:
# Stage 1: wrap the raw "text" column into Spark NLP documents.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: normalize the tokens (annotator defaults).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Stage 4: remove stop words, ignoring case.
# NOTE(review): no stop-word list is configured, so the annotator's default
# list applies -- confirm it is the intended one for Turkish input.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Stage 5: lemmatize the remaining tokens.
# NOTE(review): 'lemma_antbnc' is loaded without a language argument while the
# other pretrained stages pass 'tr' -- confirm this is the lemmatizer the
# classifier expects before swapping it.
lemma = (
    LemmatizerModel.pretrained("lemma_antbnc")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Stage 6: token-level embeddings from the Turkish uncased BERT model.
berturk_embeddings = (
    BertEmbeddings.pretrained("bert_base_turkish_uncased", "tr")
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
)

# Stage 7: average the token embeddings into one vector per document.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Stage 8: classify each document from its sentence embedding.
document_classifier = (
    ClassifierDLModel.pretrained(MODEL_NAME, "tr")
    .setInputCols(["document", "sentence_embeddings"])
    .setOutputCol("class")
)

# Assemble the stages in execution order.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        berturk_embeddings,
        embeddingsSentence,
        document_classifier,
    ]
)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_turkish_uncased download started this may take some time.
Approximate size to download 395.8 MB
[OK!]
classifierdl_berturk_cyberbullying download started this may take some time.
Approximate size to download 22.5 MB
[OK!]


## 6. Run the pipeline

In [None]:
# Fitting on a one-row, empty-string frame turns the Pipeline into a
# PipelineModel; the model stages were already loaded with pretrained().
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Build the input frame straight from the Python list, the same way as
# empty_df. Converting through pandas.DataFrame can fail when the installed
# pandas is newer than the installed PySpark supports (pandas 2.x removed
# DataFrame.iteritems, which older PySpark releases call).
df = spark.createDataFrame([[text] for text in text_list]).toDF("text")
result = pipelineModel.transform(df)

In [None]:
# Print a truncated preview of the transformed frame: the input text plus one
# column per pipeline stage output.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|          normalized|         cleanTokens|               lemma|          embeddings| sentence_embeddings|               class|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Gidişin olsun, dö...|[{document, 0, 46...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{word_embeddings...|[{sentence_embedd...|[{category, 0, 46...|
|Gidişin ile dönüş...|[{document, 0, 45...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{token, 0, 6, Gi...|[{word_embeddings...|[{sentence_embedd...|[{category, 0, 45...|
|Geberesice sırtla...|[{document, 0, 28...|[{

In [None]:
# Bring the "class" annotations to the driver. In each annotation, `result`
# holds the predicted label and `metadata` holds a score per label.
# NOTE(review): in the sample output the insulting sentences are labelled
# 'Negatif' -- presumably that is the bullying label; confirm on the model card.
result.select("class").collect()

[Row(class=[Row(annotatorType='category', begin=0, end=46, result='Pozitif', metadata={'sentence': '0', 'Negatif': '2.1142909E-5', 'Pozitif': '0.9999789'}, embeddings=[])]),
 Row(class=[Row(annotatorType='category', begin=0, end=45, result='Pozitif', metadata={'sentence': '0', 'Negatif': '1.9657963E-4', 'Pozitif': '0.9998035'}, embeddings=[])]),
 Row(class=[Row(annotatorType='category', begin=0, end=28, result='Negatif', metadata={'sentence': '0', 'Negatif': '0.9805547', 'Pozitif': '0.019445298'}, embeddings=[])]),
 Row(class=[Row(annotatorType='category', begin=0, end=58, result='Pozitif', metadata={'sentence': '0', 'Negatif': '1.4278975E-4', 'Pozitif': '0.9998572'}, embeddings=[])]),
 Row(class=[Row(annotatorType='category', begin=0, end=26, result='Negatif', metadata={'sentence': '0', 'Negatif': '0.9543243', 'Pozitif': '0.045675673'}, embeddings=[])])]

## 7. Visualize results

In [None]:
# Pair each document text with its predicted label, one output row per pair.
# The zipped struct is expanded with "cols.*" and its columns are renamed by
# position, because the field names arrays_zip generates differ across Spark
# releases ('0'/'1' in some, the source field name 'result' in others), which
# makes cols['0'] / cols['1'] lookups fail on the latter.
result.select(F.explode(F.arrays_zip('document.result', 'class.result')).alias("cols")) \
.select("cols.*") \
.toDF("document", "class") \
.show(truncate=False)

+-----------------------------------------------------------+-------+
|document                                                   |class  |
+-----------------------------------------------------------+-------+
|Gidişin olsun, dönüşün olmasın inşallah senin..            |Pozitif|
|Gidişin ile dönüşün çok sürmez inşallah senin.             |Pozitif|
|Geberesice sırtlan soyu seni.                              |Negatif|
|Sırtlanların çölde geberdiğini görünce üzülen bir insandım.|Pozitif|
|Bu ne aptal bir adam böyle.                                |Negatif|
+-----------------------------------------------------------+-------+

