

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SENTIMENT_SW.ipynb)






# **Sentiment Analysis of Swahili texts**

## 1. Colab Setup

In [None]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.1

In [2]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [3]:
# Start Spark through the Spark NLP helper; `spark` is the handle the later
# cells use to build the input DataFrame.
spark = sparknlp.start()

## 3. Sample texts

In [4]:
# Swahili sample sentences fed to the sentiment pipeline, one per list entry.
text_list = [
    "Tukio bora katika sinema ilikuwa wakati Gerardo anajaribu kupata wimbo ambao unaendelea kupitia kichwa chake.",
    "Ni dharau kwa akili ya mtu na upotezaji mkubwa wa pesa",
    "Kris Kristoffersen ni mzuri kwenye sinema hii na kweli hufanya tofauti.",
    "Hadithi yenyewe ni ya kutabirika tu na ya uvivu.",
    "Ninapendekeza hizi kwa kuwa zinaonekana nzuri sana, kifahari na nzuri",
    "Safaricom si muache kucheza na mkopo wa nambari yangu tafadhali. mnanifilisisha😓😓😯",
    # NOTE: the trailing space in the next sentence is part of the sample.
    "Bidhaa ilikuwa bora na inafanya kazi vizuri kuliko ya verizon na bei ilikuwa rahisi ",
    "Siwezi kuona jinsi sinema hii inavyoweza kuwa msukumo kwa mtu yeyote kushinda woga na kukataliwa.",
    "Sinema hii inasawazishwa vizuri na vichekesho na mchezo wa kuigiza na nilijifurahisha sana.",
]

## 4. Define Spark NLP pipeline

In [5]:
# Stage 1: read the raw "text" column and emit "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: "document" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: "token" -> "normalized".
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Stage 4: pretrained Swahili stop-word list, matched case-insensitively.
stopwords_cleaner = (
    StopWordsCleaner.pretrained("stopwords_sw", "sw")
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Stage 5: pretrained XLM-RoBERTa embeddings over the cleaned tokens.
embeddings = (
    XlmRoBertaEmbeddings.pretrained("xlm_roberta_base_finetuned_swahili", "sw")
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("embeddings")
)

# Stage 6: pool the token embeddings into one vector per document (AVERAGE).
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Stage 7: pretrained sentiment classifier; predictions land in "class_".
sentimentClassifier = (
    ClassifierDLModel.pretrained("classifierdl_xlm_roberta_sentiment", "sw")
    .setInputCols(["document", "sentence_embeddings"])
    .setOutputCol("class_")
)

# Assemble the stages in the order their input/output columns require.
sw_pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    embeddings,
    embeddingsSentence,
    sentimentClassifier,
])

stopwords_sw download started this may take some time.
Approximate size to download 1.5 KB
[OK!]
xlm_roberta_base_finetuned_swahili download started this may take some time.
Approximate size to download 994.1 MB
[OK!]
classifierdl_xlm_roberta_sentiment download started this may take some time.
Approximate size to download 21.9 MB
[OK!]


## 5. Run the pipeline

In [6]:
# Put the sample sentences into a single string column named "text",
# which is the column the pipeline's first stage reads.
df = spark.createDataFrame(text_list, StringType()).toDF("text")

# Fit the pipeline on the samples, then annotate the same DataFrame.
sw_model = sw_pipeline.fit(df)
result = sw_model.transform(df)

## 6. Visualize results

In [7]:
# Zip each document's text with its predicted label, then explode so every
# (text, label) pair becomes its own row under the struct column "cols".
text_label_pairs = F.arrays_zip(result.document.result, result.class_.result)
pairs_df = result.select(F.explode(text_label_pairs).alias("cols"))

# Pull the two zipped fields out by position and print them untruncated.
pairs_df.select(
    F.expr("cols['0']").alias("chunk"),
    F.expr("cols['1']").alias("result"),
).show(truncate=False)

+-------------------------------------------------------------------------------------------------------------+--------+
|chunk                                                                                                        |result  |
+-------------------------------------------------------------------------------------------------------------+--------+
|Tukio bora katika sinema ilikuwa wakati Gerardo anajaribu kupata wimbo ambao unaendelea kupitia kichwa chake.|Positive|
|Ni dharau kwa akili ya mtu na upotezaji mkubwa wa pesa                                                       |Negative|
|Kris Kristoffersen ni mzuri kwenye sinema hii na kweli hufanya tofauti.                                      |Positive|
|Hadithi yenyewe ni ya kutabirika tu na ya uvivu.                                                             |Negative|
|Ninapendekeza hizi kwa kuwa zinaonekana nzuri sana, kifahari na nzuri                                        |Positive|
|Safaricom si muache kucheza na 