In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import findspark
findspark.init()

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as func
from pyspark.sql.types import *

# Language processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Language processing with TextBlob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

from collections import Counter

In [2]:
# Create spark session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Load and preprocess sample data

In [3]:
messages = spark.read.load('../sample_parquet/first_1000/')

In [4]:
# Prepare stopwords, stemmer and lemmatizer for messages preprocessing.
en_stopwords = stopwords.words('english')
en_stemmer = SnowballStemmer('english')
en_lemmatizer = WordNetLemmatizer()

In [5]:
cleaned_messages = messages.filter("body != '[removed]' and body != '[deleted]'")

In [6]:
def process_body(body, n_grams=1, left_pad_symbol=None, right_pad_symbol=None, lemmatizer=None, stemmer=None, \
                   stop_words=None, lemmatize_stop_words=False, stem_stop_words=False, remove_stop_words=False):
    """
    Process the message bodies of the given rdd
        
    Parameters:
        body: 
            string message body
        n_gram: 
            size of the n_grams in the rdd output
        lemmatizer: 
            lemmatizer to use on the message words. If None, words are not lemmatize
        stemmer: 
            stemmer to use on the message words. If None, words are not stemmed.
        stop_words: 
            list of words to consider as stop words
        lemmatize_stop_words: 
            boolean to lemmatize stop words
        stem_stop_words: 
            boolean to stem stop words
        remove_stop_words: 
            boolean to remove stop words from the tokens
        
    Returns:
        rdd of the form (parent_id, id, processed_msg_body)
    """
    
    tokens = nltk.word_tokenize(body)

    if stop_words is not None:
        if remove_stop_words:
            tokens = [token for token in tokens if token not in stop_words]

    if lemmatizer is not None:
        if remove_stop_words or stop_words is None:
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        elif not lemmatize_stop_words:
                tokens = [lemmatizer.lemmatize(token) if token not in stop_words else token for token in tokens]

    if stemmer is not None:
        if remove_stop_words or stop_words is None:
            tokens = [stemmer.stem(token) for token in tokens]
        elif not stem_stop_words:
                tokens = [stemmer.stem(token) if token not in stop_words else token for token in tokens]

    if n_grams < 1:
        raise ValueError("n_grams should be bigger than 1")
    else:
        if left_pad_symbol is not None and right_pad_symbol is not None:
            tokens = list(nltk.ngrams(tokens, n_grams, True, True, left_pad_symbol, right_pad_symbol))
        elif left_pad_symbol is not None:
            tokens = list(nltk.ngrams(tokens, n_grams, pad_left=True, left_pad_symbol=left_pad_symbol))
        elif right_pad_symbol is not None:
            tokens = list(nltk.ngrams(tokens, n_grams, pad_right=True, right_pad_symbol=right_pad_symbol))
        else:
            tokens = list(nltk.ngrams(tokens, n_grams))

    return [list(token) for token in tokens]

In [7]:
process_body_udf = func.udf(process_body, ArrayType(ArrayType(StringType(), False), False))
spark.udf.register('process_body', process_body, ArrayType(ArrayType(StringType(), False), False))

<function __main__.process_body(body, n_grams=1, left_pad_symbol=None, right_pad_symbol=None, lemmatizer=None, stemmer=None, stop_words=None, lemmatize_stop_words=False, stem_stop_words=False, remove_stop_words=False)>

# Sentence polarity using NLTK

In [8]:
def compute_nltk_polarity(msg_body):
    sid = SentimentIntensityAnalyzer()
    msg_body = sid.polarity_scores(msg_body)
    return msg_body

compute_nltk_polarity_udf = func.udf(compute_nltk_polarity, MapType(StringType(), FloatType(), False))
spark.udf.register('compute_nltk_polarity', compute_nltk_polarity_udf)

<function __main__.compute_nltk_polarity(msg_body)>

In [9]:
sent_bodies = cleaned_messages.selectExpr('id', "compute_nltk_polarity(body) as scores")
sent_nltk_scores = sent_bodies.select('id', 'scores.neg', 'scores.neu', 'scores.pos')
sent_nltk_scores = sent_nltk_scores.toDF('id', 'nltk_negativity', 'nltk_neutrality', 'nltk_positivity')

In [10]:
sent_nltk_scores.show()

+-------+---------------+---------------+---------------+
|     id|nltk_negativity|nltk_neutrality|nltk_positivity|
+-------+---------------+---------------+---------------+
|c595rma|          0.065|          0.935|            0.0|
|c595rqe|            0.0|            1.0|            0.0|
|c595rwc|            0.0|            1.0|            0.0|
|c595sdr|            0.0|          0.878|          0.122|
|c595sop|            0.0|            1.0|            0.0|
|c595sui|            0.0|           0.84|           0.16|
|c595t5u|           0.36|          0.443|          0.197|
|c595ti7|            0.0|            1.0|            0.0|
|c595u4r|            0.0|            1.0|            0.0|
|c595ule|          0.377|          0.623|            0.0|
|c595up4|            0.0|          0.408|          0.592|
|c595v6i|          0.456|          0.544|            0.0|
|c595w8b|            0.0|            1.0|            0.0|
|c595whq|            0.0|            1.0|            0.0|
|c595xbt|     

# Sentence polarity using TextBlob

### Using simple sentence polarity analysis

In [11]:
def compute_blob_polarity(msg_body):
    sentiment = TextBlob(msg_body).sentiment
    return {'polarity': sentiment.polarity, 'subjectivity': sentiment.subjectivity}

compute_blob_polarity_udf = func.udf(compute_blob_polarity, MapType(StringType(), FloatType(), False))
spark.udf.register('compute_blob_polarity', compute_blob_polarity_udf)

<function __main__.compute_blob_polarity(msg_body)>

In [12]:
sent_blob_bodies = cleaned_messages.selectExpr('id', "compute_blob_polarity(body) as scores")
sent_blob_scores = sent_blob_bodies.select('id', 'scores.polarity', 'scores.subjectivity')
sent_blob_scores = sent_blob_scores.toDF('id', 'text_blob_polarity', 'text_blob_subjectivity')

In [13]:
sent_blob_scores.show()

+-------+------------------+----------------------+
|     id|text_blob_polarity|text_blob_subjectivity|
+-------+------------------+----------------------+
|c595rma|      -0.041666668|                 0.425|
|c595rqe|               0.0|                   0.0|
|c595rwc|        0.20454545|             0.6818182|
|c595sdr|               0.5|                   0.5|
|c595sop|               0.0|                   0.0|
|c595sui|        0.06666667|                   0.3|
|c595t5u|               1.0|                   1.0|
|c595ti7|               0.0|                   0.0|
|c595u4r|               0.0|                   0.5|
|c595ule|       -0.15982144|            0.60892856|
|c595up4|        0.43333334|             0.8333333|
|c595v6i|       -0.41666666|             0.6666667|
|c595w8b|               0.0|                   0.0|
|c595whq|          -0.15625|               0.40625|
|c595xbt|        -0.6041667|             0.7513889|
|c595yos|               0.1|            0.23333333|
|c595ypd|   

### Using twitter trained positive/negative naive bayes classifier

In [14]:
def compute_blob_class_polarity(msg_body):
    pol_class = TextBlob(msg_body, analyzer=NaiveBayesAnalyzer()).sentiment
    return {'classification': -1 if pol_class.classification == 'neg' else 1, 'p_pos': pol_class.p_pos, 'p_neg': pol_class.p_neg}

compute_blob_class_polarity_udf = func.udf(compute_blob_class_polarity, MapType(StringType(), FloatType(), False))
spark.udf.register('compute_blob_class_polarity', compute_blob_class_polarity_udf)

<function __main__.compute_blob_class_polarity(msg_body)>

In [15]:
sent_blob_class_bodies = cleaned_messages.selectExpr('id', "compute_blob_class_polarity(body) as scores")
sent_blob_class_scores = sent_blob_class_bodies.select('id', 'scores.classification', 'scores.p_pos', 'scores.p_neg')

In [16]:
# This does not finish, classifier takes too long
# sent_blob_class_scores.show()

# Other metrics (Vulgarity, hate speech)

In [17]:
tokens = cleaned_messages.selectExpr('id', 'process_body(body) as tokens')
tokens.show()

+-------+--------------------+
|     id|              tokens|
+-------+--------------------+
|c595rma|[[The], [fear], [...|
|c595rqe|     [[Upvote], [!]]|
|c595rwc|[[It], ['s], [rea...|
|c595sdr|[[What], ['s], [h...|
|c595sop|[[does], [it], [e...|
|c595sui|[[Your], [user], ...|
|c595t5u|[[nope], [:], [D]...|
|c595ti7|[[Along], [with],...|
|c595u4r|[[Those], [both],...|
|c595ule|[[If], [you], [ar...|
|c595up4|[[Because], [easy...|
|c595v6i|[[Does], [no], [o...|
|c595w8b|[[Has], [the], [t...|
|c595whq|[[Just], [because...|
|c595xbt|[[Also], [,], [do...|
|c595yos|[[Meh], [,], [it]...|
|c595ypd|[[You], [enjoy], ...|
|c595z4z|[[[], [Finnish], ...|
|c595zjd|[[&], [gt], [;], ...|
|c595zrv|[[Good], [insight...|
+-------+--------------------+
only showing top 20 rows



In [31]:
def count_matches(msg_grams, ref_grams, ref_grams_intensity=None):
    msg_grams_joined = [' '.join(msg_gram) for msg_gram in msg_grams]
    msg_grams_counter = Counter(msg_grams_joined)
    count = 0
    intensity = 0
    for i, ref_gram in enumerate(ref_grams):
        count = count + msg_grams_counter[ref_gram]
        if ref_grams_intensity is not None:
            intensity = intensity + msg_grams_counter[ref_gram] * ref_grams_intensity[i]
    
    if ref_grams_intensity is None:
        return count
    else: 
        return {'count':count, 'intensity':intensity}
    
def df_count_matches(gram_list):
    return func.udf(lambda c: count_matches(c, gram_list), IntegerType())

def df_count_matches_intensity(gram_list, intensity_list):
    return func.udf(lambda c: count_matches(c, gram_list, intensity_list), MapType(StringType(), FloatType()))

## Vulgarity

In [20]:
bad_words = spark.read.csv('../bad_words_lexicon/en.csv', header=True)
bw_gram_rank = bad_words.withColumn('gram_rank', func.udf(lambda gram: len(gram.split()), IntegerType())(func.col('en_bad_words')))
bw_gram_rank.show()

+------------------+---------+
|      en_bad_words|gram_rank|
+------------------+---------+
|              2g1c|        1|
|     2 girls 1 cup|        4|
|    acrotomophilia|        1|
|alabama hot pocket|        3|
|  alaskan pipeline|        2|
|              anal|        1|
|         anilingus|        1|
|              anus|        1|
|           apeshit|        1|
|          arsehole|        1|
|               ass|        1|
|           asshole|        1|
|          assmunch|        1|
|       auto erotic|        2|
|        autoerotic|        1|
|          babeland|        1|
|       baby batter|        2|
|        baby juice|        2|
|          ball gag|        2|
|        ball gravy|        2|
+------------------+---------+
only showing top 20 rows



In [21]:
bw_1_grams = [i.en_bad_words for i in bw_gram_rank.filter('gram_rank == 1').select('en_bad_words').collect()]
bw_1_grams[0:5]

['2g1c', 'acrotomophilia', 'anal', 'anilingus', 'anus']

In [22]:
bw_counter = tokens.withColumn("tokens", df_count_matches(bw_1_grams)(func.col("tokens"))).withColumnRenamed('tokens', 'nb_bw_matches')
bw_counter.show()

+-------+-------------+
|     id|nb_bw_matches|
+-------+-------------+
|c595rma|            0|
|c595rqe|            0|
|c595rwc|            0|
|c595sdr|            0|
|c595sop|            0|
|c595sui|            0|
|c595t5u|            0|
|c595ti7|            0|
|c595u4r|            0|
|c595ule|            0|
|c595up4|            0|
|c595v6i|            0|
|c595w8b|            0|
|c595whq|            0|
|c595xbt|            0|
|c595yos|            0|
|c595ypd|            0|
|c595z4z|            0|
|c595zjd|            0|
|c595zrv|            0|
+-------+-------------+
only showing top 20 rows



## Hate speech

### Raw hate words (basic)

In [23]:
hate_words = spark.read.csv('../hatespeech_lexicon/hatebase_dict.csv', header=True)
hate_words = hate_words.withColumnRenamed("uncivilised',", 'hate_words') \
                        .withColumn('hate_words', func.udf(lambda d: d[1:-2])(func.col('hate_words')))
hw_gram_rank = hate_words.withColumn('gram_rank', func.udf(lambda gram: len(gram.split()), IntegerType())(func.col('hate_words')))
hw_gram_rank.show()

+------------+---------+
|  hate_words|gram_rank|
+------------+---------+
|        gypo|        1|
|       gypos|        1|
|        cunt|        1|
|       cunts|        1|
|  peckerwood|        1|
| peckerwoods|        1|
|     raghead|        1|
|    ragheads|        1|
|     cripple|        1|
|    cripples|        1|
|      niggur|        1|
|     niggurs|        1|
| yellow bone|        2|
|yellow bones|        2|
|      muzzie|        1|
|     muzzies|        1|
|      niggar|        1|
|     niggars|        1|
|      nigger|        1|
|     niggers|        1|
+------------+---------+
only showing top 20 rows



In [24]:
hw_1_grams = [i.hate_words for i in hw_gram_rank.filter('gram_rank == 1').select('hate_words').collect()]
hw_1_grams[0:5]

['gypo', 'gypos', 'cunt', 'cunts', 'peckerwood']

In [25]:
hw_counter = tokens.withColumn("tokens", df_count_matches(hw_1_grams)(func.col("tokens"))).withColumnRenamed('tokens', 'nb_hw_matches')
hw_counter.show()

+-------+-------------+
|     id|nb_hw_matches|
+-------+-------------+
|c595rma|            0|
|c595rqe|            0|
|c595rwc|            0|
|c595sdr|            0|
|c595sop|            0|
|c595sui|            0|
|c595t5u|            0|
|c595ti7|            0|
|c595u4r|            0|
|c595ule|            1|
|c595up4|            0|
|c595v6i|            0|
|c595w8b|            0|
|c595whq|            0|
|c595xbt|            0|
|c595yos|            0|
|c595ypd|            0|
|c595z4z|            0|
|c595zjd|            0|
|c595zrv|            0|
+-------+-------------+
only showing top 20 rows



### Refined hate words

In [26]:
hw_ref_schema = StructType([StructField('hate_words_ref', StringType(), False), StructField('intensity', FloatType(), False)])
hate_words_ref = spark.read.csv('../hatespeech_lexicon/refined_ngram_dict.csv', header=True, schema=hw_ref_schema)
hw_ref_gram_rank = hate_words_ref.withColumn('gram_rank', func.udf(lambda gram: len(gram.split()), IntegerType())(func.col('hate_words_ref')))
hw_ref_gram_rank.show()

+--------------+---------+---------+
|hate_words_ref|intensity|gram_rank|
+--------------+---------+---------+
|   allah akbar|     0.87|        2|
|        blacks|    0.583|        1|
|         chink|    0.467|        1|
|        chinks|    0.542|        1|
|         dykes|    0.602|        1|
|        faggot|    0.489|        1|
|       faggots|    0.675|        1|
|          fags|    0.543|        1|
|          homo|    0.667|        1|
|        inbred|    0.583|        1|
|        nigger|    0.584|        1|
|       niggers|    0.672|        1|
|        queers|      0.5|        1|
|         raped|    0.717|        1|
|       savages|    0.778|        1|
|         slave|    0.667|        1|
|          spic|     0.75|        1|
|       wetback|    0.667|        1|
|      wetbacks|    0.688|        1|
|        whites|    0.556|        1|
+--------------+---------+---------+
only showing top 20 rows



In [27]:
hw_ref_1_grams = [i.hate_words_ref for i in hw_ref_gram_rank.filter('gram_rank == 1').select('hate_words_ref').collect()]
hw_ref_1_intensity = [i.intensity for i in hw_ref_gram_rank.filter('gram_rank == 1').select('intensity').collect()]
hw_ref_1_grams[0:5], hw_ref_1_intensity[0:5]

(['blacks', 'chink', 'chinks', 'dykes', 'faggot'],
 [0.5830000042915344,
  0.46700000762939453,
  0.5419999957084656,
  0.6019999980926514,
  0.48899999260902405])

In [32]:
hw_ref_counter = tokens.withColumn("tokens", df_count_matches_intensity(hw_ref_1_grams, hw_ref_1_intensity)(func.col("tokens"))).withColumnRenamed('tokens', 'nb_hw_ref_matches')
hw_ref_scores = hw_ref_counter.select('id', 'nb_hw_ref_matches.intensity', 'nb_hw_ref_matches.count')
hw_ref_scores.show()

+-------+---------+-----+
|     id|intensity|count|
+-------+---------+-----+
|c595rma|      0.0| null|
|c595rqe|      0.0| null|
|c595rwc|      0.0| null|
|c595sdr|      0.0| null|
|c595sop|      0.0| null|
|c595sui|      0.0| null|
|c595t5u|      0.0| null|
|c595ti7|      0.0| null|
|c595u4r|      0.0| null|
|c595ule|      0.0| null|
|c595up4|      0.0| null|
|c595v6i|      0.0| null|
|c595w8b|      0.0| null|
|c595whq|      0.0| null|
|c595xbt|      0.0| null|
|c595yos|      0.0| null|
|c595ypd|      0.0| null|
|c595z4z|      0.0| null|
|c595zjd|      0.0| null|
|c595zrv|      0.0| null|
+-------+---------+-----+
only showing top 20 rows

