> About this Dataset:

You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:

toxic
severe_toxic
obscene
threat
insult
identity_hate

In [1]:
import pandas as pd

from pyspark import SparkConf, SparkContext, HiveContext
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

In [2]:
# Build a spark context
conf = SparkConf().setAppName("NLP Practise")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

In [3]:
def to_spark_df(fin):
    """
    Parse a filepath to a spark dataframe using the pandas api.
    
    Parameters
    ----------
    fin : str
        The path to the file on the local filesystem that contains the csv data.
        
    Returns
    -------
    df : pyspark.sql.dataframe.DataFrame
        A spark DataFrame containing the parsed csv data.
    """
    df = pd.read_csv(fin)
    df.fillna("", inplace=True)
    df = hc.createDataFrame(df)
    return(df)

train = to_spark_df("train.csv")
test = to_spark_df("test.csv")

> Let View our data

In [4]:
train.take(5)

[Row(id='0000997932d777bf', comment_text="Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Row(id='000103f0d9cfb60f', comment_text="D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)", toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Row(id='000113f07ec002fd', comment_text="Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.", toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Row(id='0001b41b1c6bb37e', comme

> Cols other than Id, Comment_Text

In [5]:
out_cols = [i for i in train.columns if i not in ["id", "comment_text"]]

print(out_cols)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [6]:
train.show(5)

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|0000997932d777bf|Explanation
Why t...|    0|           0|      0|     0|     0|            0|
|000103f0d9cfb60f|D'aww! He matches...|    0|           0|      0|     0|     0|            0|
|000113f07ec002fd|Hey man, I'm real...|    0|           0|      0|     0|     0|            0|
|0001b41b1c6bb37e|"
More
I can't ma...|    0|           0|      0|     0|     0|            0|
|0001d958c54c6e35|You, sir, are my ...|    0|           0|      0|     0|     0|            0|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



> See some toxic comments below

In [7]:
train.filter(F.col('toxic') == 1).show(5)

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|0002bcb3da6cb337|COCKSUCKER BEFORE...|    1|           1|      1|     0|     1|            0|
|0005c987bdfc9d4b|Hey... what is it...|    1|           0|      0|     0|     0|            0|
|0007e25b2121310b|Bye! 

Don't look...|    1|           0|      0|     0|     0|            0|
|001810bf8c45bf5f|You are gay or an...|    1|           0|      1|     0|     1|            1|
|00190820581d90ce|FUCK YOUR FILTHY ...|    1|           0|      1|     0|     1|            0|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



> Tokenization + Counting Words

In [8]:
# Basic sentence tokenizer
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
wordsData = tokenizer.transform(train)

In [9]:
# Count the words in a document
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)

> What is output of raw features?

In [10]:
tf.select('rawFeatures').take(2)


[Row(rawFeatures=SparseVector(262144, {19208: 1.0, 23032: 1.0, 24417: 1.0, 25000: 1.0, 29945: 1.0, 32241: 1.0, 32976: 1.0, 37852: 1.0, 46075: 1.0, 59853: 1.0, 72125: 1.0, 77971: 1.0, 81631: 1.0, 82999: 1.0, 83922: 1.0, 91677: 1.0, 97171: 1.0, 100258: 1.0, 101169: 1.0, 103838: 3.0, 110427: 1.0, 113031: 1.0, 113418: 1.0, 135568: 1.0, 139533: 1.0, 140784: 1.0, 145284: 1.0, 151536: 1.0, 164148: 1.0, 169364: 1.0, 176964: 1.0, 182267: 1.0, 192137: 1.0, 193131: 1.0, 229137: 1.0, 230921: 1.0, 231630: 1.0, 244466: 1.0, 246621: 1.0, 249835: 1.0, 253170: 1.0})),
 Row(rawFeatures=SparseVector(262144, {17429: 1.0, 38728: 1.0, 83815: 1.0, 88337: 1.0, 101527: 1.0, 101833: 1.0, 108541: 1.0, 125765: 1.0, 141219: 1.0, 151980: 1.0, 169364: 1.0, 169800: 1.0, 203235: 1.0, 208090: 1.0, 219140: 1.0, 242101: 1.0, 248135: 1.0, 249180: 1.0}))]

> Build the idf model and transform the original token frequencies into their tf-idf counterparts

In [11]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf) 
tfidf = idfModel.transform(tf)

In [12]:
tfidf.select("features").first()

Row(features=SparseVector(262144, {19208: 2.244, 23032: 5.0123, 24417: 0.7386, 25000: 5.6813, 29945: 3.0517, 32241: 8.3967, 32976: 5.0285, 37852: 1.7539, 46075: 6.9564, 59853: 3.1525, 72125: 2.2744, 77971: 7.6108, 81631: 3.4198, 82999: 7.5735, 83922: 6.4588, 91677: 0.6965, 97171: 2.0163, 100258: 1.1947, 101169: 1.734, 103838: 1.2127, 110427: 2.1173, 113031: 8.9845, 113418: 2.2023, 135568: 3.5864, 139533: 2.5136, 140784: 3.0483, 145284: 7.6628, 151536: 2.2412, 164148: 6.0064, 169364: 2.4772, 176964: 1.7656, 182267: 8.613, 192137: 3.1018, 193131: 5.6703, 229137: 4.5705, 230921: 2.0429, 231630: 8.2914, 244466: 3.351, 246621: 10.0343, 249835: 6.827, 253170: 2.7021}))

> ### Now we will Build a logistic regression model for the binary toxic column. Use the features column (the tfidf values) as the input vectors, X, and the toxic column as output vector, y.

In [13]:
REG = 0.1
lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG)

In [14]:
lrModel = lr.fit(tfidf)
res_train = lrModel.transform(tfidf)

In [15]:
res_train.select("id", "toxic", "probability", "prediction").show(20)

+----------------+-----+--------------------+----------+
|              id|toxic|         probability|prediction|
+----------------+-----+--------------------+----------+
|0000997932d777bf|    0|[0.97368992684279...|       0.0|
|000103f0d9cfb60f|    0|[0.95948139546688...|       0.0|
|000113f07ec002fd|    0|[0.93109658205577...|       0.0|
|0001b41b1c6bb37e|    0|[0.99586501269490...|       0.0|
|0001d958c54c6e35|    0|[0.92273425793839...|       0.0|
|00025465d4725e87|    0|[0.94530661692103...|       0.0|
|0002bcb3da6cb337|    1|[0.42081613320235...|       1.0|
|00031b1e95af7921|    0|[0.94608092753034...|       0.0|
|00037261f536c51d|    0|[0.97727100708069...|       0.0|
|00040093b2687caa|    0|[0.93572322050051...|       0.0|
|0005300084f90edc|    0|[0.99999844562020...|       0.0|
|00054a5e18b50dd4|    0|[0.93125063065614...|       0.0|
|0005c987bdfc9d4b|    1|[0.18431432385029...|       1.0|
|0006f16e4e9f292e|    0|[0.99511852321289...|       0.0|
|00070ef96486d6f9|    0|[0.9083

In [16]:
res_train.select(F.col('probability'))

DataFrame[probability: vector]

> Create a user-defined function (udf) to select the second element in each row of the column vector

In [17]:
extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())


In [18]:
(res_train.withColumn("proba", extract_prob("probability"))
 .select("proba", "prediction")
 .show())

+------------+----------+
|       proba|prediction|
+------------+----------+
| 0.026310073|       0.0|
| 0.040518604|       0.0|
|  0.06890342|       0.0|
|0.0041349875|       0.0|
|  0.07726574|       0.0|
| 0.054693382|       0.0|
|   0.5791839|       1.0|
| 0.053919073|       0.0|
| 0.022728993|       0.0|
|  0.06427678|       0.0|
|1.5543798E-6|       0.0|
|  0.06874937|       0.0|
|   0.8156857|       1.0|
| 0.004881477|       0.0|
|  0.09163179|       0.0|
| 0.010075787|       0.0|
|  0.16271175|       0.0|
| 0.062175132|       0.0|
| 0.041910682|       0.0|
| 0.017778248|       0.0|
+------------+----------+
only showing top 20 rows



> Convert the test text

In [19]:
test_tokens = tokenizer.transform(test)
test_tf = hashingTF.transform(test_tokens)
test_tfidf = idfModel.transform(test_tf)

In [20]:
# Initialize the new DataFrame with the id column

test_res = test.select('id')
test_res.head()

Row(id='00001cee341fdb12')

> Make predictions for each class


In [21]:
test_probs = []
for col in out_cols:
    print(col)
    lr = LogisticRegression(featuresCol="features", labelCol=col, regParam=REG)
    print("...fitting")
    lrModel = lr.fit(tfidf)
    print("...predicting")
    res = lrModel.transform(test_tfidf)
    print("...appending result")
    test_res = test_res.join(res.select('id', 'probability'), on="id")
    print("...extracting probability")
    test_res = test_res.withColumn(col, extract_prob('probability')).drop("probability")
    test_res.show()

toxic
...fitting
...predicting
...appending result
...extracting probability
+----------------+------------+
|              id|       toxic|
+----------------+------------+
|000968ce11f5ee34|  0.04655437|
|00491682330fdd1d|3.6486778E-8|
|008eb47c4684d190|   0.6308229|
|00d251f47486b6d2|  0.06102414|
|0114ae82c53101a9|  0.43038085|
|012c7429c5a34290|  0.04933512|
|015017ec394a264e|   0.1144765|
|01d94c94a86a4327| 0.023757217|
|020eb3a1af28453f|  0.38700294|
|0216909e11cfeac0| 0.005305645|
|026460a698a91698| 0.016622331|
|027cc0ae6a33392e|  0.17309393|
|02a5d713614fad26| 0.110539496|
|02aabe84e138c05f| 0.064438604|
|02f2b8d194b06506| 0.084038936|
|02f60289932b3234|  0.17759308|
|0311a466e20edcb4|0.0075349645|
|034ea2a7c86e7e49| 0.018791143|
|039acb76708e55a6|  0.08233678|
|03c81594154c4651|0.0013409954|
+----------------+------------+
only showing top 20 rows

severe_toxic
...fitting
...predicting
...appending result
...extracting probability
+----------------+------------+------------+


In [22]:
# It is a little easier to construct and format the resulting data as a Pandas dataframe.
# On any spark dataframe in python, you can use the `.toPandas()` method to convert to a Pandas DataFrame.
test_res_pan = test_res.toPandas()

In [23]:
test_res_pan.head(10)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,000968ce11f5ee34,0.04655437,0.007900135,0.03559102,0.002523,0.02923,0.006218
1,00491682330fdd1d,3.648678e-08,9.672807e-07,2.616639e-08,0.000227,2e-06,8e-06
2,008eb47c4684d190,0.6308229,0.00130351,0.002481714,0.001068,0.01539,0.002071
3,00d251f47486b6d2,0.06102414,0.007802302,0.03433142,0.002326,0.036742,0.006709
4,0114ae82c53101a9,0.4303809,0.07279135,0.2954747,0.004269,0.195128,0.013183
5,012c7429c5a34290,0.04933512,0.008326263,0.02845525,0.002452,0.037624,0.006429
6,015017ec394a264e,0.1144765,0.009310584,0.05032201,0.002541,0.050656,0.007637
7,01d94c94a86a4327,0.02375722,0.002392501,0.004990644,0.00142,0.003113,0.002949
8,020eb3a1af28453f,0.3870029,0.01030645,0.1394373,0.002623,0.150463,0.007943
9,0216909e11cfeac0,0.005305645,0.002589205,0.003459482,0.001428,0.003425,0.005351


In [24]:
test_res.head(10)

[Row(id='000968ce11f5ee34', toxic=0.04655437171459198, severe_toxic=0.007900134660303593, obscene=0.03559102118015289, threat=0.002523174975067377, insult=0.02922980673611164, identity_hate=0.006218153052031994),
 Row(id='00491682330fdd1d', toxic=3.6486778043354207e-08, severe_toxic=9.672806982052862e-07, obscene=2.616638994368259e-08, threat=0.00022700187400914729, insult=1.90968535207503e-06, identity_hate=8.143211744027212e-06),
 Row(id='008eb47c4684d190', toxic=0.6308228969573975, severe_toxic=0.0013035100419074297, obscene=0.0024817143566906452, threat=0.0010678960243239999, insult=0.015390472486615181, identity_hate=0.0020708057563751936),
 Row(id='00d251f47486b6d2', toxic=0.06102414056658745, severe_toxic=0.0078023020178079605, obscene=0.034331418573856354, threat=0.002325755776837468, insult=0.036742307245731354, identity_hate=0.006709278095513582),
 Row(id='0114ae82c53101a9', toxic=0.43038085103034973, severe_toxic=0.07279135286808014, obscene=0.29547473788261414, threat=0.004