## Import libraries

In [1]:
import os
import pandas as pd

from pyspark.sql.types import StructType, StructField, StringType

## Initialization

In [2]:
# Root directory of the toxic-comment data set. Every input ("original/...")
# and output ("parquet/...") location in this notebook is resolved against it.
# The location is machine-specific, so it can be overridden by exporting
# TOXIC_COMMENT_DATA_DIR before starting the kernel; when the variable is not
# set, the previous hard-coded location is used.
path = os.environ.get(
    "TOXIC_COMMENT_DATA_DIR",
    "/home/ubuntu/Documents/toxic_comment/data",
)

## Train 

In [3]:
# Parse the training CSV with pandas first, then hand the resulting frame to
# Spark so the rest of the notebook works on a Spark DataFrame.
train_csv = os.path.join(path, "original/train.csv")
pd_train = pd.read_csv(train_csv, sep=",")
spark_train = sqlContext.createDataFrame(pd_train)

In [4]:
# Persist the training set as Parquet under <path>/parquet/train.
# The writer's default save mode raises an error when the target directory
# already exists, which made this cell fail on any second run (including
# Restart & Run All). Overwrite mode keeps the cell re-runnable.
spark_train.write.mode("overwrite").parquet(os.path.join(path, "parquet/train"))

In [5]:
# Number of rows in the training DataFrame.
spark_train.count()

159571

In [6]:
# Print the column names, types and nullability of the training DataFrame
# (the schema was inferred by Spark from the pandas frame).
spark_train.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: long (nullable = true)
 |-- severe_toxic: long (nullable = true)
 |-- obscene: long (nullable = true)
 |-- threat: long (nullable = true)
 |-- insult: long (nullable = true)
 |-- identity_hate: long (nullable = true)



In [7]:
# Peek at the first training row, restricted to the id and comment_text columns.
spark_train.select("id", "comment_text").first()

Row(id=u'0000997932d777bf', comment_text=u"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27")

In [8]:
# Preview the first 5 rows of every column except id and comment_text,
# i.e. the label columns of the training set.
spark_train.drop("id", "comment_text").show(5)

+-----+------------+-------+------+------+-------------+
|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+-----+------------+-------+------+------+-------------+
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
+-----+------------+-------+------+------+-------------+
only showing top 5 rows



## Test

In [9]:
# Parse the test CSV with pandas first, then convert the frame into a Spark
# DataFrame.
test_csv = os.path.join(path, "original/test.csv")
pd_test = pd.read_csv(test_csv, sep=",")
spark_test = sqlContext.createDataFrame(pd_test)

In [10]:
# Number of rows in the test DataFrame.
spark_test.count()

153164

In [11]:
# Print the column names, types and nullability of the test DataFrame.
spark_test.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)



In [12]:
# Peek at the first test row (id and comment_text).
spark_test.select("id", "comment_text").first()

Row(id=u'00001cee341fdb12', comment_text=u"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,")

In [13]:
# NOTE(review): the test DataFrame only has the id and comment_text columns,
# so dropping both leaves a frame with no columns and this preview renders an
# empty table. Kept for symmetry with the training section.
spark_test.drop("id", "comment_text").show(5)

++
||
++
||
||
||
||
||
++
only showing top 5 rows



## Test Labels

In [14]:
# Parse the test-labels CSV with pandas first, then convert the frame into a
# Spark DataFrame.
test_labels_csv = os.path.join(path, "original/test_labels.csv")
pd_test_labels = pd.read_csv(test_labels_csv, sep=",")
spark_test_labels = sqlContext.createDataFrame(pd_test_labels)

In [15]:
# Number of rows in the test-labels DataFrame.
spark_test_labels.count()

153164

In [16]:
# Print the column names, types and nullability of the test-labels DataFrame.
spark_test_labels.printSchema()

root
 |-- id: string (nullable = true)
 |-- toxic: long (nullable = true)
 |-- severe_toxic: long (nullable = true)
 |-- obscene: long (nullable = true)
 |-- threat: long (nullable = true)
 |-- insult: long (nullable = true)
 |-- identity_hate: long (nullable = true)



In [17]:
# Preview the first 5 rows of the test labels.
# NOTE(review): the previewed rows carry -1 in every label column; presumably
# a placeholder for unlabelled rows. Confirm against the data set description.
spark_test_labels.show(5)

+----------------+-----+------------+-------+------+------+-------------+
|              id|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+-----+------------+-------+------+------+-------------+
|00001cee341fdb12|   -1|          -1|     -1|    -1|    -1|           -1|
|0000247867823ef7|   -1|          -1|     -1|    -1|    -1|           -1|
|00013b17ad220c46|   -1|          -1|     -1|    -1|    -1|           -1|
|00017563c3f7919a|   -1|          -1|     -1|    -1|    -1|           -1|
|00017695ad8997eb|   -1|          -1|     -1|    -1|    -1|           -1|
+----------------+-----+------------+-------+------+------+-------------+
only showing top 5 rows



## Sample Submission

In [18]:
# Parse the sample-submission CSV with pandas first, then convert the frame
# into a Spark DataFrame.
sample_submission_csv = os.path.join(path, "original/sample_submission.csv")
pd_sample_submission = pd.read_csv(sample_submission_csv, sep=",")
spark_sample_submission = sqlContext.createDataFrame(pd_sample_submission)

In [19]:
# Number of rows in the sample-submission DataFrame.
spark_sample_submission.count()

153164

In [20]:
# Print the column names, types and nullability of the sample-submission
# DataFrame.
spark_sample_submission.printSchema()

root
 |-- id: string (nullable = true)
 |-- toxic: double (nullable = true)
 |-- severe_toxic: double (nullable = true)
 |-- obscene: double (nullable = true)
 |-- threat: double (nullable = true)
 |-- insult: double (nullable = true)
 |-- identity_hate: double (nullable = true)



In [21]:
# Preview the sample submission. No row count is passed here, so show() uses
# its default limit rather than the 5 rows requested by the other previews.
spark_sample_submission.show()

+----------------+-----+------------+-------+------+------+-------------+
|              id|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+-----+------------+-------+------+------+-------------+
|00001cee341fdb12|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|0000247867823ef7|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00013b17ad220c46|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00017563c3f7919a|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00017695ad8997eb|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|0001ea8717f6de06|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00024115d4cbde0f|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|000247e83dcc1211|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00025358d4737918|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|00026d1092fe71cc|  0.5|         0.5|    0.5|   0.5|   0.5|          0.5|
|0002eadc3b301559|  0.5|         0.5| 