In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import mllib

In [2]:
conf = SparkConf().setMaster("local").setAppName("ModelTrain")
sc = SparkContext(conf = conf)
spark = SparkSession(sc)

### Start Here

In [3]:
import re, string

In [4]:
#load sample data to train/test on

input = sc.textFile("./train.csv")
dataTrain = ( input.map(lambda x: (x.split('","')[0][1],x.split('","')[5])) # get sentiment and text
    .map(lambda x: (x[0],re.sub(r'@[^\s]+','',x[1])))  #remove mentions
    .map(lambda x: (x[0],re.sub(r"\S*http?:\S*",'',x[1])))  #remove urls
    .map(lambda x: (x[0],re.sub(r'[^A-Za-z0-9 ]','',x[1])))  #remove special
    .map(lambda x: (x[0],re.sub(r'[ ]{2,}',' ',x[1])))  #remove double+ spaces
    .map(lambda x: (x[0],x[1].strip().lower())) #strip space, force to lowercase
    .map(lambda x: (x[0],x[1]))
       )

input = sc.textFile("./test.csv")
dataTest = ( input.map(lambda x: (x.split('","')[0][1],x.split('","')[5])) # get sentiment and text
    .map(lambda x: (x[0],re.sub(r'@[^\s]+','',x[1])))  #remove mentions
    .map(lambda x: (x[0],re.sub(r"\S*http?:\S*",'',x[1])))  #remove urls
    .map(lambda x: (x[0],re.sub(r'[^A-Za-z0-9 ]','',x[1])))  #remove special
    .map(lambda x: (x[0],re.sub(r'[ ]{2,}',' ',x[1])))  #remove double+ spaces
    .map(lambda x: (x[0],x[1].strip().lower())) #strip space, force to lowercase
    .map(lambda x: (x[0],x[1]))
       )


In [5]:
#delete this one later...

df=dataTrain.toDF()
dfTrain=dataTrain.toDF()
dfTest=dataTest.toDF()

df.show(truncate=False)

+---+--------------------------------------------------------------------------------------------------------+
|_1 |_2                                                                                                      |
+---+--------------------------------------------------------------------------------------------------------+
|0  |awww thats a bummer you shoulda got david carr of third day to do it d                                  |
|0  |is upset that he cant update his facebook by texting it and might cry as a result school today also blah|
|0  |i dived many times for the ball managed to save 50 the rest go out of bounds                            |
|0  |my whole body feels itchy and like its on fire                                                          |
|0  |no its not behaving at all im mad why am i here because i cant see you all over there                   |
|0  |not the whole crew                                                                                      |
|

In [6]:
#load into DataFrame
df = dataTrain.toDF(['Sentiment','Text'])
dfTest = dataTrain.toDF(['Sentiment','Text'])
dfTrain = dataTrain.toDF(['Sentiment','Text'])

In [7]:
df.show(2)

+---------+--------------------+
|Sentiment|                Text|
+---------+--------------------+
|        0|awww thats a bumm...|
|        0|is upset that he ...|
+---------+--------------------+
only showing top 2 rows



## Natural Language Process

In [8]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, HashingTF, StringIndexer
from pyspark.ml import Pipeline

In [9]:
#Seperate into words, remove stop words
tokenizer = Tokenizer().setInputCol('Text').setOutputCol('Words')
remover = StopWordsRemover(inputCol='Words',outputCol='Clean')

df = tokenizer.transform(df)
df = remover.transform(df)

In [11]:
#TF-IDF (Apparently goes well with logistic regression)
TF = HashingTF(numFeatures=2**16,inputCol="Clean",outputCol='TF')
IDF = IDF(inputCol='TF',outputCol="features")
StringIndex = StringIndexer(inputCol="Sentiment",outputCol="label")

In [12]:
from pyspark.ml.classification import LogisticRegression
Model = LogisticRegression(maxIter=300)
pipeline = Pipeline(stages=[tokenizer,remover,TF,IDF,StringIndex,Model])

In [13]:
pipelineFit = pipeline.fit(dfTrain)
trainDF = pipelineFit.transform(dfTrain)
valDF = pipelineFit.transform(dfTest)

In [14]:
trainDF.show(3,truncate=10)

+---------+----------+----------+----------+----------+----------+-----+-------------+-----------+----------+
|Sentiment|      Text|     Words|     Clean|        TF|  features|label|rawPrediction|probability|prediction|
+---------+----------+----------+----------+----------+----------+-----+-------------+-----------+----------+
|        0|awww th...|[awww, ...|[awww, ...|(65536,...|(65536,...|  0.0|   [3.0673...| [0.9555...|       0.0|
|        0|is upse...|[is, up...|[upset,...|(65536,...|(65536,...|  0.0|   [5.3398...| [0.9952...|       0.0|
|        0|i dived...|[i, div...|[dived,...|(65536,...|(65536,...|  0.0|   [1.7537...| [0.8524...|       0.0|
+---------+----------+----------+----------+----------+----------+-----+-------------+-----------+----------+
only showing top 3 rows



### Evaluation

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

accuracy = trainDF.filter(trainDF.label == trainDF.prediction).count()/float(trainDF.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
AUC = evaluator.evaluate(trainDF)
print("Accuracy: {}\nAUC:{}".format(accuracy,AUC))

Accuracy: 0.795586875
AUC:0.8721028877101562


Good enough for me!

In [24]:
accuracy = valDF.filter(valDF.label == valDF.prediction).count()/float(valDF.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
AUC = evaluator.evaluate(valDF)
print("Accuracy: {}\nAUC:{}".format(accuracy,AUC))

Accuracy: 0.795586875
AUC:0.872101918366406
