In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import mllib

In [2]:
conf = SparkConf().setMaster("local").setAppName("ModelTrain")
sc = SparkContext(conf = conf)
spark = SparkSession(sc)

### Start Here

In [3]:
import re, string

In [4]:
#load sample data to train/test on

input = sc.textFile("./train.csv")
dataTrain = ( input.map(lambda x: (x.split('","')[0][1],x.split('","')[5])) # get sentiment and text
    .map(lambda x: (x[0],re.sub(r'@[^\s]+','',x[1])))  #remove mentions
    .map(lambda x: (x[0],re.sub(r"\S*http?:\S*",'',x[1])))  #remove urls
    .map(lambda x: (x[0],re.sub(r'[^A-Za-z0-9 ]','',x[1])))  #remove special
    .map(lambda x: (x[0],re.sub(r'[ ]{2,}',' ',x[1])))  #remove double+ spaces
    .map(lambda x: (x[0],x[1].strip().lower())) #strip space, force to lowercase
    .map(lambda x: (x[0],x[1]))
       )

input = sc.textFile("./test.csv")
dataTest = ( input.map(lambda x: (x.split('","')[0][1],x.split('","')[5])) # get sentiment and text
    .map(lambda x: (x[0],re.sub(r'@[^\s]+','',x[1])))  #remove mentions
    .map(lambda x: (x[0],re.sub(r"\S*http?:\S*",'',x[1])))  #remove urls
    .map(lambda x: (x[0],re.sub(r'[^A-Za-z0-9 ]','',x[1])))  #remove special
    .map(lambda x: (x[0],re.sub(r'[ ]{2,}',' ',x[1])))  #remove double+ spaces
    .map(lambda x: (x[0],x[1].strip().lower())) #strip space, force to lowercase
    .map(lambda x: (x[0],x[1]))
       )


In [5]:
#delete this one later...

df=dataTrain.toDF()
dfTrain=dataTrain.toDF()
dfTest=dataTest.toDF()

df.show(truncate=False)

+---+--------------------------------------------------------------------------------------------------------+
|_1 |_2                                                                                                      |
+---+--------------------------------------------------------------------------------------------------------+
|0  |awww thats a bummer you shoulda got david carr of third day to do it d                                  |
|0  |is upset that he cant update his facebook by texting it and might cry as a result school today also blah|
|0  |i dived many times for the ball managed to save 50 the rest go out of bounds                            |
|0  |my whole body feels itchy and like its on fire                                                          |
|0  |no its not behaving at all im mad why am i here because i cant see you all over there                   |
|0  |not the whole crew                                                                                      |
|

In [6]:
#load into DataFrame
df = dataTrain.toDF(['Sentiment','Text'])
dfTest = dataTrain.toDF(['Sentiment','Text'])
dfTrain = dataTrain.toDF(['Sentiment','Text'])

In [7]:
df.show(2)

+---------+--------------------+
|Sentiment|                Text|
+---------+--------------------+
|        0|awww thats a bumm...|
|        0|is upset that he ...|
+---------+--------------------+
only showing top 2 rows



## Natural Language Process

In [8]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer,IDF, HashingTF, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [9]:
# Assembler = VectorAssembler().transform(df).setInputCols('Text').setOutputCols('Sentiment')
# df = Assembler.transform(df)

In [10]:
df.show()

+---------+--------------------+
|Sentiment|                Text|
+---------+--------------------+
|        0|awww thats a bumm...|
|        0|is upset that he ...|
|        0|i dived many time...|
|        0|my whole body fee...|
|        0|no its not behavi...|
|        0|  not the whole crew|
|        0|          need a hug|
|        0|hey long time no ...|
|        0|nope they didnt h...|
|        0|        que me muera|
|        0|spring break in p...|
|        0|i just repierced ...|
|        0|i couldnt bear to...|
|        0|it it counts idk ...|
|        0|i wouldve been th...|
|        0|i wish i got to w...|
|        0|hollis death scen...|
|        0| about to file taxes|
|        0|ahh ive always wa...|
|        0|oh dear were you ...|
+---------+--------------------+
only showing top 20 rows



In [11]:
#Seperate into words, remove stop words
tokenizer = Tokenizer().setInputCol('Text').setOutputCol('Words')
remover = StopWordsRemover(inputCol='Words',outputCol='Clean')

df = tokenizer.transform(df)
df = remover.transform(df)

df.show()

+---------+--------------------+--------------------+--------------------+
|Sentiment|                Text|               Words|               Clean|
+---------+--------------------+--------------------+--------------------+
|        0|awww thats a bumm...|[awww, thats, a, ...|[awww, thats, bum...|
|        0|is upset that he ...|[is, upset, that,...|[upset, cant, upd...|
|        0|i dived many time...|[i, dived, many, ...|[dived, many, tim...|
|        0|my whole body fee...|[my, whole, body,...|[whole, body, fee...|
|        0|no its not behavi...|[no, its, not, be...|[behaving, im, ma...|
|        0|  not the whole crew|[not, the, whole,...|       [whole, crew]|
|        0|          need a hug|      [need, a, hug]|         [need, hug]|
|        0|hey long time no ...|[hey, long, time,...|[hey, long, time,...|
|        0|nope they didnt h...|[nope, they, didn...|       [nope, didnt]|
|        0|        que me muera|    [que, me, muera]|        [que, muera]|
|        0|spring break i

In [12]:
#TF-IDF (Apparently goes well with logistic regression)
TF = HashingTF(numFeatures=2**16,inputCol="Clean",outputCol='TF')
IDF = IDF(inputCol='TF',outputCol="features")
StringIndex = StringIndexer(inputCol="Sentiment",outputCol="label")

# df = TF.fit(df)
# df = IDF.fit(df)
df = StringIndex.fit(df)

In [13]:
from pyspark.ml.classification import LogisticRegression
Model = LogisticRegression(maxIter=300)
pipeline = Pipeline(stages=[tokenizer,remover,TF,IDF,StringIndex,Model])

In [20]:
pipelineFit = pipeline.fit(dfTrain)
trainDF = pipelineFit.transform(dfTrain)
#valDF = pipelineFit.transform(dfTest)

In [21]:
#trainDF.show(3,truncate=10)

### Evaluation

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

accuracy = trainDF.filter(trainDF.label == trainDF.prediction).count()/float(trainDF.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
AUC = evaluator.evaluate(trainDF)
print("Accuracy: {}\nAUC:{}".format(accuracy,AUC))

KeyboardInterrupt: 

Good enough for me!

### Store the Model

In [24]:
Model = Model.fit(dfTrain)
Predict = Model.transform(dfTest)

IllegalArgumentException: features does not exist. Available: Sentiment, Text

In [None]:
accuracy = valDF.filter(valDF.label == valDF.prediction).count()/float(valDF.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
AUC = evaluator.evaluate(valDF)
print("Accuracy: {}\nAUC: {}".format(accuracy,AUC))

In [None]:
Text = sc.textFile("./test.csv")

dataTest = ( 
    Text.map(lambda x: (x.split('","')[5])) # get sentiment and text
    .map(lambda x: (x[0],re.sub(r'@[^\s]+','',x[1])))  #remove mentions
    .map(lambda x: (x[0],re.sub(r"\S*http?:\S*",'',x[1])))  #remove urls
    .map(lambda x: (x[0],re.sub(r'[^A-Za-z0-9 ]','',x[1])))  #remove special
    .map(lambda x: (x[0],re.sub(r'[ ]{2,}',' ',x[1])))  #remove double+ spaces
    )

df = dataTest.toDF(['Text'])

In [None]:
Model = LogisticRegression(maxIter=300)
pipeline = Pipeline(stages=[tokenizer,remover,TF,IDF,Model])