In [1]:
# encoding: utf-8
import re
from textblob import TextBlob

import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import Row

def AppendSentiment(row):
    """Return a new Row holding the input fields plus sentiment scores.

    Parameters
    ----------
    row : Row
        Input record; its ``URL``, ``News``, ``Type`` and ``Data`` fields are read.

    Returns
    -------
    Row
        A fresh row with ``URL``, ``News``, ``Type``, ``Data``, ``Subjectivity``
        and ``Polarity``. Both scores stay ``None`` when ``News`` is ``None``.
    """
    # Read the field directly, matching the attribute access used below,
    # instead of materialising a whole dict just to pull one value out.
    news = row.News
    subj = None
    pol = None
    # Only score rows that actually carry text. `is not None` is the identity
    # check for None and cannot be affected by a custom __ne__.
    if news is not None:
        sent = GetSentiment(news)
        subj = sent.subjectivity
        pol = sent.polarity
    # Rows are immutable, so build a new Row carrying the extra columns.
    return Row(URL=row.URL, News=news, Type=row.Type, Data=row.Data, Subjectivity=subj, Polarity=pol)

def GetSentiment(text):
    """Clean ``text`` with CleanText and return the ``sentiment`` of the TextBlob built from it."""
    blob = TextBlob(CleanText(text))
    return blob.sentiment

def CleanText(text):
    """Strip noise from ``text`` and normalise its whitespace and apostrophes.

    Three kinds of match are replaced by a single space:
    an ``@`` followed by ASCII letters/digits, a character that is not an
    ASCII letter, digit, space or tab when it is immediately followed by a
    space, and a ``scheme://...`` run up to the next whitespace.
    Whitespace runs are then collapsed to single spaces (leading/trailing
    whitespace is dropped) and the right single quotation mark (U+2019) is
    replaced by a plain apostrophe.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string.
    """
    # Raw string so the regex escapes reach `re` untouched; in a normal string
    # literal they are invalid escape sequences that Python warns about.
    # NOTE(review): the second alternative requires a trailing space, so
    # punctuation inside a word or at the very end of the text is kept --
    # confirm this is intended.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)"
    stripped = re.sub(pattern, " ", text)
    # split()/join collapses every whitespace run into one space and trims the ends.
    collapsed = ' '.join(stripped.split())
    return collapsed.replace('\u2019', "'")


In [2]:
#Create spark session and SQLContext
conf = SparkConf().setMaster("local").setAppName("Sentiment Analysis")
# getOrCreate reuses a SparkContext that is already running in this kernel, so
# re-running this cell does not fail with "Cannot run multiple SparkContexts at once".
# On a fresh kernel it creates the context from `conf` exactly as before.
sc = SparkContext.getOrCreate(conf = conf)
sqc = SQLContext(sc)
spark = SparkSession(sc)

In [3]:
#Load archived data CSV (first line is a header, column types are inferred,
#quoted fields may span several lines)
# Forward slash instead of a backslash: a backslash is only a path separator on
# Windows, so the old literal named a different file elsewhere.
# NOTE(review): the path is relative to the kernel's working directory -- confirm
# the notebook is launched from the directory that contains DataCollection.
archive = (
    sqc.read.format('com.databricks.spark.csv')
    .options(header='true', inferschema='true', quote='\"', escape='\"', delimiter=',', multiLine='true')
    .load("DataCollection/archivednews.csv")
)

In [4]:
#Score every record with AppendSentiment, then put the columns back in the
#source order with the two sentiment columns last
new_arch = (
    archive.rdd
    .map(AppendSentiment)
    .toDF()
    .select("URL", "News","Type","Data","Polarity","Subjectivity")
)

In [5]:
#Write the scored dataframe to CSV from a single partition, replacing any previous output
# NOTE(review): Spark normally creates a directory at the save path rather than
# one flat file -- confirm downstream readers expect that layout.
(
    new_arch.repartition(1)
    .write.format("com.databricks.spark.csv")
    .mode('overwrite')
    .options(header='true', inferschema='true', quote='\"', escape='\"', delimiter=',', multiLine='true')
    .save("archivednews.csv")
)

print("Done")

Done
