In [1]:
!pip install pyspark  --quiet

In [21]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Build the configuration BEFORE any session/context exists. A SparkContext is
# a per-process singleton: once one is running, getOrCreate(conf=...) returns
# it unchanged, so settings supplied afterwards (UI port, app name) are ignored.
conf = SparkConf().set('spark.ui.port', '4050').setAppName("twitter").setMaster("local[*]")

# Create (or reuse) the session with that configuration applied.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Low-level context backing the session; used for RDD operations.
sc = spark.sparkContext

In [22]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Location of the tweets CSV inside the mounted Drive.
url = "/content/drive/MyDrive/tweets.csv"

# Read the CSV: the first line holds the column names, and column types are
# inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(url)
)

In [5]:
# Show the column names, inferred types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- ItemID: integer (nullable = true)
 |-- Sentiment: string (nullable = true)
 |-- SentimentSource: string (nullable = true)
 |-- SentimentText: string (nullable = true)



In [6]:
# Persist the DataFrame as the table "tweets" so it can be queried with SQL.
# mode("overwrite") replaces an existing table; with the default mode the call
# raises if "tweets" already exists, which makes the cell fail when re-run.
df.write.mode("overwrite").saveAsTable("tweets")

In [8]:
# Number of rows per Sentiment label in the "tweets" table, printed as a table.
(
    spark
    .sql("select Sentiment, count(*) from tweets group By Sentiment")
    .show()
)

+---------+--------+
|Sentiment|count(1)|
+---------+--------+
|      pos|   26921|
|        0|       2|
|      neg|   23079|
+---------+--------+



In [10]:
from pyspark.sql import HiveContext
# Session handle used for SQL queries.
# NOTE(review): getOrCreate() returns the already-running session when one
# exists, so enableHiveSupport() may have no effect here — confirm whether Hive
# support is actually required.
hiveContext = SparkSession.builder.enableHiveSupport().getOrCreate()

# Text of every tweet labelled positive.
hiveQuery = "select SentimentText from tweets where Sentiment='pos'"

# Keep the result under its own name only. The previous chained assignment
# (`dfPos = df = ...`) also rebound `df`, replacing the full CSV DataFrame with
# this one-column subset and breaking any re-run of the earlier cells.
dfPos = hiveContext.sql(hiveQuery)

# RDD of Row objects (one field: SentimentText) for the RDD-based word count.
rddPos = dfPos.rdd

In [18]:
# Lower-case tokens excluded from the word-frequency count.
stopWords = [
    # pronouns
    'i', 'me', 'my', 'myself',
    'our', 'ours', 'ourselves',
    'you', 'your', 'yours',
    'he', 'him',
    # articles, prepositions, conjunctions and auxiliaries
    'the', 'to', 'a', 'and', 'it', 'for', 'is', 'in', 'of', 'that', 'on',
    'have', 'so',
    # other frequent tokens
    'good', "i'm", 'be', 'just', 'but', 'are',
]

In [19]:
# Word-frequency count over the positive tweets.
# Result: an RDD of (count, word) pairs sorted by count, highest first.

# Set-based lookup: constant-time membership test per token instead of a
# linear scan of the stop-word list.
stopWordSet = frozenset(stopWords)

contadorPalavras = (
    rddPos
    # SentimentText is nullable in the schema; skip rows without text instead
    # of failing with AttributeError when calling .replace on None.
    .filter(lambda row: row.SentimentText is not None)
    # Normalise: drop commas and hyphens, turn periods into spaces, lower-case.
    .map(lambda row: row.SentimentText.replace(",", "").replace("-", "").replace(".", " ").lower())
    # One record per whitespace-separated token.
    .flatMap(lambda text: text.split())
    .filter(lambda word: word not in stopWordSet)
    # Classic (word, 1) -> sum-per-word count.
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    # Swap to (count, word) so sortByKey orders by frequency; False = descending.
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)
)

In [20]:
# First 10 (count, word) pairs of the descending-sorted RDD, i.e. the 10 most frequent words.
contadorPalavras.take(10)

[(1650, 'love'),
 (1607, 'with'),
 (1509, 'was'),
 (1419, 'like'),
 (1262, 'not'),
 (1250, 'lol'),
 (1244, 'all'),
 (1231, 'thanks'),
 (1196, 'at'),
 (1195, 'get')]