In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
%cd /content/drive/MyDrive/Spark

/content/drive/MyDrive/Spark


In [8]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [9]:
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

In [10]:
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

In [11]:
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [12]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/drive/MyDrive/Spark/spark-3.0.0-bin-hadoop3.2"

In [13]:
import findspark
findspark.init()

In [14]:
from pyspark.sql import *

spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import trim, to_date, year, month

In [15]:
from pyspark import SparkContext

sc=SparkContext.getOrCreate()

In [16]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [17]:
df = spark.read.format('csv').option("header",True).load('find_text.csv')

In [19]:
df.show(5)

+--------------------+----+
|              id_str|text|
+--------------------+----+
|'793270689780203520'|null|
|'793281386912354304'|null|
|'793299404975247360'|null|
|'793301295255945216'|null|
|'793315815411978240'|null|
+--------------------+----+
only showing top 5 rows



In [20]:
df=spark.read.format("csv").option("multiLine", True).option("header", True).option("escape", "\"").load("Amazon_Responded_Oct05.csv") #Loading Amazon_response file on to spark

In [24]:
df.show() #displaying top 20 rows

+--------------------+--------------------+----------------+-----------+-------------------+---------------------+--------------+-----------------+--------------+--------------------+-------------+-------------+--------------------+------------------+--------------------+--------------+--------------------+--------------+---------+-----------------------+-------------------------+-----------------------+-------------+---------+--------------------+
|              id_str|    tweet_created_at|user_screen_name|user_id_str|user_statuses_count|user_favourites_count|user_protected|user_listed_count|user_following|    user_description|user_location|user_verified|user_followers_count|user_friends_count|     user_created_at|tweet_language|               text_|favorite_count|favorited|in_reply_to_screen_name|in_reply_to_status_id_str|in_reply_to_user_id_str|retweet_count|retweeted|               text|
+--------------------+--------------------+----------------+-----------+-------------------+--

In [27]:
print("Total number of records:")
df.count()

Total number of records:


462030

In [50]:
###################################################################################################################################
# TASK 1 
###################################################################################################################################

#selecting required columns
df=df.select('id_str','tweet_created_at','user_verified','favorite_count','retweet_count','text_') 
df.show()

+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|                   |                null|         null|          null|         null|                null|
|'793270689780203520'|Tue Nov 01 01:57:...|        False|             0|            0|@AmazonHelp Can y...|
|'793281386912354304'|Tue Nov 01 02:39:...|         True|             0|            0|@SeanEPanjab I'm ...|
|'793501578766319616'|Tue Nov 01 17:14:...|        False|             0|            0|@AmazonHelp It wa...|
|'793501657346682880'|Tue Nov 01 17:15:...|        False|             0|            0|@AmazonHelp I am ...|
|'793502854459879424'|Tue Nov 01 17:19:...|         True|             0|            0|@SeanEPanjab Plea...|
|'793504235400884224'|Tue No

In [51]:
# Removing rows with all fields empty
df_c = df.dropna(how='all', thresh=2) # Since blank space in 'text' col is not being considered as NA, we'll use thresh argument
df_c.show()

+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|'793270689780203520'|Tue Nov 01 01:57:...|        False|             0|            0|@AmazonHelp Can y...|
|'793281386912354304'|Tue Nov 01 02:39:...|         True|             0|            0|@SeanEPanjab I'm ...|
|'793501578766319616'|Tue Nov 01 17:14:...|        False|             0|            0|@AmazonHelp It wa...|
|'793501657346682880'|Tue Nov 01 17:15:...|        False|             0|            0|@AmazonHelp I am ...|
|'793502854459879424'|Tue Nov 01 17:19:...|         True|             0|            0|@SeanEPanjab Plea...|
|'793504235400884224'|Tue Nov 01 17:25:...|         True|             0|            0|@SeanEPanjab With...|
|'793511847899070465'|Tue No

In [52]:
## STEP 1 (Removing Records with "user_verified" = FALSE)

df_t = df_c.filter(df_c['user_verified']=='True')
print("Top 10 rows of new DF:")
df_t.show(10)
print("Total number of records which have 'user_verfied' as True : ", df_t.count())

Top 10 rows of new DF:
+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+
|'793281386912354304'|Tue Nov 01 02:39:...|         True|             0|            0|@SeanEPanjab I'm ...|
|'793502854459879424'|Tue Nov 01 17:19:...|         True|             0|            0|@SeanEPanjab Plea...|
|'793504235400884224'|Tue Nov 01 17:25:...|         True|             0|            0|@SeanEPanjab With...|
|'793513446633533440'|Tue Nov 01 18:02:...|         True|             0|            0|@SeanEPanjab I'm ...|
|'793301295255945216'|Tue Nov 01 03:59:...|         True|             0|            0|@aakashwangnoo Hi...|
|'793423313674571776'|Tue Nov 01 12:03:...|         True|             0|            0|@aakashwangnoo Hi...|
|'793

In [53]:
## STEP 2 (Creating 'Date' column to perform grouping)

from pyspark.sql.functions import substring
from pyspark.sql.functions import trim

df_t=df_t.withColumn("date", substring(trim('tweet_created_at'),4,7)) #creating new columns using substring from 'tweet_created_at' column
df_t.show()

+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|   date|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+
|'793281386912354304'|Tue Nov 01 02:39:...|         True|             0|            0|@SeanEPanjab I'm ...| Nov 01|
|'793502854459879424'|Tue Nov 01 17:19:...|         True|             0|            0|@SeanEPanjab Plea...| Nov 01|
|'793504235400884224'|Tue Nov 01 17:25:...|         True|             0|            0|@SeanEPanjab With...| Nov 01|
|'793513446633533440'|Tue Nov 01 18:02:...|         True|             0|            0|@SeanEPanjab I'm ...| Nov 01|
|'793301295255945216'|Tue Nov 01 03:59:...|         True|             0|            0|@aakashwangnoo Hi...| Nov 01|
|'793423313674571776'|Tue Nov 01 12:03:...|         True|             0|

In [58]:
## STEP 2 (Counting no. tweets in each date by grouping)

#Grouping tweets by the newly created 'date' column
df_tweets_date=df_t.groupby('date').count()

#Sorting in descending order.
from pyspark.sql.functions import desc
df_tweets_date.orderBy(desc("count")).show()

+-------+-----+
|   date|count|
+-------+-----+
| Jan 03| 1536|
| Jan 10| 1508|
| Jan 11| 1496|
| Jan 12| 1410|
| Jan 06| 1364|
| Jan 07| 1360|
| Jan 20| 1342|
| Mar 02| 1298|
| Jan 13| 1295|
| Jan 21| 1292|
| Jan 14| 1290|
| Jan 18| 1286|
| Dec 15| 1279|
| Jan 24| 1259|
| Nov 18| 1249|
| Dec 03| 1201|
| Jan 02| 1196|
| Jun 27| 1192|
| Jul 04| 1190|
| Jan 19| 1175|
+-------+-----+
only showing top 20 rows



In [57]:
# We can observe that 'Jan 03' has the maximum no. tweets in a day, followed by Jan 10.

In [61]:
## STEP 3

# Creating new column to store sum of favorite and retweet for each tweets
from pyspark.sql.functions import expr
df_sum_count = df_t.withColumn('sum_favorite_retweet',expr("favorite_count+retweet_count"))
df_sum_count.show(5)

+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+--------------------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|   date|sum_favorite_retweet|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+--------------------+
|'793281386912354304'|Tue Nov 01 02:39:...|         True|             0|            0|@SeanEPanjab I'm ...| Nov 01|                 0.0|
|'793502854459879424'|Tue Nov 01 17:19:...|         True|             0|            0|@SeanEPanjab Plea...| Nov 01|                 0.0|
|'793504235400884224'|Tue Nov 01 17:25:...|         True|             0|            0|@SeanEPanjab With...| Nov 01|                 0.0|
|'793513446633533440'|Tue Nov 01 18:02:...|         True|             0|            0|@SeanEPanjab I'm ...| Nov 01|                 0.0|
|'793301295255945216'|Tue Nov 01 03:59:..

In [94]:
df_100 =df_sum_count.filter(trim(df_sum_count.date) =='Jan 03').orderBy(desc("sum_favorite_retweet")).limit(100)
df_100.show(10)

+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+--------------------+
|              id_str|    tweet_created_at|user_verified|favorite_count|retweet_count|               text_|   date|sum_favorite_retweet|
+--------------------+--------------------+-------------+--------------+-------------+--------------------+-------+--------------------+
|'816329761530093568'|Tue Jan 03 17:05:...|         True|             4|            1|@amazon worst sho...| Jan 03|                 5.0|
|'816083406962434048'|Tue Jan 03 00:47:...|         True|             2|            1|@ItsJosshA We alw...| Jan 03|                 3.0|
|'816086117938319360'|Tue Jan 03 00:57:...|         True|             2|            0|@ItsJosshA Oh no!...| Jan 03|                 2.0|
|'816095108013654017'|Tue Jan 03 01:33:...|         True|             1|            1|@KStefl Sounds li...| Jan 03|                 2.0|
|'816109446069911554'|Tue Jan 03 02:30:..

In [95]:
df_100.show(20,False) # to view full text

+--------------------+------------------------------+-------------+--------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------------+
|id_str              |tweet_created_at              |user_verified|favorite_count|retweet_count|text_                                                                                                                                       |date   |sum_favorite_retweet|
+--------------------+------------------------------+-------------+--------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------------+
|'816329761530093568'|Tue Jan 03 17:05:56 +0000 2017|True         |4             |1            |@amazon worst shopping  experience,  no service, no substantial reply to complaints, no delivery for 1 

In [96]:
# Cleaning text field

from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import lower

df_100=df_100.withColumn("text",regexp_replace('text_', '@[A-Za-z0-9_]+', ''))
df_100=df_100.withColumn("text",regexp_replace('text', '#[A-Za-z0-9_]+', ''))
df_100=df_100.withColumn("text",regexp_replace("text", "\'", ""))
df_100=df_100.withColumn("text",regexp_replace('text', 'http\S+', ''))
df_100=df_100.withColumn("text",regexp_replace('text', '[()!?]', ''))
df_100=df_100.withColumn("text",regexp_replace('text', '\[.*?\]', ''))
df_100=df_100.withColumn("text",regexp_replace('text', '[$#,:^.;-]', ''))

df_100=df_100.withColumn("text",regexp_replace("text", "[^A-Za-z0-9 ]+", ""))

In [97]:
df_100.show(10,False)

+--------------------+------------------------------+-------------+--------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------------+--------------------------------------------------------------------------------------------------------------------------+
|id_str              |tweet_created_at              |user_verified|favorite_count|retweet_count|text_                                                                                                                                       |date   |sum_favorite_retweet|text                                                                                                                      |
+--------------------+------------------------------+-------------+--------------+-------------+----------------------------------------------------------------------------------------------------------------------------

In [98]:
# Text field after cleaning

df_100=df_100.select('date','sum_favorite_retweet',lower('text').alias('text'))
df_100.show(20,False)

+-------+--------------------+--------------------------------------------------------------------------------------------------------------------------+
|date   |sum_favorite_retweet|text                                                                                                                      |
+-------+--------------------+--------------------------------------------------------------------------------------------------------------------------+
| Jan 03|5.0                 | worst shopping  experience  no service no substantial reply to complaints no delivery for 1 week post guarantee date     |
| Jan 03|3.0                 | we always aim to deliver by the date given in your confirmation email have we missed that date any update in tracking  nf|
| Jan 03|2.0                 | oh no im sorry please reach out to us so that we can look into options  jo                                               |
| Jan 03|2.0                 | sounds like you know what to add to your hall

In [129]:
# Writting the text from top 100 tweet to .txt file

df_100.select('text').write.csv('Top_100',header=True)

In [130]:
from pathlib import Path

entries = Path('/content/drive/MyDrive/Spark/Top_100')
for entry in entries.iterdir():
    print(entry.name)

part-00000-83717e3e-53da-47fc-ab2e-78a8efddaaec-c000.csv
.part-00000-83717e3e-53da-47fc-ab2e-78a8efddaaec-c000.csv.crc
_SUCCESS
._SUCCESS.crc


In [131]:
# Counting frequency of words from the collected top 100 tweets

txt = sc.textFile("/content/drive/MyDrive/Spark/Top_100/part-00000-83717e3e-53da-47fc-ab2e-78a8efddaaec-c000.csv")
print(txt)

/content/drive/MyDrive/Spark/Top_100/part-00000-83717e3e-53da-47fc-ab2e-78a8efddaaec-c000.csv MapPartitionsRDD[466] at textFile at NativeMethodAccessorImpl.java:0


In [134]:
words = txt.flatMap(lambda line: line.split()).map(lambda word: (word, 1))
print(words)

PythonRDD[468] at RDD at PythonRDD.scala:53


In [135]:
for i in words.collect():
  print(i)

('text', 1)
('worst', 1)
('shopping', 1)
('experience', 1)
('no', 1)
('service', 1)
('no', 1)
('substantial', 1)
('reply', 1)
('to', 1)
('complaints', 1)
('no', 1)
('delivery', 1)
('for', 1)
('1', 1)
('week', 1)
('post', 1)
('guarantee', 1)
('date', 1)
('we', 1)
('always', 1)
('aim', 1)
('to', 1)
('deliver', 1)
('by', 1)
('the', 1)
('date', 1)
('given', 1)
('in', 1)
('your', 1)
('confirmation', 1)
('email', 1)
('have', 1)
('we', 1)
('missed', 1)
('that', 1)
('date', 1)
('any', 1)
('update', 1)
('in', 1)
('tracking', 1)
('nf', 1)
('oh', 1)
('no', 1)
('im', 1)
('sorry', 1)
('please', 1)
('reach', 1)
('out', 1)
('to', 1)
('us', 1)
('so', 1)
('that', 1)
('we', 1)
('can', 1)
('look', 1)
('into', 1)
('options', 1)
('jo', 1)
('sounds', 1)
('like', 1)
('you', 1)
('know', 1)
('what', 1)
('to', 1)
('add', 1)
('to', 1)
('your', 1)
('halloween', 1)
('playlist', 1)
('for', 1)
('this', 1)
('year', 1)
('bv', 1)
('happy', 1)
('birthday', 1)
('matt', 1)
('jo', 1)
('you', 1)
('so', 1)
('fancy', 1)
('you

In [136]:
count = words.reduceByKey(lambda a, b: a + b)
print(count)

PythonRDD[473] at RDD at PythonRDD.scala:53


In [137]:
# getting count of words

word_count = count.collect()
word_count

[('worst', 1),
 ('no', 5),
 ('service', 1),
 ('substantial', 1),
 ('delivery', 6),
 ('1', 1),
 ('week', 1),
 ('we', 23),
 ('always', 3),
 ('in', 12),
 ('confirmation', 1),
 ('have', 18),
 ('update', 2),
 ('tracking', 5),
 ('oh', 1),
 ('im', 13),
 ('out', 6),
 ('us', 28),
 ('look', 7),
 ('into', 6),
 ('options', 3),
 ('sounds', 1),
 ('like', 7),
 ('know', 13),
 ('playlist', 1),
 ('this', 27),
 ('year', 2),
 ('bv', 2),
 ('matt', 1),
 ('fancy', 1),
 ('already', 1),
 ('20', 1),
 ('pass', 2),
 ('details', 6),
 ('lil', 1),
 ('wed', 5),
 ('help', 8),
 ('when', 5),
 ('connect', 1),
 ('let', 7),
 ('arrive', 6),
 ('tomorrow', 2),
 ('hear', 4),
 ('as', 4),
 ('apologies', 2),
 ('incorrect', 1),
 ('item', 5),
 ('using', 2),
 ('above', 2),
 ('do', 4),
 ('further', 5),
 ('concerns', 1),
 ('ca', 1),
 ('heres', 1),
 ('now', 3),
 ('identify', 1),
 ('is', 13),
 ('amazon', 6),
 ('thanks', 11),
 ('shout', 1),
 ('was', 5),
 ('sent', 1),
 ('fulfilled', 1),
 ('site', 3),
 ('an', 7),
 ('ar', 3),
 ('looking', 2

In [138]:
# printing count of words in descending order
word_count.sort(key=lambda x:x[1], reverse=True)
print(word_count)

[('the', 65), ('to', 64), ('you', 60), ('for', 50), ('your', 33), ('here', 32), ('us', 28), ('this', 27), ('can', 26), ('a', 24), ('we', 23), ('if', 22), ('sorry', 21), ('please', 20), ('have', 18), ('that', 18), ('so', 18), ('and', 17), ('it', 17), ('on', 14), ('hi', 14), ('im', 13), ('know', 13), ('is', 13), ('in', 12), ('thanks', 11), ('contact', 11), ('hey', 10), ('our', 10), ('order', 10), ('i', 9), ('experience', 9), ('well', 9), ('were', 9), ('help', 8), ('by', 8), ('any', 8), ('be', 8), ('look', 7), ('like', 7), ('let', 7), ('an', 7), ('feedback', 7), ('keep', 7), ('delivery', 6), ('out', 6), ('into', 6), ('details', 6), ('arrive', 6), ('amazon', 6), ('sj', 6), ('at', 6), ('jo', 6), ('with', 6), ('posted', 6), ('link', 6), ('sh', 6), ('app', 6), ('will', 6), ('no', 5), ('tracking', 5), ('wed', 5), ('when', 5), ('item', 5), ('further', 5), ('was', 5), ('date', 5), ('email', 5), ('what', 5), ('happy', 5), ('dont', 5), ('doesnt', 5), ('seller', 5), ('hear', 4), ('as', 4), ('do', 4

In [139]:
# Converting count of words to DF and writting to local file as .csv

import pandas as pd
df_word_count = pd.DataFrame(word_count, columns =['Word', 'Count'])
df_word_count.to_csv('count_of_words.csv')
df_word_count

Unnamed: 0,Word,Count
0,the,65
1,to,64
2,you,60
3,for,50
4,your,33
...,...,...
527,ive,1
528,had,1
529,internally,1
530,patience,1


In [145]:
############################################################################################################################
# TASK 2
###########################################################################################################################

df_ft =spark.read.format("csv").option("header",True).load("find_text.csv")
print('No. of rows in find_text:',df_ft.count())
df_ft.show(10)

No. of rows in find_text: 53927
+--------------------+----+
|              id_str|text|
+--------------------+----+
|'793270689780203520'|null|
|'793281386912354304'|null|
|'793299404975247360'|null|
|'793301295255945216'|null|
|'793315815411978240'|null|
|'793322306848292864'|null|
|'793322433625415680'|null|
|'793365409047023616'|null|
|'793369654878232577'|null|
|'793375905280393216'|null|
+--------------------+----+
only showing top 10 rows



In [148]:
# We need to fill in the 'text' column

df.createOrReplaceTempView("amazon_tweets")
df_ft.createOrReplaceTempView("to_find")
df_final=spark.sql("select f.id_str,text_ from amazon_tweets t inner join to_find f on t.id_str=f.id_str")
df_final.show(10)

+--------------------+--------------------+
|              id_str|               text_|
+--------------------+--------------------+
|'793270689780203520'|@AmazonHelp Can y...|
|'793281386912354304'|@SeanEPanjab I'm ...|
|'793501578766319616'|@AmazonHelp It wa...|
|'793501657346682880'|@AmazonHelp I am ...|
|'793502854459879424'|@SeanEPanjab Plea...|
|'793504235400884224'|@SeanEPanjab With...|
|'793511847899070465'|@AmazonHelp It wa...|
|'793511899279208449'|@AmazonHelp if it...|
|'793513446633533440'|@SeanEPanjab I'm ...|
|'793299404975247360'|@JeffBezos @amazo...|
+--------------------+--------------------+
only showing top 10 rows



In [150]:
# writting the results to local file
df_final.write.csv('find_text_filled',header=True)