#Installing pyspark

In [4]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 47.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=7f63248bf817c0eddb3eea9dc15845e97848613066fedb052b4cdc59b11371c3
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


#Importing Required Libararies

In [31]:
#importing pyspark library
import pyspark as ps
#preprocesing text
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import regexp_extract

from pyspark.sql import Row


#Setting configaration and creating sql context object 

In [6]:

conf = ps.SparkConf().setAll([('spark.executor.memory', '16g'), ('spark.driver.memory', '16g')])
sc = ps.SparkContext(conf=conf)
from pyspark.sql import SQLContext

sql_context = SQLContext(sc)



#Uploading the tweets file

In [16]:
#setting data set path
file="OutputStreaming.csv"

In [17]:
#reading csv with headers
tweets = sql_context.read.format('com.databricks.spark.csv').option('header','true').option("delimiter", ",").load(file)

In [18]:
#tweets count
print(tweets.count())

71465


In [19]:
#display complete tweet first 100
tweets.show(100,truncate=False)


+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+
|text                                                                                                                                                                                                                                                                    |_c1 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+
|2022-07-19-22:18:52;ともいきの国伊勢忍者キングダム  ----------------------- #みんなで伊勢を良くし本気で日本と世界を変える人達が集まる株式会社 # みんなで大家さん;;7;parkkttt;56                                                               

In [20]:
#dropping unwanted columns
tweets=tweets.drop("_c1")


In [22]:
tweets.createOrReplaceTempView("tweets")

In [23]:
tweets.show()

+--------------------+
|                text|
+--------------------+
|2022-07-19-22:18:...|
|"2022-07-19-22:18...|
|2022-07-19-22:18:...|
|2022-07-19-22:18:...|
|2022-07-19-22:18:...|
|2022-07-19-22:18:...|
|2022-07-19-22:18:...|
|2022-07-19-22:18:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
|2022-07-19-22:19:...|
+--------------------+
only showing top 20 rows



In [25]:
print(tweets.count())



71465


# Extracting the URLs from the tweets

In [26]:
urls = tweets.withColumn("text", regexp_extract( "text", "(?:\http?\://|https?\://|www)\S+", 0))
urls = urls.withColumn("text", regexp_replace("text", " ", ""))
urls.na.drop("all")
urls.show()

+--------------------+
|                text|
+--------------------+
|                    |
|                    |
|                    |
|                    |
|                    |
|https://t.co/6aRY...|
|                    |
|                    |
|https://t.co/UMRL...|
|https://t.co/rZYU...|
|https://t.co/ISMb...|
|                    |
|                    |
|                    |
|https://t.co/CP9e...|
|https://t.co/uEav...|
|                    |
|https://t.co/uEav...|
|                    |
|https://t.co/uEav...|
+--------------------+
only showing top 20 rows



# Extracting Hastags from the tweets

In [27]:
hashtags =  tweets.withColumn("text",  regexp_extract( "text", "#(\w+)",0))
hashtags = hashtags.withColumn("text", regexp_replace("text", " ", ""))
hashtags.dropna()
hashtags.filter(hashtags.text.isNotNull()).collect()
hashtags.show()

+------------+
|        text|
+------------+
|            |
|      #Tigra|
|            |
|       #BAYC|
|            |
|            |
|            |
|            |
|            |
|  #ondeugend|
|            |
|            |
|            |
|#NFTGiveaway|
|            |
|            |
|            |
|            |
|  #YetToCome|
|            |
+------------+
only showing top 20 rows



## Calculating the word count for Hashtags

In [29]:
wordCountsDF = (hashtags.groupBy('text').count())
wordCountsDF.show()

+--------------------+-----+
|                text|count|
+--------------------+-----+
|               #WSOP|    3|
|     #bentablesnimai|    3|
|                #NHK|    3|
|         #invitation|    3|
|                #mbc|    3|
|           #OhmNanon|    3|
|          #hairstyle|    9|
|       #ElectricGrid|    3|
|        #SolarPanels|    3|
|#AmberHeardIsAHus...|    3|
|        #riverratgsm|    3|
|                #FBA|    3|
|          #beautiful|    3|
|           #DonBelle|    3|
|             #PLUTON|   96|
|              #AYAKA|   21|
|           #GayBoysX|    3|
|          #aespaflop|    3|
|                 #DM|    3|
|             #TikTok|    6|
+--------------------+-----+
only showing top 20 rows



# Calculating the word count for URLs

In [30]:
wordCountsDF1 = (urls.groupBy('text').count())
wordCountsDF1.show()

+--------------------+-----+
|                text|count|
+--------------------+-----+
|https://t.co/dnEg...|    3|
|https://t.co/H2pU...|    3|
|https://t.co/EHOT...|    3|
|https://t.co/NvPb...|    3|
|https://t.co/XtGP...|    3|
|https://t.co/tSxe...|    3|
|https://t.co/EJAi...|    3|
|https://t.co/CkfU...|    3|
|https://t.co/3jEF...|    3|
|https://t.co/Fzii...|    3|
|https://t.co/tMa8...|    3|
|https://t.co/A2P7...|    6|
|https://t.co/wuX9...|    3|
|https://t.co/4gRB...|    3|
|https://t.co/x61R...|    3|
|https://t.co/D6oT...|    3|
|https://t.co/hOqS...|    3|
|https://t.co/lYpW...|    3|
|https://t.co/jHFI...|    3|
|https://t.co/ZY8Z...|    3|
+--------------------+-----+
only showing top 20 rows

