In [4]:
#Import all necessary libraries
from pyspark.sql import SparkSession , functions as fun
from pyspark.sql.types import ArrayType, StringType


In [5]:
# Build (or reuse) the SparkSession used by every cell below.
# NOTE(review): the app is named 'sales' and a PostgreSQL JDBC jar is put on the
# driver classpath, but no visible cell uses JDBC — confirm both are still wanted.
spark = (
    SparkSession.builder
    .appName('sales')
    .config('spark.driver.extraClassPath', '/usr/lib/jvm/java-17-openjdk-amd64/lib/postgresql-42.5.0.jar')
    .getOrCreate()
)

In [6]:
# Load the tweets JSON file into a DataFrame, then preview the first rows
tweets_path = 'data/tweets.json'
twitter_data = spark.read.json(tweets_path)

twitter_data.show()

+--------------------+------------------+-----------------+--------------------+-------------------+
|             country|                id|            place|                text|               user|
+--------------------+------------------+-----------------+--------------------+-------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|
|       United States|572647819401670656|          Suwanee|Know what you don...|Collin A. Zimmerman|
|           Indonesia|572647831053312000|      Mario Riawa|Serasi ade haha @...|   Rinie Sy

In [7]:
# Print each column's name, type and nullability for the loaded tweets DataFrame
twitter_data.printSchema()

root
 |-- country: string (nullable = true)
 |-- id: string (nullable = true)
 |-- place: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)



1. Find all the tweets by user.

In [8]:
# Name of the user whose tweets should be listed
user = 'hafizzul'

# Keep only the rows whose `user` column equals the chosen name
tweets_by_user = twitter_data.where(fun.col('user') == user)
tweets_by_user.show()

+--------------------+------------------+------+--------------------+--------+
|             country|                id| place|                text|    user|
+--------------------+------------------+------+--------------------+--------+
|Negara Brunei Dar...|572606812081410048|Brunei|nigga in paris ht...|hafizzul|
+--------------------+------------------+------+--------------------+--------+



2. Find how many tweets each user has.

In [9]:
# Count tweets per user and list the most active users first
tweets_per_user = twitter_data.groupBy('user').count()
tweets_per_user.orderBy(fun.desc('count')).show()

+--------------------+-----+
|                user|count|
+--------------------+-----+
|       #QuissyUpSoon|  258|
|Inès Mendes Askiip ♥|  185|
|           #4Rentinc|  100|
|                  MV|   58|
|    williampriceking|   46|
|✌ Follow Me MAEJOR ✌|   44|
|    Phillthy McNasty|   43|
|       K.O.H.O.R.T.S|   41|
|  #AMNT KINGTAECRAZY|   41|
|        Ghafla.co.ke|   36|
|        Ully U Music|   35|
|            Codeclic|   33|
|  TagineDiningGlobal|   30|
|           Lord Dash|   30|
|      Herri Setiawan|   29|
|          Dell Feddi|   29|
|   Kidrauhl Forever❤|   25|
|     Trendsmap Paris|   23|
|      #TurnYaSneakUp|   22|
|                Bel |   19|
+--------------------+-----+
only showing top 20 rows



3. Find all the people mentioned in tweets.

In [10]:
def generate_usermentioned(text):
    """Return the user handles mentioned in a tweet, without the leading '@'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of handles in order of appearance. The list is empty when
        ``text`` is None or contains no mentions.
    """
    # The `text` column is nullable, so a row may hand us None instead of a string.
    if text is None:
        return []
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines), so a mention at the start of a new line is still found.
    handles = (token.lstrip('@') for token in text.split() if token.startswith('@'))
    # A bare '@' becomes an empty string once stripped; that is not a mention.
    return [handle for handle in handles if handle]
# Wrap the extractor as a Spark UDF returning array<string>, then add the
# per-tweet list of mentioned users as a new column.
extract_mentions_udf = fun.udf(generate_usermentioned, ArrayType(StringType()))
user_mentioned = twitter_data.withColumn('users_mentioned', extract_mentions_udf('text'))
user_mentioned.show()


+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|             country|                id|            place|                text|               user|     users_mentioned|
+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|[always_nidhi, Yo...|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|       [OnlyDancers]|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|                  []|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|                  []|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|    [IcelandNatural]|
|       United States|57

4. Count how many times each person is mentioned.

In [11]:
from pyspark.sql.functions import explode

In [12]:
# Flatten the per-tweet mention arrays to one row per mention and discard blanks
mention_col = fun.col('users_mentioned')
new_usermentioned_df = (
    user_mentioned
    .select(fun.explode('users_mentioned').alias('users_mentioned'))
    .filter(mention_col != '')
)

# Number of times each user is mentioned
times_person_mentioned = new_usermentioned_df.groupBy('users_mentioned').count()
times_person_mentioned.show(truncate=False)

+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|DjRockyUg      |1    |
|TrillHD        |1    |
|TimmysWell     |1    |
|brookie_baldwin|1    |
|TTTorrez       |2    |
|boytoyjesse    |1    |
|misstoriblack  |1    |
|globalstatmusic|1    |
|_fuckgio       |1    |
|PedroIvoChianca|1    |
|Cpiepz         |1    |
|avachristy3    |1    |
|lostbayouramble|1    |
|bellahadid     |1    |
|sawano_nZk's   |1    |
|marIboros      |1    |
|kochamjacksona |1    |
|WIOD           |2    |
|ShaelynCherie  |2    |
|KevinAnex      |1    |
+---------------+-----+
only showing top 20 rows



5. Find the 10 most mentioned persons.

In [13]:
# Rank mentioned users by how often they appear and keep the top ten.
# show() returns None, so the DataFrame is stored first and displayed separately;
# this keeps `ten_most_mentioned` usable by later cells.
ten_most_mentioned = (
    new_usermentioned_df
    .groupBy('users_mentioned')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)
ten_most_mentioned.show()


+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|    ShawnMendes|  189|
|  HIITMANonDECK|  100|
|officialdjjuice|   59|
|         MAEJOR|   45|
|    MR_JAYJONES|   41|
|       MeekMill|   35|
|MadisonElleBeer|   30|
|              …|   28|
|     DjLordDash|   27|
|     NICKIMINAJ|   25|
+---------------+-----+



6. Find all the hashtags mentioned in a tweet.

In [15]:
def generate_hashtags(text):
    """Return the hashtags found in a tweet, each keeping its leading '#'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of hashtags in order of appearance. The list is empty when
        ``text`` is None or contains no hashtags.
    """
    # The `text` column is nullable, so a row may hand us None instead of a string.
    if text is None:
        return []
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines). Tokens made only of '#' characters carry no tag and are skipped.
    return [
        token
        for token in text.split()
        if token.startswith('#') and token.lstrip('#')
    ]
# Wrap the extractor as a Spark UDF returning array<string>, then add the
# per-tweet list of hashtags as a new column.
extract_hashtags_udf = fun.udf(generate_hashtags, ArrayType(StringType()))
hashtags_mentioned = twitter_data.withColumn('hashtags_mentioned', extract_hashtags_udf('text'))
hashtags_mentioned.show()

+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|             country|                id|            place|                text|               user|  hashtags_mentioned|
+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|                  []|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|                  []|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|                  []|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|                  []|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|                  []|
|       United States|57

7. Count how many times each hashtag is mentioned.

In [16]:
# Flatten the per-tweet hashtag arrays to one row per hashtag and discard blanks
hashtag_col = fun.col('hashtags_mentioned')
new_hashtags_mentioned = (
    hashtags_mentioned
    .select(fun.explode('hashtags_mentioned').alias('hashtags_mentioned'))
    .filter(hashtag_col != '')
)

# Number of times each hashtag is used
times_each_hashtag_mentioned = new_hashtags_mentioned.groupBy('hashtags_mentioned').count()
times_each_hashtag_mentioned.show(truncate=False)


+--------------------+-----+
|hashtags_mentioned  |count|
+--------------------+-----+
|#2NE1               |3    |
|#musicLover         |1    |
|#IBMCloud           |2    |
|#flexrecordingstudio|1    |
|#Hottest            |1    |
|#VanessaBorn        |1    |
|#happychappy        |1    |
|#yyjevents          |1    |
|#LittleLionMan      |1    |
|#MBAMBADU           |7    |
|#misheardlyrics     |1    |
|#Indie              |2    |
|#family             |1    |
|#beautiful          |2    |
|#Waiter             |1    |
|#friend             |1    |
|#recuseimitaçoes    |1    |
|#airbnb             |1    |
|#BØRNS              |1    |
|#ChickCorea         |1    |
+--------------------+-----+
only showing top 20 rows



8. Find the 10 most popular Hashtags.

In [18]:
# Rank hashtags by how often they appear and keep the top ten.
# show() returns None, so the DataFrame is stored first and displayed separately;
# this keeps `ten_most_popular` usable by later cells.
ten_most_popular = (
    new_hashtags_mentioned
    .groupBy('hashtags_mentioned')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)
ten_most_popular.show()


+-------------------+-----+
| hashtags_mentioned|count|
+-------------------+-----+
|               #DME|  253|
|          #ROADBOYZ|  251|
|             #music|  236|
|             #Paris|  144|
|#QuissyUpSoon🔥🔥💯|  129|
|      #QuissyUpSoon|  120|
| #Trippythursdaymia|  100|
|             #Music|   84|
|    #MaejorMeAndYou|   44|
|              #IGGL|   41|
+-------------------+-----+



9. Find the top 5 countries which tweet the most.

In [19]:
# Count tweets per country and keep the five countries with the most tweets.
# show() returns None, so the DataFrame is stored first and displayed separately;
# this keeps `top_5_tweets` usable by later cells.
top_5_tweets = (
    twitter_data
    .groupBy('country')
    .count()
    .orderBy('count', ascending=False)
    .limit(5)
)
top_5_tweets.show()


+--------------+-----+
|       country|count|
+--------------+-----+
| United States| 4841|
|        France|  737|
|     Indonesia|  370|
|United Kingdom|  365|
|        Brasil|  256|
+--------------+-----+

