In [0]:
# read all csv files into a single RDD
import os.path

# reads all filenames that match the patter. Currently only two within the DBFS, 0819 and 0820
files = '/FileStore/shared_uploads/logan.perrydin@ucalgary.ca/*_UkraineCombinedTweetsDeduped.csv'
tweets = spark.read.csv(files,header=True,sep=",",multiLine=True)

In [0]:
# remove columns not needed
dropped_tweets = tweets.drop("original_tweet_id", \
                            "original_tweet_userid", \
                            "original_tweet_username", \
                            "in_reply_to_status_id", \
                            "in_reply_to_user_id", \
                            "in_reply_to_screen_name", \
                            "is_quote_status", \
                            "quoted_status_id", \
                            "quoted_status_userid", \
                            "quoted_status_username", \
                            "extractedts")

dropped_tweets.show(1)

dropped_tweets.first()['text']

+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|_c0|   userid|      username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|
+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|  0|173212647|JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|1560416252937617411|2022-08-19 00:00:00|           0|Dear vaccine advo...|[{'text': 'Pfizer...|      en

In [0]:
# get only english tweets
english_tweets = dropped_tweets.filter(dropped_tweets.language == "en" )
english_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|1560416252937617411|2022-08-19 00:00:00|           0|Dear v

In [0]:
# get only non-retweets
non_rt_tweets = english_tweets.filter(english_tweets.is_retweet == "False")
non_rt_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|1560416252937617411|2022-08-19 00:00:00|           0|Dear v

In [0]:
# get average number of tweets per day from each user

from pyspark.sql.functions import col, to_date
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# make copy of dataframe and convert each tweet created time stamp into a time stamp object
df = non_rt_tweets
df = df.withColumn("date", to_date(col("tweetcreatedts")))

# create window partitioned by user and ordered by date
window = Window.partitionBy("username").orderBy("date")

# Calculate the number of tweets per day for each user
df = df.withColumn("daily_tweet_count", F.count("text").over(window))

# Calculate the average number of tweets per day for each user
result_df = df.groupBy("username").agg(
    F.avg("daily_tweet_count").alias("avg_tweets_per_day")
)

#result_df.show()
result_df.sample(0.5).show()


+---------------+------------------+
|       username|avg_tweets_per_day|
+---------------+------------------+
|       01_coins|               1.0|
|      0_weimar2|               1.0|
|    2Freelancer|               1.0|
|         3dKan3|               1.5|
|    4HumanUnity|               1.0|
|      50YOFrosh|               1.0|
|        5tuxnet|               2.0|
|      64BakerSt|               1.0|
|        73RDARM|               1.5|
|    77WABCradio|               1.0|
|7H2mBetXwVbzV8N|2.3333333333333335|
|     8tracBrown|               1.0|
|       99erNews|               4.0|
|    AB1solution|               1.0|
|   ACAPSproject|               1.0|
|      ACLEDINFO|               1.5|
|ADFCFUTUREFOREX|               1.0|
|      AE_ValMan|               1.0|
|      AIReDaily|               1.0|
|     AK47319026|               1.0|
+---------------+------------------+
only showing top 20 rows



In [0]:
%sh pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Installing collected packages: tqdm, regex, nltk, textblob
Successfully installed nltk-3.8.1 regex-2023.10.3 textblob-0.17.1 tqdm-4.66.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-a950f82f-011d-4b97-ac1a-18f482849e78/bin/python -m pip install --upgrade pip' command.


In [0]:
# calculate average sentiment for each user

from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType
from textblob import TextBlob

# pass this function to the tweet column
def analyze_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

# Register the sentiment analysis function as a Spark UDF
sentiment_analysis_udf = udf(analyze_sentiment, FloatType())

# Apply sentiment analysis to each tweet and create a new column
df = df.withColumn("sentiment_score", sentiment_analysis_udf(col("text")))

# Calculate the average sentimentality for each user
result_df = df.groupBy("username").agg(
    F.avg("sentiment_score").alias("avg_sentimentality")
)

result_df.show()

+---------------+--------------------+
|       username|  avg_sentimentality|
+---------------+--------------------+
|ArthurM40330824|0.015348638807024275|
|         radezz|-0.01547618742500...|
|    Readone_001|              -0.125|
|     sunnypeony|  -0.550000011920929|
|  thetribunechd|-0.02500000037252903|
|       yugan___|0.022975646345964017|
| bloomsbury1918|-0.03951785632719596|
|    pratik_dhal| 0.03333333507180214|
|     EMBUkraine|-0.01500000059604...|
|      ETTelecom|                 0.0|
|        Trontir| 0.19270833395421505|
|    revishvilig| 0.07142857302512441|
|        WebVixn|-0.04374999925494194|
|  f_o_r_Ukraine|-0.00154735900562...|
|madeleinemaste4| 0.29121121764183044|
|   ChrishaModis| 0.33312290410200757|
|      godabalta|                 0.0|
|UkraineNowMedia|                 0.0|
|   jay_dee_akaa|  0.6000000238418579|
|      Hainanftp| 0.07500000298023224|
+---------------+--------------------+
only showing top 20 rows



In [0]:
# Normalize following and followers
from pyspark.sql.functions import col, mean, count

# Make sure you operate on unique users, in case of duplicates get the average for the user
df_unique_users = english_tweets.groupBy("username").agg(
    mean("followers").alias("followers"),
    mean("following").alias("following"),
    count("username")
)

# Calculate the mean and stddev for followers and following
followers_mean = df_unique_users.agg({"followers": "mean"}).collect()[0][0]
followers_stddev = df_unique_users.agg({"followers": "stddev"}).collect()[0][0]
following_mean = df_unique_users.agg({"following": "mean"}).collect()[0][0]
following_stddev = df_unique_users.agg({"following": "stddev"}).collect()[0][0]

# Calculate the z_scores for each entry
df_unique_users = df_unique_users.withColumn("followers_zscore", (col("followers") - followers_mean) / followers_stddev)
df_unique_users = df_unique_users.withColumn("following_zscore", (col("following") - following_mean) / following_stddev)

# Calculate normalized values
# 99.7% of the data lie within 3 standard deviations of the mean, these values will be between 0-100, outliers will be outside this range
df_unique_users = df_unique_users.withColumn("followers_normalized", ((col("followers_zscore") + 3) / 6)*100)
df_unique_users = df_unique_users.withColumn("following_normalized", ((col("following_zscore") + 3) / 6)*100)

# Drop unnecessary columns
df_normalized_follow = df_unique_users.drop("count(username)", "followers_zscore", "following_zscore", "followers", "following")

display(df_normalized_follow)

In [0]:
from pyspark.sql.functions import col, when, expr, count, sum

df_retweet_percentage = english_tweets.groupBy("username").agg(sum((col("is_retweet") == "True").cast("int")).alias("retweet_true_count"), sum((col("is_retweet") == "False").cast("int")).alias("retweet_false_count"))
df_retweet_percentage = df_retweet_percentage.withColumn("percentage_of_retweets", 100*(col("retweet_true_count") / (col("retweet_true_count") + col("retweet_false_count"))))
# percentage of retweets of a user in comparison to total tweets of the user(including retweets)
display(df_retweet_percentage)

In [0]:
from pyspark.sql.functions import col, unix_timestamp, current_timestamp, expr
from pyspark.sql.types import LongType

current_time = current_timestamp()

# Truncate the trailing 0s from the usercreatedts field
df_dates_transformed = english_tweets.withColumn("usercreatedts_truncated", expr("substring(usercreatedts, 1, 19)"))

df_account_age = df_dates_transformed.select("username", "usercreatedts_truncated").distinct().groupBy("username", "usercreatedts_truncated").agg(
    (unix_timestamp(current_time) - unix_timestamp("usercreatedts_truncated")).cast(LongType()).alias("account_age")
)
# Calculate the mean and stddev for account age
account_age_mean = df_account_age.agg({"account_age": "mean"}).collect()[0][0]
account_age_stddev = df_account_age.agg({"account_age": "stddev"}).collect()[0][0]

df_account_age = df_account_age.withColumn("account_age_zscore", (col("account_age") - account_age_mean) / account_age_stddev)
df_account_age = df_account_age.withColumn("account_age_normalized", ((col("account_age_zscore") + 3) / 6)*100)
df_account_age = df_account_age.drop("account_age_zscore")
# normalized account age
display(df_account_age)

In [0]:
# create key-value objects:  key is username, value is a tuple with info of each tweet
# turn the dataframe into an RDD into order to use map
user_tweets = non_rt_tweets.rdd.map(lambda x : ((x.username), \
                                                (x.acctdesc, \
                                                x.location, \
                                                x.following, \
                                                x.followers, \
                                                x.totaltweets, \
                                                x.usercreatedts, \
                                                x.text, \
                                                x.tweetcreatedts, \
                                                x.retweetcount, \
                                                x.hashtags, \
                                                x.favorite_count)))


# group all tweets by same account together
user_tweets_grouped = user_tweets.groupByKey()

for record in user_tweets_grouped.take(5):
    print("\nUser\n")
    print("Username: " + record[0])
    for tweet in record[1]:
        print("\nTweet:")
        print(tweet[6])


User

Username: ShelterAnimalUA

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ItTfy
https://t.co/I9dbwRrtg0
https://t.co/71pErM8xBZ

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/HLEnTp9yk7

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ILu76
https://t.co/I9dbwRJ47y
https://t.co/71pErLQWdp

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/Qd9JTcDVRY

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ILu76
https://t.co/I9dbwRJ47y
https://t.co/71pErLQWdp

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/zXSzcodN8P

Tweet:
Animal shelter Dogs and Cats, we