In [0]:
# read all csv files into a single RDD
import os.path

# reads all filenames that match the patter. Currently only two within the DBFS, 0819 and 0820
files = '/FileStore/shared_uploads/logan.perrydin@ucalgary.ca/*_UkraineCombinedTweetsDeduped.csv'
tweets = spark.read.csv(files,header=True,sep=",",multiLine=True)

In [0]:
# remove columns not needed
dropped_tweets = tweets.drop("original_tweet_id", \
                            "original_tweet_userid", \
                            "original_tweet_username", \
                            "in_reply_to_status_id", \
                            "in_reply_to_user_id", \
                            "in_reply_to_screen_name", \
                            "quoted_status_id", \
                            "quoted_status_userid", \
                            "quoted_status_username", \
                            "extractedts")

dropped_tweets.show(1)

dropped_tweets.first()['text']

+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|   userid|      username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|173212647|JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|1560416252937617411|2022-08-19 00:00:00|           0|De

In [0]:
# get only english tweets
english_tweets = dropped_tweets.filter(dropped_tweets.language == "en" )
english_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|15604162529

In [0]:
# get only non-retweets
non_rt_tweets = english_tweets.filter(english_tweets.is_retweet == "False")
non_rt_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|15604162529

In [0]:
# get average number of tweets per day from each user

from pyspark.sql.functions import col, to_date
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# make copy of dataframe and convert each tweet created time stamp into a time stamp object
df = non_rt_tweets
df = df.withColumn("date", to_date(col("tweetcreatedts")))

# create window partitioned by user and ordered by date
window = Window.partitionBy("username").orderBy("date")

# Calculate the number of tweets per day for each user
df = df.withColumn("daily_tweet_count", F.count("text").over(window))

# Calculate the average number of tweets per day for each user
avg_tweet_volume = df.groupBy("username").agg(
    F.avg("daily_tweet_count").alias("avg_tweets_per_day")
)

#result_df.show()
avg_tweet_volume.sample(0.5).show()


+---------------+------------------+
|       username|avg_tweets_per_day|
+---------------+------------------+
|    1taiwantalk|               1.0|
|   247ChinaNews|               3.0|
|    2Freelancer|               1.0|
|   2ironichna4u|               1.0|
|     3DTechPrep|               1.0|
|         3dKan3|               1.5|
|4chan_Soc__Leak|               1.0|
|        5tuxnet|               2.0|
|    77WABCradio|               1.0|
|     8tracBrown|               1.0|
|       918101el|               1.0|
|       99erNews|               4.0|
|            A21|               1.0|
|    AB1solution|               1.0|
|ADFCFUTUREFOREX|               1.0|
|      AE_ValMan|               1.0|
|      AMRAADfdg|               4.0|
|      ANFTHONY_|               1.0|
|  APDiploWriter|               1.0|
|   ATam47967776|               1.0|
+---------------+------------------+
only showing top 20 rows



In [0]:
%sh pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Installing collected packages: tqdm, regex, nltk, textblob
Successfully installed nltk-3.8.1 regex-2023.10.3 textblob-0.17.1 tqdm-4.66.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-a03c6d4d-d67f-46c9-a227-0fc9b94f099b/bin/python -m pip install --upgrade pip' command.


In [0]:
# calculate average sentiment for each user

from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType
from textblob import TextBlob

# pass this function to the tweet column
def analyze_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

# Register the sentiment analysis function as a Spark UDF
sentiment_analysis_udf = udf(analyze_sentiment, FloatType())

# Apply sentiment analysis to each tweet and create a new column
df = df.withColumn("sentiment_score", sentiment_analysis_udf(col("text")))

# Calculate the average sentimentality for each user
avg_tweet_sentimentality = df.groupBy("username").agg(
    F.avg("sentiment_score").alias("avg_sentimentality")
)

avg_tweet_sentimentality.show()

+---------------+--------------------+
|       username|  avg_sentimentality|
+---------------+--------------------+
|ArthurM40330824|0.015348638807024275|
|         radezz|-0.01547618742500...|
|    Readone_001|              -0.125|
|     sunnypeony|  -0.550000011920929|
|  thetribunechd|-0.02500000037252903|
|       yugan___|0.022975646345964017|
| bloomsbury1918|-0.03951785632719596|
|    pratik_dhal| 0.03333333507180214|
|     EMBUkraine|-0.01500000059604...|
|      ETTelecom|                 0.0|
|        Trontir| 0.19270833395421505|
|    revishvilig| 0.07142857302512441|
|        WebVixn|-0.04374999925494194|
|  f_o_r_Ukraine|-0.00154735900562...|
|madeleinemaste4| 0.29121121764183044|
|   ChrishaModis| 0.33312290410200757|
|      godabalta|                 0.0|
|UkraineNowMedia|                 0.0|
|   jay_dee_akaa|  0.6000000238418579|
|      Hainanftp| 0.07500000298023224|
+---------------+--------------------+
only showing top 20 rows



In [0]:
# Normalize following and followers
from pyspark.sql.functions import col, mean, count

# Make sure you operate on unique users, in case of duplicates get the average for the user
df_unique_users = english_tweets.groupBy("username").agg(
    mean("followers").alias("followers"),
    mean("following").alias("following"),
    count("username")
)

# Calculate the mean and stddev for followers and following
followers_mean = df_unique_users.agg({"followers": "mean"}).collect()[0][0]
followers_stddev = df_unique_users.agg({"followers": "stddev"}).collect()[0][0]
following_mean = df_unique_users.agg({"following": "mean"}).collect()[0][0]
following_stddev = df_unique_users.agg({"following": "stddev"}).collect()[0][0]

# Calculate the z_scores for each entry
df_unique_users = df_unique_users.withColumn("followers_zscore", (col("followers") - followers_mean) / followers_stddev)
df_unique_users = df_unique_users.withColumn("following_zscore", (col("following") - following_mean) / following_stddev)

# Calculate normalized values
# 99.7% of the data lie within 3 standard deviations of the mean, these values will be between 0-100, outliers will be outside this range
df_unique_users = df_unique_users.withColumn("followers_normalized", ((col("followers_zscore") + 3) / 6)*100)
df_unique_users = df_unique_users.withColumn("following_normalized", ((col("following_zscore") + 3) / 6)*100)

# Drop unnecessary columns
df_normalized_follow = df_unique_users.drop("count(username)", "followers_zscore", "following_zscore", "followers", "following")

display(df_normalized_follow)

username,followers_normalized,following_normalized
ArthurM40330824,49.14803480490801,47.40314593053605
radezz,49.17389742261951,53.24573192731127
Readone_001,49.08238604188917,46.65123623879844
sunnypeony,49.10863418426452,46.862207444634976
thetribunechd,58.09684750528555,46.88160709574638
yugan___,49.10165247425509,46.51786363740753
bloomsbury1918,49.07948851967892,46.09349626934553
pratik_dhal,49.08659028980211,46.188069568513626
EMBUkraine,49.185560558238976,46.84038283713465
ETTelecom,51.57641408083378,46.07409661823412


In [0]:
from pyspark.sql.functions import col, when, expr, count, sum

df_quote_percentage = english_tweets.groupBy("username").agg(sum((col("is_quote_status") == "True").cast("int")).alias("is_quote_status_true_count"), sum((col("is_quote_status") == "False").cast("int")).alias("is_quote_status_false_count"))
df_quote_percentage = df_quote_percentage.withColumn("percentage_of_quotes", 100*(col("is_quote_status_true_count") / (col("is_quote_status_true_count") + col("is_quote_status_false_count"))))
df_quote_percentage = df_quote_percentage.drop('is_quote_status_true_count', 'is_quote_status_false_count')
# percentage of retweets of a user in comparison to total tweets of the user(including retweets)
display(df_quote_percentage)

username,percentage_of_quotes
ArthurM40330824,0.0
radezz,71.42857142857143
Readone_001,0.0
sunnypeony,100.0
thetribunechd,0.0
yugan___,3.7735849056603774
bloomsbury1918,0.0
pratik_dhal,0.0
EMBUkraine,0.0
ETTelecom,0.0


In [0]:
from pyspark.sql.functions import col, unix_timestamp, current_timestamp, expr
from pyspark.sql.types import LongType

current_time = current_timestamp()

# Truncate the trailing 0s from the usercreatedts field
df_dates_transformed = english_tweets.withColumn("usercreatedts_truncated", expr("substring(usercreatedts, 1, 19)"))

df_account_age = df_dates_transformed.select("username", "usercreatedts_truncated").distinct().groupBy("username", "usercreatedts_truncated").agg(
    (unix_timestamp(current_time) - unix_timestamp("usercreatedts_truncated")).cast(LongType()).alias("account_age")
)
# Calculate the mean and stddev for account age
account_age_mean = df_account_age.agg({"account_age": "mean"}).collect()[0][0]
account_age_stddev = df_account_age.agg({"account_age": "stddev"}).collect()[0][0]

df_account_age = df_account_age.withColumn("account_age_zscore", (col("account_age") - account_age_mean) / account_age_stddev)
df_account_age = df_account_age.withColumn("account_age_normalized", ((col("account_age_zscore") + 3) / 6)*100)
df_account_age = df_account_age.drop("account_age_zscore")
df_account_age = df_account_age.drop("usercreatedts_truncated", "account_age")
# normalized account age
display(df_account_age)

username,account_age_normalized
ppaolino286,30.492281476479
Life_Line_Media,35.12471741025116
ScienceDelivers,34.93561265386685
muzmuzbaltic,74.24272839855122
kandianrsha,34.72540019329999
nftstudio24,32.02088210849551
LordSobrius,30.41414357305873
MagZeit,30.94162354508067
Pinklyinturkey,31.675504509172903
SEPCambsUK,48.64129539393417


In [0]:
username_counts = english_tweets.groupBy("username").count()
username_counts = username_counts.withColumnRenamed("count", "tweet_count")
username_counts.show()

+---------------+-----------+
|       username|tweet_count|
+---------------+-----------+
|ArthurM40330824|         14|
|         radezz|          7|
|    Readone_001|          1|
|     sunnypeony|          1|
|  thetribunechd|          4|
|       yugan___|         53|
| bloomsbury1918|         30|
|    pratik_dhal|          1|
|     EMBUkraine|          2|
|      ETTelecom|          1|
|        Trontir|          4|
|    revishvilig|          7|
|        WebVixn|          1|
|  f_o_r_Ukraine|         37|
|madeleinemaste4|          4|
|   ChrishaModis|          6|
|      godabalta|          1|
|UkraineNowMedia|          1|
|   jay_dee_akaa|          2|
|      Hainanftp|          1|
+---------------+-----------+
only showing top 20 rows



In [0]:
df_final_non_rt = avg_tweet_volume.join(avg_tweet_sentimentality, on="username", how="inner")
df_final = df_final_non_rt.join(df_normalized_follow, on="username", how="left").join(df_quote_percentage, on="username", how="inner").join(df_account_age, on="username", how="inner").join(username_counts, on="username", how="inner")
display(df_final.count())

15761

In [0]:
# create key-value objects:  key is username, value is a tuple with info of each tweet
# turn the dataframe into an RDD into order to use map
user_tweets = non_rt_tweets.rdd.map(lambda x : ((x.username), \
                                                (x.acctdesc, \
                                                x.location, \
                                                x.following, \
                                                x.followers, \
                                                x.totaltweets, \
                                                x.usercreatedts, \
                                                x.text, \
                                                x.tweetcreatedts, \
                                                x.retweetcount, \
                                                x.hashtags, \
                                                x.favorite_count)))


# group all tweets by same account together
user_tweets_grouped = user_tweets.groupByKey()

for record in user_tweets_grouped.take(5):
    print("\nUser\n")
    print("Username: " + record[0])
    for tweet in record[1]:
        print("\nTweet:")
        print(tweet[6])


User

Username: ShelterAnimalUA

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ItTfy
https://t.co/I9dbwRrtg0
https://t.co/71pErM8xBZ

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/HLEnTp9yk7

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ILu76
https://t.co/I9dbwRJ47y
https://t.co/71pErLQWdp

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/Qd9JTcDVRY

Tweet:
Animal shelter Dogs and Cats, we need your help!
Raising funds food for animals.
PayPal: dogandcat.helper@gmail.com
https://t.co/Z3re0ILu76
https://t.co/I9dbwRJ47y
https://t.co/71pErLQWdp

#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny
#Dogsarefamily https://t.co/zXSzcodN8P

Tweet:
Animal shelter Dogs and Cats, we