In [0]:
# read all csv files into a single RDD
import os.path

# reads all filenames that match the patter. Currently only two within the DBFS, 0819 and 0820
files = '/FileStore/tables/*_UkraineCombinedTweetsDeduped.csv'
tweets = spark.read.csv(files,header=True,sep=",",multiLine=True)

In [0]:
# remove columns not needed
dropped_tweets = tweets.drop("original_tweet_id", \
                            "original_tweet_userid", \
                            "original_tweet_username", \
                            "in_reply_to_status_id", \
                            "in_reply_to_user_id", \
                            "in_reply_to_screen_name", \
                            "quoted_status_id", \
                            "quoted_status_userid", \
                            "quoted_status_username", \
                            "extractedts")

dropped_tweets.show(1)

dropped_tweets.first()['text']

+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|   userid|      username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+---------+--------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|173212647|JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|1560416252937617411|2022-08-19 00:00:00|           0|De

In [0]:
# get only english tweets
english_tweets = dropped_tweets.filter(dropped_tweets.language == "en" )
english_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|15604162529

In [0]:
# get only non-retweets
non_rt_tweets = english_tweets.filter(english_tweets.is_retweet == "False")
non_rt_tweets.show(5)

+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|_c0|             userid|       username|            acctdesc|    location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|is_retweet|is_quote_status|
+---+-------------------+---------------+--------------------+------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+----------+---------------+
|  0|          173212647| JoeMokolobetsi|Yeshua Hamashiach...|Afrika Borwa|      219|      197|       4789|2010-07-31 19:09:...|15604162529

In [0]:
# get average number of tweets per day from each user

from pyspark.sql.functions import col, to_date
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# make copy of dataframe and convert each tweet created time stamp into a time stamp object
df = non_rt_tweets
df = df.withColumn("date", to_date(col("tweetcreatedts")))

# create window partitioned by user and ordered by date
window = Window.partitionBy("username").orderBy("date")

# Calculate the number of tweets per day for each user
df = df.withColumn("daily_tweet_count", F.count("text").over(window))

# Calculate the average number of tweets per day for each user
avg_tweet_volume = df.groupBy("username").agg(
    F.avg("daily_tweet_count").alias("avg_tweets_per_day")
)

#result_df.show()
avg_tweet_volume.sample(0.5).show()


+---------------+------------------+
|       username|avg_tweets_per_day|
+---------------+------------------+
|       09amarix|               1.0|
|      0_weimar2|               1.0|
|       1AmyChew|               8.0|
|    1taiwantalk|               1.0|
|    23idiocracy|               1.0|
|   247ChinaNews|               3.0|
|    2Freelancer|               1.0|
|   2ironichna4u|               1.0|
| 317Kaitlynn317|               1.5|
|     3DTechPrep|               1.0|
|         3dKan3|               1.5|
|    4HumanUnity|               1.0|
|4chan_Soc__Leak|               1.0|
|      50YOFrosh|               1.0|
|        5tuxnet|               2.0|
|      64BakerSt|               1.0|
|    77WABCradio|               1.0|
|     8tracBrown|               1.0|
|       93nights|               1.0|
|       99erNews|               4.0|
+---------------+------------------+
only showing top 20 rows



In [0]:
%sh pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Installing collected packages: tqdm, regex, nltk, textblob
Successfully installed nltk-3.8.1 regex-2023.10.3 textblob-0.17.1 tqdm-4.66.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-5429fe39-255a-4d21-b0ad-39592e1a9b5f/bin/python -m pip install --upgrade pip' command.


In [0]:
# # calculate average sentiment for each user

# from pyspark.sql.functions import col, udf
# from pyspark.sql.types import FloatType
# from textblob import TextBlob

# # pass this function to the tweet column
# def analyze_sentiment(tweet):
#     analysis = TextBlob(tweet)
#     return analysis.sentiment.polarity

# # Register the sentiment analysis function as a Spark UDF
# sentiment_analysis_udf = udf(analyze_sentiment, FloatType())

# # Apply sentiment analysis to each tweet and create a new column
# df = df.withColumn("sentiment_score", sentiment_analysis_udf(col("text")))

# # Calculate the average sentimentality for each user
# avg_tweet_sentimentality = df.groupBy("username").agg(
#     F.avg("sentiment_score").alias("avg_sentimentality")
# )

# avg_tweet_sentimentality.show()

+---------------+--------------------+
|       username|  avg_sentimentality|
+---------------+--------------------+
|ArthurM40330824|0.015348638807024275|
|         radezz|-0.01547618742500...|
|    Readone_001|              -0.125|
|     sunnypeony|  -0.550000011920929|
|  thetribunechd|-0.02500000037252903|
|       yugan___|0.022975646345964017|
| bloomsbury1918|-0.03951785632719596|
|    pratik_dhal| 0.03333333507180214|
|     EMBUkraine|-0.01500000059604...|
|      ETTelecom|                 0.0|
|        Trontir| 0.19270833395421505|
|    revishvilig| 0.07142857302512441|
|        WebVixn|-0.04374999925494194|
|  f_o_r_Ukraine|-0.00154735900562...|
|madeleinemaste4| 0.29121121764183044|
|   ChrishaModis| 0.33312290410200757|
|      godabalta|                 0.0|
|UkraineNowMedia|                 0.0|
|   jay_dee_akaa|  0.6000000238418579|
|      Hainanftp| 0.07500000298023224|
+---------------+--------------------+
only showing top 20 rows



In [0]:
from pyspark.sql.functions import col, udf, to_date
from pyspark.sql.types import FloatType
from textblob import TextBlob
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# get average number of tweets per day from each user

# make copy of dataframe and convert each tweet created time stamp into a time stamp object
df = non_rt_tweets
df = df.withColumn("date", to_date(col("tweetcreatedts")))

# create window partitioned by user and ordered by date
window = Window.partitionBy("username").orderBy("date")

# Calculate the number of tweets per day for each user
df = df.withColumn("daily_tweet_count", F.count("text").over(window))


# calculate average sentiment for each user

# pass this function to the tweet column
def analyze_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

# Register the sentiment analysis function as a Spark UDF
sentiment_analysis_udf = udf(analyze_sentiment, FloatType())

# Apply sentiment analysis to each tweet and create a new column
df = df.withColumn("sentiment_score", sentiment_analysis_udf(col("text")))


# Calculate the average sentimentality for each user and the average number of tweets per day
avg_tweet_sentimentality_and_volume = df.groupBy("username").agg(
    F.avg("sentiment_score").alias("avg_sentimentality"),
    F.avg("daily_tweet_count").alias("avg_tweets_per_day")

)

#result_df.show()
avg_tweet_sentimentality_and_volume.show()

+--------------+--------------------+------------------+
|      username|  avg_sentimentality|avg_tweets_per_day|
+--------------+--------------------+------------------+
|      01_coins| -0.4000000059604645|               1.0|
|      09amarix|  0.3928571343421936|               1.0|
|     0_weimar2|                 0.0|               1.0|
|    0xdefinews|                 0.0|               3.0|
|        1860rm|                 0.0|               2.0|
|      1AmyChew| 0.06062499957624823|               8.0|
|  1ghostofKiev| 0.15000000596046448|               1.0|
|   1taiwantalk|                 0.0|               1.0|
|   23idiocracy|                 0.0|               1.0|
|  247ChinaNews| 0.43939393758773804|               3.0|
|   2Freelancer|  0.4124999940395355|               1.0|
|          2emc| 0.09000000357627869|               1.0|
|  2ironichna4u|                 0.0|               1.0|
|317Kaitlynn317| 0.20909090340137482|               1.5|
|    3DTechPrep| 0.300000011920

In [0]:
# # Normalize following and followers
# from pyspark.sql.functions import col, mean, count

# # Make sure you operate on unique users, in case of duplicates get the average for the user
# df_unique_users = english_tweets.groupBy("username").agg(
#     mean("followers").alias("followers"),
#     mean("following").alias("following"),
#     count("username")
# )

# # Calculate the mean and stddev for followers and following
# followers_mean = df_unique_users.agg({"followers": "mean"}).collect()[0][0]
# followers_stddev = df_unique_users.agg({"followers": "stddev"}).collect()[0][0]
# following_mean = df_unique_users.agg({"following": "mean"}).collect()[0][0]
# following_stddev = df_unique_users.agg({"following": "stddev"}).collect()[0][0]

# # Calculate the z_scores for each entry
# df_unique_users = df_unique_users.withColumn("followers_zscore", (col("followers") - followers_mean) / followers_stddev)
# df_unique_users = df_unique_users.withColumn("following_zscore", (col("following") - following_mean) / following_stddev)

# # Calculate normalized values
# # 99.7% of the data lie within 3 standard deviations of the mean, these values will be between 0-100, outliers will be outside this range
# df_unique_users = df_unique_users.withColumn("followers_normalized", ((col("followers_zscore") + 3) / 6)*100)
# df_unique_users = df_unique_users.withColumn("following_normalized", ((col("following_zscore") + 3) / 6)*100)

# # Drop unnecessary columns
# df_normalized_follow = df_unique_users.drop("count(username)", "followers_zscore", "following_zscore", "followers", "following")

# # display(df_normalized_follow)

In [0]:
# from pyspark.sql.functions import col, when, expr, count, sum

# df_quote_percentage = english_tweets.groupBy("username").agg(sum((col("is_quote_status") == "True").cast("int")).alias("is_quote_status_true_count"), sum((col("is_quote_status") == "False").cast("int")).alias("is_quote_status_false_count"))
# df_quote_percentage = df_quote_percentage.withColumn("percentage_of_quotes", 100*(col("is_quote_status_true_count") / (col("is_quote_status_true_count") + col("is_quote_status_false_count"))))
# df_quote_percentage = df_quote_percentage.drop('is_quote_status_true_count', 'is_quote_status_false_count')
# # percentage of retweets of a user in comparison to total tweets of the user(including retweets)
# display(df_quote_percentage)

username,percentage_of_quotes
ArthurM40330824,0.0
radezz,71.42857142857143
Readone_001,0.0
sunnypeony,100.0
thetribunechd,0.0
yugan___,3.7735849056603774
bloomsbury1918,0.0
pratik_dhal,0.0
EMBUkraine,0.0
ETTelecom,0.0


In [0]:
# Normalize following and followers
from pyspark.sql.functions import col, mean, count, sum, expr

# Make sure you operate on unique users, in case of duplicates get the average for the user
df_unique_users = english_tweets.groupBy("username", "usercreatedts").agg(
    sum((col("is_quote_status") == "True").cast("int")).alias("is_quote_status_true_count"),
    sum((col("is_quote_status") == "False").cast("int")).alias("is_quote_status_false_count"),
    mean("followers").alias("followers"),
    mean("following").alias("following"),
    count("username").alias("tweet_count"),
    (unix_timestamp(current_time) - unix_timestamp(expr("substring(usercreatedts, 1, 19)"))).cast(LongType()).alias("account_age")
)

# Calculate the mean and stddev for followers and following
followers_mean = df_unique_users.agg({"followers": "mean"}).collect()[0][0]
followers_stddev = df_unique_users.agg({"followers": "stddev"}).collect()[0][0]
following_mean = df_unique_users.agg({"following": "mean"}).collect()[0][0]
following_stddev = df_unique_users.agg({"following": "stddev"}).collect()[0][0]

# Calculate the mean and stddev for account age
account_age_mean = df_unique_users.agg({"account_age": "mean"}).collect()[0][0]
account_age_stddev = df_unique_users.agg({"account_age": "stddev"}).collect()[0][0]

# Calculate the z_scores for each entry
df_unique_users = df_unique_users.withColumn("followers_zscore", (col("followers") - followers_mean) / followers_stddev)
df_unique_users = df_unique_users.withColumn("following_zscore", (col("following") - following_mean) / following_stddev)

df_unique_users = df_unique_users.withColumn("account_age_zscore", (col("account_age") - account_age_mean) / account_age_stddev)

# Calculate normalized values
# 99.7% of the data lie within 3 standard deviations of the mean, these values will be between 0-100, outliers will be outside this range
df_unique_users = df_unique_users.withColumn("followers_normalized", ((col("followers_zscore") + 3) / 6)*100)
df_unique_users = df_unique_users.withColumn("following_normalized", ((col("following_zscore") + 3) / 6)*100)

df_unique_users = df_unique_users.withColumn("account_age_normalized", ((col("account_age_zscore") + 3) / 6)*100)

# Unique users
df_unique_users = df_unique_users.withColumn("percentage_of_quotes", 100*(col("is_quote_status_true_count") / (col("is_quote_status_true_count") + col("is_quote_status_false_count"))))

# Drop unnecessary columns
df_normalized_follow = df_unique_users.drop("followers_zscore", "following_zscore", "followers", "following", "is_quote_status_true_count", "is_quote_status_false_count", "usercreatedts", "account_age_zscore", "account_age")

display(df_normalized_follow)

username,tweet_count,followers_normalized,following_normalized,account_age_normalized,percentage_of_quotes
OnlyCurrentNews,14,49.09964943052008,48.46458398420296,30.672828898248007,0.0
khurtovynny,2,49.07789772317132,46.24626852184785,29.80004136281547,100.0
PeceKocovski,1,49.08897648456351,46.45723972768438,64.5125523120462,0.0
danger_gamer75,1,49.15039259258891,49.53450938523111,61.71903406123679,0.0
WIONews,3,75.3151128415422,46.27536799851495,51.56865644400711,0.0
BizAvon1,1,49.08056798873764,46.12987061517941,37.6782506927964,0.0
SueG907,2,50.12396005523766,92.2525411325464,69.84663380923234,50.0
Gjmelio,2,49.101646042463294,47.61151896881302,51.045001113380536,0.0
JimGoughTrans,5,49.1051117062834,47.244865562807455,53.47755910376822,20.0
ElAmerican_,6,54.76457860064641,60.63151398035331,35.74066294813469,0.0


In [0]:
# from pyspark.sql.functions import col, unix_timestamp, current_timestamp, expr
# from pyspark.sql.types import LongType

# current_time = current_timestamp()

# # Truncate the trailing 0s from the usercreatedts field
# df_dates_transformed = english_tweets.withColumn("usercreatedts_truncated", expr("substring(usercreatedts, 1, 19)"))

# df_account_age = df_dates_transformed.select("username", "usercreatedts_truncated").distinct().groupBy("username", "usercreatedts_truncated").agg(
#     (unix_timestamp(current_time) - unix_timestamp("usercreatedts_truncated")).cast(LongType()).alias("account_age")
# )
# # Calculate the mean and stddev for account age
# account_age_mean = df_account_age.agg({"account_age": "mean"}).collect()[0][0]
# account_age_stddev = df_account_age.agg({"account_age": "stddev"}).collect()[0][0]

# df_account_age = df_account_age.withColumn("account_age_zscore", (col("account_age") - account_age_mean) / account_age_stddev)
# df_account_age = df_account_age.withColumn("account_age_normalized", ((col("account_age_zscore") + 3) / 6)*100)
# df_account_age = df_account_age.drop("account_age_zscore")
# df_account_age = df_account_age.drop("usercreatedts_truncated", "account_age")
# # normalized account age
# # display(df_account_age)

In [0]:
# username_counts = english_tweets.groupBy("username").count()
# username_counts = username_counts.withColumnRenamed("count", "tweet_count")
# # username_counts.show()

In [0]:
# df_final_non_rt = avg_tweet_volume.join(avg_tweet_sentimentality, on="username", how="inner")
# df_final = df_final_non_rt.join(df_normalized_follow, on="username", how="left").join(df_quote_percentage, on="username", how="inner").join(df_account_age, on="username", how="inner").join(username_counts, on="username", how="inner")
df_final = avg_tweet_sentimentality_and_volume.join(df_normalized_follow, on="username", how="left")
display(df_final)

username,avg_sentimentality,avg_tweets_per_day,tweet_count,followers_normalized,following_normalized,account_age_normalized,percentage_of_quotes
01_coins,-0.4000000059604645,1.0,1,49.08090887370356,46.16139504823545,32.46763592202936,0.0
09amarix,0.3928571343421936,1.0,1,49.08130657283045,47.090153345194,30.69803051983241,0.0
0_weimar2,0.0,1.0,1,49.09585099804276,49.064067845779526,35.02266136468726,100.0
0xdefinews,0.0,3.0,3,49.100055245955694,46.10562105129016,67.93371264691504,0.0
1860rm,0.0,2.0,2,49.08306781182101,46.413590512683726,64.73736649184744,0.0
1AmyChew,0.0606249995762482,8.0,8,49.80504086431527,47.44904689075501,48.68301594040063,0.0
1ghostofKiev,0.1500000059604644,1.0,1,49.1874354255515,46.34569173379381,29.58938267030143,0.0
1taiwantalk,0.0,1.0,1,49.11011135245013,46.77733397102258,61.53601112245896,0.0
23idiocracy,0.0,1.0,1,49.08079524538159,46.188069568513626,38.594212184530754,0.0
247ChinaNews,0.439393937587738,3.0,3,49.10607754702016,46.17836974295793,33.67074399434791,0.0


In [0]:
map_column = create_map(*[col(column).alias(column) for column in ["avg_tweets_per_day","avg_sentimentality","tweet_count","followers_normalized","following_normalized","account_age_normalized","percentage_of_quotes"]])

In [0]:
# create key-value objects:  key is username, value is a tuple with info of each tweet
# turn the dataframe into an RDD into order to use map
user_tweets = non_rt_tweets.rdd.map(lambda x : ((x.username), \
                                                (x.acctdesc, \
                                                x.location, \
                                                x.following, \
                                                x.followers, \
                                                x.totaltweets, \
                                                x.usercreatedts, \
                                                x.text, \
                                                x.tweetcreatedts, \
                                                x.retweetcount, \
                                                x.hashtags, \
                                                x.favorite_count)))


# group all tweets by same account together
user_tweets_grouped = user_tweets.groupByKey()

for record in user_tweets_grouped.take(5):
    print("\nUser\n")
    print("Username: " + record[0])
    for tweet in record[1]:
        print("\nTweet:")
        print(tweet[6])