In [0]:
# read all csv files into a single RDD
import os.path

# reads all filenames that match the patter. Currently only two within the DBFS, 0819 and 0820
files = '/FileStore/tables/*_UkraineCombinedTweetsDeduped.csv'
tweets = spark.read.csv(files,header=True,sep=",",multiLine=True)

In [0]:
# remove columns not needed
dropped_tweets = tweets.drop("original_tweet_id", \
                            "original_tweet_userid", \
                            "original_tweet_username", \
                            "in_reply_to_status_id", \
                            "in_reply_to_user_id", \
                            "in_reply_to_screen_name", \
                            "quoted_status_id", \
                            "quoted_status_userid", \
                            "quoted_status_username", \
                            "extractedts")

# dropped_tweets.show(1)

dropped_tweets.first()['text']

Out[9]: 'Dear vaccine advocate\n\nDo take the COVID19 mRNA shot and boosters, but do know that @OurWorldInData data shows it offers zero protection, actually accelerates death of vaccinated.\n\nRegards\n#Pfizer #AstraZeneca #Moderna #NWO #Agenda2030 #COP27 #Biden #Obama #Trudeau #Jacinda #life https://t.co/VTbfuqiDvu'

In [0]:
# get only english tweets
english_tweets = dropped_tweets.filter(dropped_tweets.language == "en" )
# english_tweets.show(5)

In [0]:
# get only non-retweets
non_rt_tweets = english_tweets.filter(english_tweets.is_retweet == "False")
# non_rt_tweets.show(5)

In [0]:
%sh pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Installing collected packages: tqdm, regex, nltk, textblob
Successfully installed nltk-3.8.1 regex-2023.10.3 textblob-0.17.1 tqdm-4.66.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-27513215-3076-4a27-a07a-ec7dd9fac90a/bin/python -m pip install --upgrade pip' command.
/bin/bash: line 1: fg: no job control


In [0]:
%sh pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-27513215-3076-4a27-a07a-ec7dd9fac90a/bin/python -m pip install --upgrade pip' command.


In [0]:
from pyspark.sql.functions import col, least, lit, log

def z_score_normalize(df, column, rangeVal = 100):
    mean = df.agg({column: "mean"}).collect()[0][0]
    stddev = df.agg({column: "stddev"}).collect()[0][0]
    df = df.withColumn(column + "_zscore", (col(column) - mean) / stddev)
    constant_column = lit(9)
    df = df.withColumn(column + "_normalized", ((least(col(column + "_zscore"), constant_column) + 3) / 6)*rangeVal)
    return df.drop(column + "_zscore", column)

In [0]:
from pyspark.sql.functions import col, udf, to_date, least, lit, log
from pyspark.sql.types import FloatType
from textblob import TextBlob
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# get average number of tweets per day from each user

# make copy of dataframe and convert each tweet created time stamp into a time stamp object
df = non_rt_tweets
df = df.withColumn("date", to_date(col("tweetcreatedts")))

# create window partitioned by user and ordered by date
window = Window.partitionBy("username").orderBy("date")

# Calculate the number of tweets per day for each user
df = df.withColumn("daily_tweet_count", F.count("text").over(window))

# calculate average sentiment for each user
analyzer = SentimentIntensityAnalyzer()
def get_positive_score(tweet):
    vs = analyzer.polarity_scores(tweet)
    return vs['pos']

def get_negative_score(tweet):
    vs = analyzer.polarity_scores(tweet)
    return vs['neg']

# Register the sentiment analysis function as a Spark UDF
sentiment_analysis_pos_udf = udf(get_positive_score, FloatType())
sentiment_analysis_neg_udf = udf(get_negative_score, FloatType())

# Apply sentiment analysis to each tweet and create a new column
df = df.withColumn("positivity", sentiment_analysis_pos_udf(col("text")))
df = df.withColumn("negativity", sentiment_analysis_neg_udf(col("text")))

# Calculate the average sentimentality for each user and the average number of tweets per day
avg_tweet_sentimentality_and_volume = df.groupBy("username").agg(
    # ((F.avg("sentiment_score") + 1) * 50 * 2).alias("avg_sentimentality"),
    F.avg("daily_tweet_count").alias("avg_tweets_per_day"),
    (F.avg("positivity") * 300).alias("avg_positivity"),
    (F.avg("negativity") * 300).alias("avg_negativity")
)

avg_tweet_sentimentality_and_volume = avg_tweet_sentimentality_and_volume.withColumn("avg_tweets_per_day", log(1.5, (col("avg_tweets_per_day") + 1)))

avg_tweet_sentimentality_and_volume = z_score_normalize(avg_tweet_sentimentality_and_volume, "avg_tweets_per_day", 70)

# avg_tweet_sentimentality_and_volume.display()

In [0]:
# Normalize following and followers
from pyspark.sql.functions import col, mean, count, sum, expr, least, lit, unix_timestamp, current_timestamp, log
from pyspark.sql.types import LongType

current_time = current_timestamp()

# Make sure you operate on unique users, in case of duplicates get the average for the user
df_unique_users = english_tweets.groupBy("username", "usercreatedts").agg(
    sum((col("is_quote_status") == "True").cast("int")).alias("is_quote_status_true_count"),
    sum((col("is_quote_status") == "False").cast("int")).alias("is_quote_status_false_count"),
    mean("followers").alias("followers"),
    mean("following").alias("following"),
    count("username").alias("tweet_count"),
    (unix_timestamp(current_time) - unix_timestamp(expr("substring(usercreatedts, 1, 19)"))).cast(LongType()).alias("account_age")
)

df_unique_users = df_unique_users.withColumn("followers", log(10.0, col("followers")))
df_unique_users = df_unique_users.withColumn("following", log(7.0, col("following")))
df_unique_users = df_unique_users.withColumn("tweet_count", log(2.0, col("tweet_count")))

df_unique_users = z_score_normalize(df_unique_users, "followers", 100)
df_unique_users = z_score_normalize(df_unique_users, "following", 100)
df_unique_users = z_score_normalize(df_unique_users, "account_age", 100)
df_unique_users = z_score_normalize(df_unique_users, "tweet_count", 100)

# Unique users
df_unique_users = df_unique_users.withColumn("percentage_of_quotes", 100*(col("is_quote_status_true_count") / (col("is_quote_status_true_count") + col("is_quote_status_false_count"))))

# display(df_normalized_follow)

In [0]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import re

#Returns Hashtags within a string as an array
def extractHastags(tweet):
    return re.findall(r"#(\w+)",tweet)


#Resgister extractHastags as a Spark UDF
extract_hashtags_udf = F.udf(extractHastags, ArrayType(StringType()))

#Drop unused columns
hashtag_df = english_tweets.drop("acctdesc", \
                                "location", \
                                "following", \
                                "followers", \
                                "totaltweets", \
                                "usercreatedts", \
                                "tweetcreatedts", \
                                "retweetcount", \
                                "language", \
                                "coordinates", \
                                "favorite_count", \
                                "is_retweet", \
                                "is_quote_status", \
                                "_c0", \
                                "userid", \
                                "tweetid", \
                                )

#Map Hashtags to users
hashtag_df = hashtag_df.withColumn("hashtags", extract_hashtags_udf(col("text")))

#Explode hastag list
hashtag_df = hashtag_df.select("username", F.explode("hashtags").alias("hashtag"))

#Count the number of each hashtag by user
grouped_hashtag_df = hashtag_df.groupBy("username", "hashtag").count()

#Calculate the max number of one hashtag for each user#
result_hashtag_df = grouped_hashtag_df.groupBy("username").agg(
    log(2.0, F.max("count")).alias("hashtags")
).orderBy("username")

hashtag_mean = result_hashtag_df.agg({"hashtags": "mean"}).collect()[0][0]
hashtag_stddev = result_hashtag_df.agg({"hashtags": "stddev"}).collect()[0][0]

result_hashtag_df = result_hashtag_df.withColumn("hashtags_zscore", (col("hashtags") - hashtag_mean) / hashtag_stddev)
result_hashtag_df = result_hashtag_df.withColumn("hashtags_normalized", ((least(col("hashtags_zscore"), constant_column) + 3) / 6)*100)
result_hashtag_df = result_hashtag_df.drop("hashtags_zscore", "hashtags")
#Display preview
# result_hashtag_df.display()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

#Drop unused columns
count_df = english_tweets.drop("acctdesc", \
                                "location", \
                                "following", \
                                "followers", \
                                "text", \
                                "usercreatedts", \
                                "tweetcreatedts", \
                                "retweetcount", \
                                "language", \
                                "coordinates", \
                                "favorite_count", \
                                "is_retweet", \
                                "is_quote_status", \
                                "_c0", \
                                "userid", \
                                "tweetid", \
                                "hashtags"
                                )

def calc_ratio(arr):
    count = arr[0]
    diff = arr[1]
    if diff:
        if count > diff:
            return 1.0
        else:
            return (count/diff)
    else:
        return 1.0

calc_ratio_udf = F.udf(calc_ratio, FloatType())

count_df1 = count_df.groupBy("username").count()

#Calculate the max number of one hashtag for each user
count_df2 = count_df.groupBy("username").agg(
    F.max("totaltweets")
).orderBy("username")

count_df3 = count_df.groupBy("username").agg(
    F.min("totaltweets")
).orderBy("username")

proportion_df = count_df1.join(count_df2,on="username",how="left")

range_df = proportion_df.join(count_df3,on="username",how="left")


range_df = range_df.withColumn("diff", col("max(totaltweets)")- col("min(totaltweets)"))
range_df = range_df.withColumn("ratio", calc_ratio_udf(F.array( col("count"),col("diff") ))*100)
range_df = range_df.drop("max(totaltweets)", "min(totaltweets)", "diff", "count")

# range_df.show()

In [0]:
# df_final_non_rt = avg_tweet_volume.join(avg_tweet_sentimentality, on="username", how="inner")
# df_final = df_final_non_rt.join(df_normalized_follow, on="username", how="left").join(df_quote_percentage, on="username", how="inner").join(df_account_age, on="username", how="inner").join(username_counts, on="username", how="inner")
df_final = avg_tweet_sentimentality_and_volume.join(df_normalized_follow, on="username", how="left").join(result_hashtag_df, on="username", how="inner").join(range_df, on="username", how="inner")
# display(df_final)

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

selected_columns = [col for col in df_final.columns if col not in ['username', 'userid']]
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
df_assembled = assembler.transform(df_final)

# Specify the number of clusters (k)
k = 8

# Create a KMeans instance
kmeans = KMeans().setK(k).setSeed(1)

# Fit the model to the assembled DataFrame
model = kmeans.fit(df_assembled)

# Get the cluster centers
centers = model.clusterCenters()

# Predict the cluster for each data point
predictions = model.transform(df_assembled)

# Display the results
print("Cluster Centers:")
for index, center in enumerate(centers):
    print(index, " ", center)

print("Predictions:")
# predictions.select("username", "features", "prediction").display()


Cluster Centers:
0   [13.93785496  6.36452676 30.95615617 57.94257142 50.32114511 50.90890801
 43.65106826  0.54141895 44.65799908 99.54696793]
1   [19.20533377 48.74568421 32.11143643 44.11587148 46.63950107 46.84978853
 45.55362103  0.9984337  46.29322616 99.05679845]
2   [ 29.85606252  25.38780931  37.39058825  64.68661733 200.
  39.19937924  53.35428681   2.64267172  53.58557445  89.19950815]
3   [ 20.91539457  28.2134293   73.42589508  55.4222461   47.76009057
  47.16463704 100.98406223  10.45328782 102.58907004  59.5698523 ]
4   [27.72668187 29.04193286 44.07683129 55.09824358 53.04171978 51.32763558
 67.17399522 17.81121027 62.50227556 16.319232  ]
5   [  9.14792589 103.54997706  30.29345087  58.49752908  51.61071281
  51.55101484  42.5957337    5.90546347  43.42954715  98.35421287]
6   [82.24014155  8.93129423 30.6880789  50.25952644 46.98643989 49.41534859
 43.18347648  1.7193316  43.84904493 99.02573493]
7   [33.43813285 28.09823371 30.93701412 52.76257383 55.25360506 53.7219