In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
#from textblob import TextBlob

In [5]:
spark = SparkSession.builder.appName("CSVtoTable").getOrCreate()

df = (spark.read.format("csv")
.option('mode','DROPMALFORMED')
.options(header = True, inferSchema = True, sep=",",multiLine=True)
.load("./../data/webscraped_WESTROCK_CO.csv")
.cache()) # Keep the dataframe in memory for faster processing 

In [6]:
#Cleaning des données

#Dates
df = df.na.fill(0, subset=['ReplyCount','RetweetCount','LikeCount'])
#Fillna de reply, retweet et like en à

df = df.select(*[col(c).cast("integer").alias(c) if c in ['ReplyCount','RetweetCount','LikeCount'] else c for c in df.columns])
df = df.withColumn("PostDate", to_date(df["PostDate"]))

#Cleaning du texte
df = df.withColumn("TweetText", trim(regexp_replace("TweetText", "[\n]+", " ")))
df = df.withColumn("TweetText", trim(regexp_replace("TweetText", "https?://[^ ]+", "")))
df = df.withColumn("TweetText", trim(regexp_replace("TweetText", "@[^ ]+", "")))
df = df.withColumn("TweetText", trim(regexp_replace("TweetText", "#[^ ]+", "")))
df = df.withColumn("TweetText", trim(regexp_replace("TweetText", "[^A-Za-z0-9 ]", "")))
df = df.withColumn("TweetText", regexp_extract("TweetText", "(2021|2017|2018|2019|2020|2022).*", 0))
df = df.withColumn("TweetText", regexp_replace("TweetText", "[^a-zA-Z\\s]", ""))
df = df.na.drop(subset="TweetText") 

#On enlève le @ du handle
df = df.withColumn("Handle", trim(regexp_replace("Handle", "@", "")))

In [7]:
""" 
A ajouter en spark : 

# Repeating words like hurrrryyyyyy
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE);
def rpt_repl(match):
	return match.group(1)+match.group(1)

def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
	for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
																	"@", "%", "^", "*", "(", ")", "{", "}",
																	"[", "]", "|", "/", "\\", ">", "<", "-",
																	"!", "?", ".", "'",
																	"--", "---", "#"]):
		tweets.loc[:, "text"].replace(remove, "", inplace=True)
	return tweets


"""
    

' \nA ajouter en spark : \n\n# Repeating words like hurrrryyyyyy\nrpt_regex = re.compile(r"(.)\x01{1,}", re.IGNORECASE);\ndef rpt_repl(match):\n\treturn match.group(1)+match.group(1)\n\ndef remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words\n\tfor remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", """, "=", "&", ";", "%", "$",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"@", "%", "^", "*", "(", ")", "{", "}",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"[", "]", "|", "/", "\\", ">", "<", "-",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"!", "?", ".", "\'",\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"--", "---", "#"]):\n\t\ttweets.loc[:, "text"].replace(remove, "", inplace=True)\n\treturn tweets\n\n\n'

In [8]:
df.printSchema()


root
 |-- User: string (nullable = true)
 |-- Handle: string (nullable = true)
 |-- PostDate: date (nullable = true)
 |-- TweetText: string (nullable = true)
 |-- ReplyCount: integer (nullable = true)
 |-- RetweetCount: integer (nullable = true)
 |-- LikeCount: integer (nullable = true)



In [9]:
df.dtypes


[('User', 'string'),
 ('Handle', 'string'),
 ('PostDate', 'date'),
 ('TweetText', 'string'),
 ('ReplyCount', 'int'),
 ('RetweetCount', 'int'),
 ('LikeCount', 'int')]

In [10]:
df.count()

4

In [11]:
df.show()

+----------------+--------------+----------+--------------------+----------+------------+---------+
|            User|        Handle|  PostDate|           TweetText|ReplyCount|RetweetCount|LikeCount|
+----------------+--------------+----------+--------------------+----------+------------+---------+
|   Kwhen Finance|  kwhenfinance|2022-09-24|WestRock Co Share...|         0|        null|        0|
|True Market News|TrueMarketNews|2022-09-22|WestRock Co WRK W...|         1|        null|        2|
|   Kwhen Finance|  kwhenfinance|2022-09-22|WestRock Co Share...|         0|        null|        0|
|   Investor News|  newsfilterio|2022-09-22|WEST Stephens  Co...|         0|        null|        0|
+----------------+--------------+----------+--------------------+----------+------------+---------+



In [12]:
df.groupBy("ReplyCount").count().show()

+----------+-----+
|ReplyCount|count|
+----------+-----+
|         1|    1|
|         0|    3|
+----------+-----+



In [13]:
df.groupBy("RetweetCount").count().show()

+------------+-----+
|RetweetCount|count|
+------------+-----+
|        null|    4|
+------------+-----+



In [14]:
df.groupBy("LikeCount").count().show()
#ponderer le poids du sentiment par rapport au nombre de like

+---------+-----+
|LikeCount|count|
+---------+-----+
|        2|    1|
|        0|    3|
+---------+-----+

