In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, hour
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a Spark session
spark = SparkSession.builder \
    .appName("Social Media Engagement Prediction") \
    .getOrCreate()

# Load data into a DataFrame
data_path = "sentimentdataset.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Encode 'Sentiment' column to numeric values
indexer = StringIndexer(inputCol="Sentiment", outputCol="label")
df = indexer.fit(df).transform(df)

# Convert 'Timestamp' column to datetime and extract relevant features
df = df.withColumn("Timestamp", col("Timestamp").cast("timestamp"))
df = df.withColumn("Year", year("Timestamp"))
df = df.withColumn("Month", month("Timestamp"))
df = df.withColumn("Day", dayofmonth("Timestamp"))
df = df.withColumn("Hour", hour("Timestamp"))

# Drop unnecessary columns
drop_columns = ['Unnamed: 0', 'Text', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Country']
df = df.drop(*drop_columns)

# Define features and label
features = ['Retweets', 'Likes']
label = 'label'

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=features, outputCol="features")
df = assembler.transform(df)

# Normalize features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

# Split data into train and test sets
(train_data, test_data) = df.randomSplit([0.8, 0.2])

# Initialize and train the linear regression model
lr = LinearRegression(featuresCol="scaled_features", labelCol=label)
lrModel = lr.fit(train_data)

# Make predictions
predictions = lrModel.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol=label, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")




Root Mean Squared Error (RMSE) on test data = 77.64171277633882


In [3]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f0a6568daf10ee7fc45bae9c58cfe3109b1ee4466716aaef309e9cfe09918728
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
