In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, avg, count

# Initialize Spark Session (getOrCreate reuses an already-running session)
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

# File path (update with the actual path of sentiment dataset)
file_path = "/content/SocialMediaSentimentAnalysiscsv.csv"

# Load the CSV, treating the first row as column names and inferring types
raw_data = spark.read.csv(file_path, header=True, inferSchema=True)

# The schema contains two leading integer columns, "_c0" and "Unnamed: 0".
# NOTE(review): these look like row indices left over from an earlier export;
# if so they differ on every row and a plain dropDuplicates() can never match
# two records. Compare rows on the remaining columns only. The index columns
# themselves are kept, so the schema is unchanged.
index_columns = {"_c0", "Unnamed: 0"}
content_columns = [name for name in raw_data.columns if name not in index_columns]

# Clean: remove duplicate records, then drop rows with a null in any column.
# Fall back to comparing every column if nothing but index columns exists.
data = raw_data.dropDuplicates(content_columns or None).na.drop()

# Display schema
data.printSchema()

# Display data
print("Sample Data:")
data.show(5)

# 1. Top 10 Sentiments by Likes and Retweets
# Average the engagement metrics per sentiment label and keep the ten best.
# Ranking uses average likes first and average retweets second, so retweets
# actually take part in the ordering the heading promises; the sentiment
# label is the final tie-break so limit(10) picks the same rows on every run.
print("Top 10 Sentiments by Likes and Retweets:")
top_sentiments = (
    data.groupBy("Sentiment")
    .agg(
        avg(col("Likes")).alias("avg_likes"),
        avg(col("Retweets")).alias("avg_retweets"),
    )
    .orderBy(desc("avg_likes"), desc("avg_retweets"), "Sentiment")
    .limit(10)
)

top_sentiments.show()

# 2. Sentiment Distribution by Platform
# Count the rows for every (platform, sentiment) pair, largest counts first.
print("Sentiment Distribution by Platform:")
rows_per_platform_sentiment = data.groupBy("Platform", "Sentiment").agg(
    count("*").alias("sentiment_count")
)
sentiment_platform = rows_per_platform_sentiment.sort(
    col("sentiment_count").desc()
)

sentiment_platform.show()

# 3. Average Likes and Retweets by Country
# Per-country means of both engagement metrics, ranked by mean likes.
print("Average Likes and Retweets by Country:")
mean_likes = avg("Likes").alias("avg_likes")
mean_retweets = avg("Retweets").alias("avg_retweets")
country_analysis = (
    data.groupBy("Country")
    .agg(mean_likes, mean_retweets)
    .sort(col("avg_likes").desc())
)

country_analysis.show()

# 4. Sentiment Trends by Hour
# Count the rows for every (hour, sentiment) pair, listed in ascending hour order.
print("Sentiment Trends by Hour:")
rows_per_hour_sentiment = data.groupBy("Hour", "Sentiment").agg(
    count("*").alias("sentiment_count")
)
hourly_sentiment = rows_per_hour_sentiment.sort("Hour")

hourly_sentiment.show()

# 5. Most Popular Platforms by Engagement (Likes + Retweets)
# Engagement of a row is its likes plus its retweets; platforms are ranked
# by the mean of that sum.
print("Most Popular Platforms by Engagement:")
engagement = col("Likes") + col("Retweets")
platform_engagement = (
    data.groupBy("Platform")
    .agg(avg(engagement).alias("avg_engagement"))
    .sort(col("avg_engagement").desc())
)

platform_engagement.show()

root
 |-- _c0: integer (nullable = true)
 |-- Unnamed: 0: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- Sentiment: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- User: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Hashtags: string (nullable = true)
 |-- Retweets: double (nullable = true)
 |-- Likes: double (nullable = true)
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)

Sample Data:
+---+----------+--------------------+-----------------+-------------------+--------------------+-----------+--------------------+--------+-----+--------------+----+-----+---+----+
|_c0|Unnamed: 0|                Text|        Sentiment|          Timestamp|                User|   Platform|            Hashtags|Retweets|Likes|       Country|Year|Month|Day|Hour|
+---+----------+--------------------+----