# In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import avg, year
from pyspark.sql.functions import desc
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Load the CSV at /covid.csv, attach the mean GDP per capita of each
# (location, year) group to every row, then fit and evaluate a linear
# regression that predicts that mean from the feature columns listed below.
# ---------------------------------------------------------------------------
spark = SparkSession.builder.appName("GDPAnalysis").getOrCreate()

# Column roles are named explicitly rather than recovered by slicing a list
# at a magic index, so reordering `selected_columns` cannot silently change
# what the model is trained on.
label_column = "avg_gdp_per_capita"
feature_columns = [
    "total_cases",
    "total_deaths",
    "stringency_index",
    "population",
    "human_development_index",
]
selected_columns = ["location", "year", label_column, *feature_columns]

df = spark.read.csv("/covid.csv", header=True, inferSchema=True)

# Only require non-null values in the columns this analysis actually reads.
# A bare na.drop() removes a row when *any* column is null, which throws away
# usable rows whenever the file carries additional, sparsely filled columns.
# The feature columns must still be null-free: VectorAssembler raises on nulls
# with its default handleInvalid="error".
required_columns = ["location", "date", "gdp_per_capita", *feature_columns]
df = df.dropDuplicates().na.drop(subset=required_columns)
df = df.withColumn("year", year(df.date))

# Mean GDP per capita for every (location, year) pair, joined back so each
# original row carries its group's mean as the regression label.
avg_gdp_df = df.groupBy("location", "year").agg(
    avg("gdp_per_capita").alias(label_column)
)
df_with_avg_gdp = df.join(avg_gdp_df, ["location", "year"])

# Pack the numeric predictors into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_transformed = assembler.transform(df_with_avg_gdp)

# A fixed seed makes the 80/20 split (and therefore the reported RMSE)
# repeatable across runs on the same input and partitioning.
(trainingData, testData) = df_transformed.randomSplit([0.8, 0.2], seed=42)

lr = LinearRegression(featuresCol="features", labelCol=label_column)
model = lr.fit(trainingData)
predictions = model.transform(testData)

# RMSE is computed against the evaluator's default "prediction" column,
# which is what LinearRegressionModel.transform() produces.
evaluator = RegressionEvaluator(labelCol=label_column, metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Square Error (RMSE):", rmse)
# ---------------------------------------------------------------------------
# Rank countries by average GDP per capita and plot the ten highest.
# ---------------------------------------------------------------------------
# Collapse to exactly one value per country with an aggregate. The previous
# dropDuplicates(["location"]) kept an arbitrary (location, year) row, so the
# plotted value could differ between runs. Every row already carries its
# year's mean, so averaging that column over a country's rows gives one
# deterministic figure per location.
# Sort descending: the chart is titled "Highest", and an ascending sort
# followed by limit(10) returns the ten *lowest* values instead.
top_10_countries = (
    df_with_avg_gdp.groupBy("location")
    .agg(avg("avg_gdp_per_capita").alias("avg_gdp_per_capita"))
    .orderBy(desc("avg_gdp_per_capita"))
    .limit(10)
)

# Ten rows at most, so collecting to the driver as pandas is cheap.
top_10_countries_pd = top_10_countries.toPandas()

plt.figure(figsize=(10, 6))
plt.bar(top_10_countries_pd["location"], top_10_countries_pd["avg_gdp_per_capita"])
plt.xlabel("Country")
plt.ylabel("Average GDP per Capita")
plt.title("Top 10 Unique Countries with Highest Average GDP per Capita")
plt.xticks(rotation=45)  # country names overlap when drawn horizontally
plt.show()