In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
spark = SparkSession.builder \
    .appName("Gold Arch") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .config("hive.metastore.uris", "thrift://localhost:9083") \
    .enableHiveSupport() \
    .getOrCreate()

25/04/06 00:22:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
path ="hdfs:///data/sliver/Fact_Table_parquet"
df = spark.read.parquet(path)

                                                                                

In [5]:
df.printSchema()

root
 |-- Edition ID: integer (nullable = true)
 |-- Country NOC: string (nullable = true)
 |-- Result ID: integer (nullable = true)
 |-- Athlete ID: integer (nullable = true)
 |-- Position: string (nullable = true)
 |-- Medal Athlete: string (nullable = true)
 |-- Gold Medals: integer (nullable = true)
 |-- Silver Medals: integer (nullable = true)
 |-- Bronze Medals: integer (nullable = true)
 |-- Total Medals: integer (nullable = true)



In [13]:
df.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+
|Edition ID|Country NOC|Result ID|Athlete ID|Position|Medal Athlete|Gold Medals|Silver Medals|Bronze Medals|Total Medals|
+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+
|        23|        KOR|    40745|      1971|       2|       Silver|         12|            5|           12|          29|
|        23|        KOR|    40745|      1972|      15|     No Medal|         12|            5|           12|          29|
|        23|        KOR|    40745|      1977|      20|     No Medal|         12|            5|           12|          29|
|        23|        KOR|    40788|      1972|       5|     No Medal|         12|            5|           12|          29|
|        23|        KOR|    40788|      1971|       5|     No Medal|         12|            5|           12|          29|
+----------+-----------+

                                                                                

In [7]:
df_unique_results = df.dropDuplicates(["Country NOC", "Edition ID"])

df_medals = df_unique_results.groupBy("Country NOC", "Edition ID").agg(
    sum("Gold Medals").alias("total_gold"),
    sum("Silver Medals").alias("total_silver"),
    sum("Bronze Medals").alias("total_bronze")
)
df_medals = df_medals.withColumn("total_medals", expr("total_gold + total_silver + total_bronze"))

# عرض النتائج
df_medals.show()

# حفظ النتائج في HDFS (الطبقة الذهبية)
# df_medals.write.mode("overwrite").parquet("hdfs://path_to_gold/medals_per_country_edition")

# customer_churn90.createOrReplaceTempView("customer_churn90")



[Stage 2:>                                                          (0 + 4) / 4]

+-----------+----------+----------+------------+------------+------------+
|Country NOC|Edition ID|total_gold|total_silver|total_bronze|total_medals|
+-----------+----------+----------+------------+------------+------------+
|        NED|        40|         1|           2|           3|           6|
|        FRA|        41|         0|           0|           1|           1|
|        URS|        17|        29|          32|          30|          91|
|        ROU|        11|         0|           1|           0|           1|
|        GBR|        29|         1|           1|           2|           4|
|        ITA|        18|         5|           3|          10|          18|
|        ITA|        22|         6|           4|           4|          14|
|        FRG|        39|         3|           1|           1|           5|
|        CHN|        59|        26|          18|          26|          70|
|        GBR|        37|         1|           0|           0|           1|
|        FRA|         9| 

                                                                                

In [8]:
df_medals.count()

1785

In [9]:
df_medals.printSchema()

root
 |-- Country NOC: string (nullable = true)
 |-- Edition ID: integer (nullable = true)
 |-- total_gold: long (nullable = true)
 |-- total_silver: long (nullable = true)
 |-- total_bronze: long (nullable = true)
 |-- total_medals: long (nullable = true)



In [10]:
path ="hdfs:///data/sliver/Athlete_parquet"
df_bio = spark.read.parquet(path)

In [11]:
df_bio.printSchema()

root
 |-- Athlete ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- Born: date (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Nationality: string (nullable = true)



In [12]:
df_bio = df_bio.select(
    col("Athlete ID").alias("bio_athlete_id"),
    "Name",
    "Sex",
    "Born",
    "Height",
    "Weight",
    "Nationality"
)

In [13]:
df_athletes_with_medals = df.join(df_bio, df["Athlete ID"] == df_bio["bio_athlete_id"], "left_outer") 

In [14]:
df_athletes_with_medals.show(2)

                                                                                

+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+--------------+--------------+---+----------+------+------+------------------+
|Edition ID|Country NOC|Result ID|Athlete ID|Position|Medal Athlete|Gold Medals|Silver Medals|Bronze Medals|Total Medals|bio_athlete_id|          Name|Sex|      Born|Height|Weight|       Nationality|
+----------+-----------+---------+----------+--------+-------------+-----------+-------------+-------------+------------+--------------+--------------+---+----------+------+------+------------------+
|        23|        KOR|    40745|      1971|       2|       Silver|         12|            5|           12|          29|          1971|Jeong Jae-Heon|  1|1974-06-01|   176|    71| Republic of Korea|
|        23|        KOR|    40745|      1972|      15|     No Medal|         12|            5|           12|          29|          1972| Han Seung-Hun|  1|1973-06-11|   171|    62| Republic of Korea|


In [15]:
df_stats =df_athletes_with_medals.groupBy("Country NOC", "Edition ID").agg(
    count("Athlete ID").alias("total_athletes"),  # عدد الرياضيين
    avg("Height").alias("avg_height"),  # متوسط الطول
    avg("Weight").alias("avg_weight"),  # متوسط الوزن
    sum(when(df_bio["Sex"] == 1, 1).otherwise(0)).alias("male_athletes"), 
    sum(when(df_bio["Sex"] == 0, 1).otherwise(0)).alias("female_athletes")
)
df_stats.show()



+-----------+----------+--------------+------------------+-----------------+-------------+---------------+
|Country NOC|Edition ID|total_athletes|        avg_height|       avg_weight|male_athletes|female_athletes|
+-----------+----------+--------------+------------------+-----------------+-------------+---------------+
|        NED|        40|            26|178.26923076923077|73.15384615384616|           13|             13|
|        FRA|        41|            44|171.52272727272728|64.95454545454545|           34|             10|
|        URS|        17|           501| 175.3692614770459|72.00998003992017|          368|            133|
|        ROU|        11|           128|         176.03125|           71.125|          126|              2|
|        GBR|        29|            87|176.73684210526315|71.63157894736842|           72|              4|
|        ITA|        18|           367| 173.9891008174387|69.21525885558583|          292|             75|
|        ITA|        22|           38

                                                                                

In [16]:
df_stats = df_stats.select(
    col("Country NOC").alias("NOC"),
    col("Edition ID").alias("edition_id"),
    "total_athletes",
    "avg_height",
    "avg_weight",
    "male_athletes",
    "female_athletes"
)

In [17]:
df_final = df_stats.join(df_medals, (df_stats["edition_id"] == df_medals["Edition ID"]) & (df_stats["NOC"] == df_medals["Country NOC"])) 

In [18]:
df_final.printSchema()

root
 |-- NOC: string (nullable = true)
 |-- edition_id: integer (nullable = true)
 |-- total_athletes: long (nullable = false)
 |-- avg_height: double (nullable = true)
 |-- avg_weight: double (nullable = true)
 |-- male_athletes: long (nullable = true)
 |-- female_athletes: long (nullable = true)
 |-- Country NOC: string (nullable = true)
 |-- Edition ID: integer (nullable = true)
 |-- total_gold: long (nullable = true)
 |-- total_silver: long (nullable = true)
 |-- total_bronze: long (nullable = true)
 |-- total_medals: long (nullable = true)



In [26]:
df_final = df_final.select(
    col("Country NOC").alias("Country NOC"),
    col("edition_id").alias("Edition ID"),
    col("total_athletes").alias("Total Athletes"),
    col("avg_height").alias("Average Height"),
    col("avg_weight").alias("Average Weight"),
    col("male_athletes").alias("Male Athletes"),
    col("female_athletes").alias("Female Athletes"),
    col("total_gold").alias("Total Gold"),
    col("total_silver").alias("Total Silver"),
    col("total_bronze").alias("Total Bronze"),
    col("total_medals").alias("Total Medals")
)

df_final.show(5)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `edition_id` cannot be resolved. Did you mean one of the following? [`Edition ID`, `Total Gold`, `Total Bronze`, `Country NOC`, `Total Medals`].;
'Project [Country NOC#458 AS Country NOC#776, 'edition_id AS Edition ID#777, 'total_athletes AS Total Athletes#778, 'avg_height AS Average Height#779, 'avg_weight AS Average Weight#780, 'male_athletes AS Male Athletes#781, 'female_athletes AS Female Athletes#782, 'total_gold AS Total Gold#783, 'total_silver AS Total Silver#784, 'total_bronze AS Total Bronze#785, 'total_medals AS Total Medals#786]
+- Project [Country NOC#422 AS Country NOC#458, edition_id#413 AS Edition ID#459, total_athletes#332L AS Total Athletes#460L, avg_height#334 AS Average Height#461, avg_weight#336 AS Average Weight#462, male_athletes#338L AS Male Athletes#463L, female_athletes#340L AS Female Athletes#464L, total_gold#73L AS Total Gold#465L, total_silver#75L AS Total Silver#466L, total_bronze#77L AS Total Bronze#467L, total_medals#83L AS Total Medals#468L]
   +- Join Inner, ((edition_id#413 = Edition ID#421) AND (NOC#412 = Country NOC#422))
      :- Project [Country NOC#1 AS NOC#412, Edition ID#0 AS edition_id#413, total_athletes#332L, avg_height#334, avg_weight#336, male_athletes#338L, female_athletes#340L]
      :  +- Aggregate [Country NOC#1, Edition ID#0], [Country NOC#1, Edition ID#0, count(Athlete ID#3) AS total_athletes#332L, avg(Height#192) AS avg_height#334, avg(Weight#193) AS avg_weight#336, sum(CASE WHEN (Sex#190 = 1) THEN 1 ELSE 0 END) AS male_athletes#338L, sum(CASE WHEN (Sex#190 = 0) THEN 1 ELSE 0 END) AS female_athletes#340L]
      :     +- Join LeftOuter, (Athlete ID#3 = bio_athlete_id#202)
      :        :- Relation [Edition ID#0,Country NOC#1,Result ID#2,Athlete ID#3,Position#4,Medal Athlete#5,Gold Medals#6,Silver Medals#7,Bronze Medals#8,Total Medals#9] parquet
      :        +- Project [Athlete ID#188 AS bio_athlete_id#202, Name#189, Sex#190, Born#191, Height#192, Weight#193, Nationality#194]
      :           +- Relation [Athlete ID#188,Name#189,Sex#190,Born#191,Height#192,Weight#193,Nationality#194] parquet
      +- Project [Country NOC#422, Edition ID#421, total_gold#73L, total_silver#75L, total_bronze#77L, ((total_gold#73L + total_silver#75L) + total_bronze#77L) AS total_medals#83L]
         +- Aggregate [Country NOC#422, Edition ID#421], [Country NOC#422, Edition ID#421, sum(Gold Medals#427) AS total_gold#73L, sum(Silver Medals#428) AS total_silver#75L, sum(Bronze Medals#429) AS total_bronze#77L]
            +- Deduplicate [Country NOC#422, Edition ID#421]
               +- Relation [Edition ID#421,Country NOC#422,Result ID#423,Athlete ID#424,Position#425,Medal Athlete#426,Gold Medals#427,Silver Medals#428,Bronze Medals#429,Total Medals#430] parquet


In [20]:
df_final.write.mode("overwrite").parquet("hdfs:///data/gold/AnalysisCountryGames_Parquet")


                                                                                

In [23]:
# spark.sql("SET spark.sql.catalogImplementation=hive")
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [6]:
df.write.mode("overwrite").saveAsTable("AnalysisCountryGames_Table")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to fetch table analysiscountrygames_table. Invalid method name: 'get_table'

In [5]:
# Test the connection by listing databases
spark.sql("SHOW DATABASES").show()

25/04/06 00:23:09 WARN HiveConf: HiveConf of name hive.metastore.db.type does not exist


+---------+
|namespace|
+---------+
|  default|
+---------+



In [9]:
spark.sql("show namespaces;").show()

+---------+
|namespace|
+---------+
|  default|
|     gold|
+---------+



In [8]:
spark.sql("create database gold").show()

++
||
++
++



In [10]:
spark.sql("use gold")

DataFrame[]

In [12]:
df.write.format("hive").mode("overwrite").saveAsTable("gold.product_trends")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to fetch table product_trends. Invalid method name: 'get_table'