In [1]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import col, year, sum
from pyspark.sql.dataframe import DataFrame
import numpy as np

# Supponiamo che SparkSession sia già stato creato
ss = SparkSession.builder.appName("PoliSalesAnalysis").getOrCreate()

# Variabili per i percorsi di input e output
# Percorsi dei file di input e output
jupyter = False
if jupyter:
    input_prefix = "/user/s339450/esami/20240912/"
    output_prefix= "/user/s339450/esami/20240912/out/"
else:
    input_prefix = ".\\data\\"
    output_prefix= ".\\out\\"

job_contracts_path = f"{input_prefix}JobContracts.txt"
job_offers_path = f"{input_prefix}JobOffers.txt"
job_postings_path = f"{input_prefix}JobPostings.txt"
output_folder_1 = f"{output_prefix}1/"
output_folder_2 = f"{output_prefix}2/"

In [2]:
job_contracts_schema = StructType([
    StructField("ContractID", IntegerType(), False),
    StructField("OfferId", IntegerType(), False),
    StructField("ContractDate", DateType(), False),
    StructField("ContractType", StringType(), False),
])

job_contracts: DataFrame = ss.read.load(job_contracts_path,
    format="csv",
    header=False,
    schema=job_contracts_schema)

job_contracts.show()

job_offers_schema = StructType([
    StructField("OfferID", IntegerType(), False),
    StructField("JobId", IntegerType(), False),
    StructField("Salary", IntegerType(), False),
    StructField("Status", StringType(), False),
    StructField("SSN", StringType(), False)
])

job_offers: DataFrame = ss.read.load(job_offers_path,
    format="csv",
    header=False,
    schema=job_offers_schema)

job_offers.show()

job_postings_schema = StructType([
    StructField("JobID", IntegerType(), False),
    StructField("Title", StringType(), False),
    StructField("Country", StringType(), False)
])

job_postings: DataFrame = ss.read.load(job_postings_path,
    format="csv",
    header=False,
    schema=job_postings_schema)

job_postings.show()

+----------+-------+------------+------------+
|ContractID|OfferId|ContractDate|ContractType|
+----------+-------+------------+------------+
|       201|    101|  2023-01-15|   Full-time|
|       202|    103|  2023-02-01|   Part-time|
|       203|    114|  2023-03-10|   Full-time|
|       204|    115|  2023-04-20|  Internship|
|       205|    116|  2023-05-05|   Full-time|
|       206|    117|  2023-06-15|  Contractor|
+----------+-------+------------+------------+

+-------+-----+------+--------+-----------+
|OfferID|JobId|Salary|  Status|        SSN|
+-------+-----+------+--------+-----------+
|    101|    1| 97000|Accepted|800-11-2222|
|    102|    2|120000|Rejected|800-11-2223|
|    103|    3|120000|Accepted|800-12-2222|
|    104|    5|120000|Rejected|801-21-3222|
|    105|    5|120000|Rejected|800-41-2232|
|    106|    4|120000|Rejected|800-14-2422|
|    107|    3|120000|Rejected|800-17-2252|
|    114|    5|120000|Accepted|800-51-2222|
|    115|    1|120000|Accepted|800-51-2622|
|

# Punto 1

In [17]:
from pyspark.sql.functions import desc, asc, avg, count, expr, max

result1 = (
    job_offers
    .filter(col("Status") == "Accepted")
    .join(job_postings, "JobId")
    .groupBy(col("OfferId"))
    .agg(avg("Salary").alias("AverageSalary"))  # Assegna un alias a avg(Salary)
    .sort(desc("AverageSalary"))  # Ordina in base alla colonna con l'alias
)

result1.show(3)

+-------+-------------+
|OfferId|AverageSalary|
+-------+-------------+
|    115|     120000.0|
|    103|     120000.0|
|    117|     120000.0|
+-------+-------------+
only showing top 3 rows



# Punto 2

In [23]:
from pyspark.sql import Window

tmp = (
    job_offers
    .filter(col("Status") == "Accepted")
    .join(job_postings, "JobId")
    .groupBy(col("Title"), col("Country"))
    .agg(count(expr("*")))
    
)
tmp.show()

tmp_with_max = tmp.withColumn("max_count", max("count(1)").over(Window.partitionBy("Country")))
tmp_with_max.show()

result = tmp_with_max.filter(col("max_count") == col("count(1)"))
result.show()

+-----------------+-------+--------+
|            Title|Country|count(1)|
+-----------------+-------+--------+
|Software Engineer|     IT|       2|
|Software Engineer|     ES|       1|
|   Data Scientist|     IT|       3|
+-----------------+-------+--------+

+-----------------+-------+--------+---------+
|            Title|Country|count(1)|max_count|
+-----------------+-------+--------+---------+
|Software Engineer|     ES|       1|        1|
|Software Engineer|     IT|       2|        3|
|   Data Scientist|     IT|       3|        3|
+-----------------+-------+--------+---------+

+-----------------+-------+--------+---------+
|            Title|Country|count(1)|max_count|
+-----------------+-------+--------+---------+
|Software Engineer|     ES|       1|        1|
|   Data Scientist|     IT|       3|        3|
+-----------------+-------+--------+---------+



: 