### Laboratorium 1

1. Utworzenie SparkSession i prostego DataFrame

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, desc, rank
from pyspark.sql.window import Window

import os
import sys
# Point both the PySpark driver and its Python workers at the interpreter that
# runs this kernel, so they cannot end up on different Python installations.
current_python = sys.executable
os.environ['PYSPARK_PYTHON'] = current_python
os.environ['PYSPARK_DRIVER_PYTHON'] = current_python

# Machine-local Hadoop location (Windows layout).
# NOTE(review): hard-coded path; adjust if Hadoop lives elsewhere on this host.
hadoop_home = "C:\\hadoop"
os.environ['HADOOP_HOME'] = hadoop_home

# Executables and native libraries are resolved through the OS search path
# (PATH). sys.path only controls Python module imports, so appending the
# Hadoop bin directory there has no effect on how Hadoop binaries are found.
hadoop_bin = hadoop_home + "\\bin"
current_path = os.environ.get('PATH', '')
# Skip the append when the entry is already present, so re-running the cell
# does not keep growing PATH.
if hadoop_bin not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + hadoop_bin if current_path else hadoop_bin
    )

# Start a Spark session for this lab, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Lab1")
    .getOrCreate()
)

# Sample employee records: one dict per row, dict keys become column names.
data = [
    dict(imie="Adam", wiek=30, dzial="Finanse"),
    dict(imie="Ewa", wiek=26, dzial="IT"),
    dict(imie="Damian", wiek=25, dzial="PR"),
    dict(imie="Mariusz", wiek=35, dzial="IT"),
]
df = spark.createDataFrame(data)
df.show()


+-------+-------+----+
|  dzial|   imie|wiek|
+-------+-------+----+
|Finanse|   Adam|  30|
|     IT|    Ewa|  26|
|     PR| Damian|  25|
|     IT|Mariusz|  35|
+-------+-------+----+



2. Wczytanie danych z pliku CSV

In [17]:
# Read the employee CSV: the first line holds the column names and the column
# types are inferred from the data instead of defaulting to strings.
df_csv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("pracownicy.csv")
)
df_csv.show(5)

+-----+---+----------+
| name|age|department|
+-----+---+----------+
|  Jan| 28|        IT|
| Anna| 34|        HR|
|Piotr| 22|   Finance|
|  Ewa| 45|        IT|
|Marek| 31|        HR|
+-----+---+----------+
only showing top 5 rows



3. Filtrowanie i wybór kolumn

In [18]:
# Keep only the name and age columns, then the rows with age strictly above 30.
df_filtered = (
    df_csv
    .select("name", "age")
    .where(col("age") > 30)
)
df_filtered.show()

+-----+---+
| name|age|
+-----+---+
| Anna| 34|
|  Ewa| 45|
|Marek| 31|
+-----+---+



4. Sortowanie i agregacje

In [20]:
# Employees ordered from oldest to youngest.
df_sorted = df_csv.orderBy(desc("age"))
df_sorted.show()

# Mean age per department, exposed under the column name "avg_age".
mean_age = avg("age").alias("avg_age")
df_grouped = df_csv.groupBy("department").agg(mean_age)
df_grouped.show()

+------+---+----------+
|  name|age|department|
+------+---+----------+
|   Ewa| 45|        IT|
|  Anna| 34|        HR|
| Marek| 31|        HR|
|Joanna| 29|   Finance|
|   Jan| 28|        IT|
| Piotr| 22|   Finance|
+------+---+----------+

+----------+-------+
|department|avg_age|
+----------+-------+
|        HR|   32.5|
|   Finance|   25.5|
|        IT|   36.5|
+----------+-------+



5. Dodanie i usunięcie kolumn

In [21]:
# Derive a new column holding each employee's age five years from now.
df_plus_5 = df_csv.withColumn("age_in_5_years", df_csv["age"] + 5)
df_plus_5.show()

# Same frame without the department column.
columns_to_remove = ["department"]
df_no_dept = df_plus_5.drop(*columns_to_remove)
df_no_dept.show()

+------+---+----------+--------------+
|  name|age|department|age_in_5_years|
+------+---+----------+--------------+
|   Jan| 28|        IT|            33|
|  Anna| 34|        HR|            39|
| Piotr| 22|   Finance|            27|
|   Ewa| 45|        IT|            50|
| Marek| 31|        HR|            36|
|Joanna| 29|   Finance|            34|
+------+---+----------+--------------+

+------+---+--------------+
|  name|age|age_in_5_years|
+------+---+--------------+
|   Jan| 28|            33|
|  Anna| 34|            39|
| Piotr| 22|            27|
|   Ewa| 45|            50|
| Marek| 31|            36|
|Joanna| 29|            34|
+------+---+--------------+



6. Łączenie dwóch DataFrame

In [23]:
# Load the per-employee project counts (header row present, types inferred).
df_csv_projects = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("projekty.csv")
)
df_csv_projects.show()

# Inner join on the shared "name" column: only employees present in both
# frames are kept, and "name" appears once in the result.
df_joined = df_csv.join(df_csv_projects, ["name"], "inner")
df_joined.show()

+------+--------+
|  name|projects|
+------+--------+
|   Jan|       5|
|  Anna|       3|
| Piotr|       4|
|   Ewa|       7|
| Marek|       2|
|Joanna|       6|
+------+--------+

+------+---+----------+--------+
|  name|age|department|projects|
+------+---+----------+--------+
|   Jan| 28|        IT|       5|
|  Anna| 34|        HR|       3|
| Piotr| 22|   Finance|       4|
|   Ewa| 45|        IT|       7|
| Marek| 31|        HR|       2|
|Joanna| 29|   Finance|       6|
+------+---+----------+--------+



7. Obsługa brakujących wartości

In [26]:
# Three sample rows, each missing a different field, to exercise null handling.
data_nulls = [
    ("Kamil", None, "IT"),
    (None, 25, "HR"),
    ("Basia", 30, None)
]
df_with_nulls = spark.createDataFrame(data_nulls, ["name", "age", "department"])
df_with_nulls.show()

# Strategy 1: replace nulls with a per-column placeholder value.
df_filled = df_with_nulls.na.fill({
    "age": 0,
    "name": "Brak",
    "department": "Brak"
})
df_filled.show()

# Strategy 2: discard every row that contains at least one null.
# Each sample row above has a null, so the displayed result is empty.
df_dropped_nulls = df_with_nulls.na.drop()
df_dropped_nulls.show()

+-----+----+----------+
| name| age|department|
+-----+----+----------+
|Kamil|NULL|        IT|
| NULL|  25|        HR|
|Basia|  30|      NULL|
+-----+----+----------+

+-----+---+----------+
| name|age|department|
+-----+---+----------+
|Kamil|  0|        IT|
| Brak| 25|        HR|
|Basia| 30|      Brak|
+-----+---+----------+



8. Operacje na oknach (Window Functions)

In [28]:
# One global window over all rows, ordered by project count from highest to
# lowest. There is no partitionBy, so the ranking spans the whole frame.
windowSpec = Window.orderBy(desc("projects"))

# rank() gives tied rows the same position and leaves gaps after ties.
project_rank = rank().over(windowSpec)
df_ranked = df_joined.withColumn("rank", project_rank)
df_ranked.show()

+------+---+----------+--------+----+
|  name|age|department|projects|rank|
+------+---+----------+--------+----+
|   Ewa| 45|        IT|       7|   1|
|Joanna| 29|   Finance|       6|   2|
|   Jan| 28|        IT|       5|   3|
| Piotr| 22|   Finance|       4|   4|
|  Anna| 34|        HR|       3|   5|
| Marek| 31|        HR|       2|   6|
+------+---+----------+--------+----+



9. Operacje na RDD - klasyczne liczenie słów

In [36]:
# Classic word count on an RDD of text lines.
rdd = spark.sparkContext.textFile("tekst_do_liczenia.txt")

# split() with no argument splits on any run of whitespace and drops empty
# tokens. split(" ") would emit '' for blank lines and repeated spaces, and
# would not split on tabs, so '' could be counted as a word.
counts = (
    rdd.flatMap(lambda line: line.split())
       .map(lambda word: (word, 1))              # pair each word with a count of 1
       .reduceByKey(lambda a, b: a + b)          # sum the counts per distinct word
       .takeOrdered(10, key=lambda x: -x[1])     # 10 most frequent words, returned to the driver
)

print(counts)

[('To', 3), ('jest', 2), ('do', 2), ('jak', 2), ('tekst', 2), ('przykładowy', 1), ('liczenia', 1), ('słów.', 1), ('ćwiczenie', 1), ('działa', 1)]


10. Zapisywanie wyników do pliku

In [37]:
# Persist the ranked frame as CSV and as JSON. Spark writes each target as a
# directory of part files, not as a single file.
# The default save mode raises an error when the target path already exists,
# which makes this cell fail on every re-run. "overwrite" replaces the
# previous output instead.
df_ranked.write.mode("overwrite").csv("wyniki.csv", header=True)
df_ranked.write.mode("overwrite").json("wyniki.json")