In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, trim, lower, upper, mean, median, count, lit, split, concat_ws, year, month, dayofweek
import pandas as pd
import numpy as np
import findspark

In [2]:
# Initialise findspark (presumably so the local Spark installation can be located — verify it is still needed).
# NOTE(review): the first cell imports pyspark before this call runs, so that import already
# succeeded without findspark's help; if findspark is required, this call should come first.
findspark.init()

In [3]:
# Obtain the Spark session for this notebook: getOrCreate() hands back an
# existing session when there is one, otherwise it builds a new one that is
# registered under the application name given here.
spark = (
    SparkSession.builder
    .appName("PreprocessingFeatureEngineering")
    .getOrCreate()
)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 63582)
Traceback (most recent call last):
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "C:\Users\Robyi\AppData\Local\Programs\Python\Python311\Lib

In [4]:
# Bare expression as the last line of the cell: displays the session object's notebook representation.
spark

In [6]:
# Column names, listed in the same order as the fields of every tuple in `data`.
columns = ["name", "age", "city", "salary", "gender", "date", "response", "experience", "rating", "score"]

# Hand-written sample records. None marks a missing value; at least one is
# present in every column except gender, and the last record repeats the first
# one so the sample also contains a duplicate.
data = [
    ("John Doe", 25, "New York", 50000, "Male", "2024-01-15", "yes", 3, 0.5, 1.5),
    ("Jane Smith", 30, "Los Angeles", None, "Female", "2023-05-20", "no", 5, None, 3.1),
    (None, None, "San Francisco", 70000, "Other", None, "no", 2, 1.2, 2.0),
    ("Michael Johnson", 40, None, 80000, "Male", "2022-08-10", None, None, 0.8, None),
    ("Sara Connor", 50, "Chicago", 90000, "Female", "2019-12-25", "yes", 7, 2.5, 4.0),
    ("John Doe", 25, "New York", 50000, "Male", "2024-01-15", "yes", 3, 0.5, 1.5),  # Duplicate of the first row
]

# Only column names are supplied, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

In [7]:
# Print the freshly created DataFrame to see the raw values and where the nulls are.
df.show()

+---------------+----+-------------+------+------+----------+--------+----------+------+-----+
|           name| age|         city|salary|gender|      date|response|experience|rating|score|
+---------------+----+-------------+------+------+----------+--------+----------+------+-----+
|       John Doe|  25|     New York| 50000|  Male|2024-01-15|     yes|         3|   0.5|  1.5|
|     Jane Smith|  30|  Los Angeles|  null|Female|2023-05-20|      no|         5|  null|  3.1|
|           null|null|San Francisco| 70000| Other|      null|      no|         2|   1.2|  2.0|
|Michael Johnson|  40|         null| 80000|  Male|2022-08-10|    null|      null|   0.8| null|
|    Sara Connor|  50|      Chicago| 90000|Female|2019-12-25|     yes|         7|   2.5|  4.0|
|       John Doe|  25|     New York| 50000|  Male|2024-01-15|     yes|         3|   0.5|  1.5|
+---------------+----+-------------+------+------+----------+--------+----------+------+-----+



In [8]:
# Placeholder label per text column; nulls in these columns are replaced by the
# label, every other column is left untouched.
categorical_defaults = {
    "name": "Unknown",
    "city": "Unknown",
    "gender": "Other",
    "response": "Unknown",
}
df = df.na.fill(categorical_defaults)

In [10]:
# Bring the Spark rows to the driver and rebuild them as a pandas DataFrame so
# the numeric gaps can be imputed with pandas.
# collect() materialises every row locally, which is fine for this six-row sample.
pandas_df = df.collect()  # list of Row objects, not yet a pandas object
# Take the column names from the Spark schema instead of repeating them by hand,
# so the pandas frame cannot drift out of sync with `df`.
columns = df.columns
df_pandas = pd.DataFrame(pandas_df, columns=columns)

In [11]:
# Impute the missing numeric values: median for age, experience and score,
# mean for salary and rating. Each statistic is computed from the column's
# non-missing values before anything is filled.
# NOTE(review): the repeated sample record has not been dropped at this point,
# so it is counted twice in these statistics — confirm that is intended.
impute_values = {
    "age": df_pandas["age"].median(),
    "salary": df_pandas["salary"].mean(),
    "experience": df_pandas["experience"].median(),
    "rating": df_pandas["rating"].mean(),
    "score": df_pandas["score"].median(),
}
# Assign the filled frame back instead of calling
# df_pandas[col].fillna(..., inplace=True): that form mutates a temporary
# column selection, which pandas warns about and which leaves df_pandas
# unchanged once Copy-on-Write is active.
df_pandas = df_pandas.fillna(value=impute_values)

In [12]:
# Hand the imputed pandas frame back to Spark; `df` now refers to the imputed data.
df = spark.createDataFrame(data=df_pandas)

In [13]:
# Missing dates get the sentinel "2000-01-01" while the column is still text;
# only afterwards is the column converted from string to a real date type.
df = (
    df.na.fill({"date": "2000-01-01"})
    .withColumn("date", col("date").cast("date"))
)

In [14]:
from pyspark.ml.feature import StringIndexer

In [15]:
# One label indexer per categorical column; each writes its numeric codes to a
# new "<column>_index" column and leaves the original text column in place.
indexer_gender = StringIndexer(outputCol="gender_index", inputCol="gender")
indexer_response = StringIndexer(outputCol="response_index", inputCol="response")

# Fit each indexer on the current frame and apply it straight away, gender first.
for indexer in (indexer_gender, indexer_response):
    df = indexer.fit(df).transform(df)

In [16]:
# Inspect the frame after imputation, date casting and label indexing.
df.show()

+---------------+----+-------------+-------+------+----------+--------+----------+------+-----+------------+--------------+
|           name| age|         city| salary|gender|      date|response|experience|rating|score|gender_index|response_index|
+---------------+----+-------------+-------+------+----------+--------+----------+------+-----+------------+--------------+
|       John Doe|25.0|     New York|50000.0|  Male|2024-01-15|     yes|       3.0|   0.5|  1.5|         0.0|           0.0|
|     Jane Smith|30.0|  Los Angeles|68000.0|Female|2023-05-20|      no|       5.0|   1.1|  3.1|         1.0|           1.0|
|        Unknown|30.0|San Francisco|70000.0| Other|2000-01-01|      no|       2.0|   1.2|  2.0|         2.0|           1.0|
|Michael Johnson|40.0|      Unknown|80000.0|  Male|2022-08-10| Unknown|       3.0|   0.8|  2.0|         0.0|           2.0|
|    Sara Connor|50.0|      Chicago|90000.0|Female|2019-12-25|     yes|       7.0|   2.5|  4.0|         1.0|           0.0|
|       

In [17]:
# Bring the indexed Spark rows to the driver and rebuild them as a pandas
# DataFrame so the categorical columns can be one-hot encoded with pandas.
pandas_df = df.collect()  # list of Row objects, not yet a pandas object
# Take the column names from the Spark schema instead of a hand-maintained list,
# so newly added columns (such as the *_index ones) are always picked up.
columns = df.columns
df_pandas = pd.DataFrame(pandas_df, columns=columns)

In [18]:
# One-hot encode gender and response. The two source columns are replaced by
# indicator columns, and drop_first=True leaves out the first level of each.
df_pandas = pd.get_dummies(
    data=df_pandas,
    columns=["gender", "response"],
    drop_first=True,
)

In [19]:
# Hand the one-hot encoded pandas frame back to Spark for the remaining steps.
df = spark.createDataFrame(data=df_pandas)

In [20]:
# Inspect the frame now that the indicator columns have replaced gender and response.
df.show()

+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+
|           name| age|         city| salary|      date|experience|rating|score|gender_index|response_index|gender_Male|gender_Other|response_no|response_yes|
+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+
|       John Doe|25.0|     New York|50000.0|2024-01-15|       3.0|   0.5|  1.5|         0.0|           0.0|       true|       false|      false|        true|
|     Jane Smith|30.0|  Los Angeles|68000.0|2023-05-20|       5.0|   1.1|  3.1|         1.0|           1.0|      false|       false|       true|       false|
|        Unknown|30.0|San Francisco|70000.0|2000-01-01|       2.0|   1.2|  2.0|         2.0|           1.0|      false|        true|       true|       false|
|Michael Johnson|40.0|      Unknown|80000.0|2022-08-

In [21]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

In [22]:
# Pack the four numeric columns into one vector column, which is the input
# format the scalers below are configured to read.
assembler = VectorAssembler(
    outputCol="features_vector",
    inputCols=["salary", "experience", "rating", "score"],
)
df = assembler.transform(df)

In [23]:
# Min-max scaling of the assembled vector; the result is stored next to the
# unscaled vector in "features_scaled".
scaler = MinMaxScaler(
    outputCol="features_scaled",
    inputCol="features_vector",
)
scaler_model = scaler.fit(df)  # learns the scaling statistics from the current frame
df = scaler_model.transform(df)

In [24]:
from pyspark.ml.feature import StandardScaler

In [25]:
# Standardise the assembled vector, with both centring (withMean) and scaling
# (withStd) switched on; the result goes to "features_standardized".
scaler_std = StandardScaler(
    withMean=True,
    withStd=True,
    outputCol="features_standardized",
    inputCol="features_vector",
)
scaler_std_model = scaler_std.fit(df)  # statistics are learned from the current frame
df = scaler_std_model.transform(df)

In [26]:
# Split the date column into calendar parts, each in its own column.
date_col = col("date")
df = (
    df.withColumn("year", year(date_col))
    .withColumn("month", month(date_col))
    .withColumn("day_of_week", dayofweek(date_col))
)

In [27]:
# Interaction features built from pairs of numeric columns.
experience_times_score = col("experience") * col("score")
# The +1 in the denominator keeps the ratio defined when rating is 0.
salary_per_rating = col("salary") / (col("rating") + 1)

df = (
    df.withColumn("experience_score", experience_times_score)
    .withColumn("salary_rating_ratio", salary_per_rating)
)

In [28]:
# Polynomial features: the square of experience and the cube of score.
df = (
    df.withColumn("experience_squared", col("experience") ** 2)
    .withColumn("score_cubed", col("score") ** 3)
)

In [29]:
from pyspark.ml.feature import Bucketizer

In [30]:
# Age boundaries for four buckets: 0-30, 30-40, 40-50 and 50 upwards (the
# infinite last boundary leaves the top bucket open-ended).
# NOTE(review): which bucket an age exactly on a boundary (30, 40, 50) lands in
# is decided by Bucketizer's interval convention — confirm against the Spark docs.
splits = [0, 30, 40, 50, float("inf")]
bucketizer = Bucketizer(
    inputCol="age",
    outputCol="age_group",
    splits=splits,
)

In [31]:
# Apply the configured bucketizer: reads "age" and adds the "age_group" column.
df = bucketizer.transform(df)

In [32]:
# Inspect the frame with the scaled vectors, date parts, derived features and age groups added.
df.show()

+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+--------------------+--------------------+---------------------+----+-----+-----------+----------------+-------------------+------------------+------------------+---------+
|           name| age|         city| salary|      date|experience|rating|score|gender_index|response_index|gender_Male|gender_Other|response_no|response_yes|     features_vector|     features_scaled|features_standardized|year|month|day_of_week|experience_score|salary_rating_ratio|experience_squared|       score_cubed|age_group|
+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+--------------------+--------------------+---------------------+----+-----+-----------+----------------+-------------------+------------------+------------------+---------+
|       Jo

In [34]:
from pyspark.ml.feature import PCA

In [35]:
# Reduce the four numeric features to two principal components.
# PCA is driven by variance, so it is sensitive to the scale of its inputs: in
# the raw "features_vector" salary is in the tens of thousands while experience,
# rating and score are single digits, which lets salary alone determine the
# components. Fit on the standardised vector (zero mean, unit variance per
# feature) produced by the StandardScaler step instead.
pca = PCA(k=2, inputCol="features_standardized", outputCol="pca_features")
pca_model = pca.fit(df)
df = pca_model.transform(df)

In [36]:
# Final inspection: print the rows, then the schema to check the name and type of every column.
df.show()
df.printSchema()

+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+--------------------+--------------------+---------------------+----+-----+-----------+----------------+-------------------+------------------+------------------+---------+--------------------+
|           name| age|         city| salary|      date|experience|rating|score|gender_index|response_index|gender_Male|gender_Other|response_no|response_yes|     features_vector|     features_scaled|features_standardized|year|month|day_of_week|experience_score|salary_rating_ratio|experience_squared|       score_cubed|age_group|        pca_features|
+---------------+----+-------------+-------+----------+----------+------+-----+------------+--------------+-----------+------------+-----------+------------+--------------------+--------------------+---------------------+----+-----+-----------+----------------+-------------------+-----------------