In [1]:
import os

# Export PYARROW_IGNORE_TIMEZONE=1 for this process before the Spark/pandas
# imports below run.
# NOTE(review): presumably this silences the PyArrow timezone warning — confirm.
os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [1]:
#Import libraries as needed
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, first, last, lag, lead, when
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Local Spark configuration: use every local core and a 4g driver heap.
conf = SparkConf().setAppName('yuck').setMaster("local[*]").set("spark.driver.memory", "4g")

# getOrCreate() makes this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# The builder reuses the context created above instead of wrapping it by hand.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
#SCALED VERSION WITH BIG DATA SET STARTS HERE

from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
import os
# Schema for the CSV files: a string 'date' column followed by one nullable
# double column per measured quantity, in this exact order.
_measurement_names = [
    'BEN', 'CO', 'EBE', 'MXY', 'NMHC', 'NO_2', 'NOx', 'OXY',
    'O_3', 'PM10', 'PM25', 'PXY', 'SO_2', 'TCH', 'TOL',
]
schema = StructType(
    [StructField('date', StringType(), True)]
    + [StructField(name, DoubleType(), True) for name in _measurement_names]
)

In [4]:
# Create a list of CSV files to merge
data_path = 'C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year'

# Only pick up regular *.csv files. The export cell at the end of this notebook
# writes its output directory inside data_path, so an unfiltered listing would
# feed the previous run's output back in as input on the next run.
# sorted() keeps the load order deterministic across runs.
csv_files = sorted(
    os.path.join(data_path, f)
    for f in os.listdir(data_path)
    if f.lower().endswith('.csv') and os.path.isfile(os.path.join(data_path, f))
)

In [17]:
# Read every file in csv_files with the explicit schema; the first line of
# each file is treated as a header.
csv_reader = spark.read.format('csv').schema(schema).option('header',True)
spark_df = csv_reader.load(csv_files)

In [18]:
# Parse the 'date' strings (format 'yyyy-MM-dd HH:mm:ss') into a timestamp
# column named 'unix_time', then remove the original 'date' column.
from pyspark.sql.functions import unix_timestamp, to_utc_timestamp
from pyspark.sql.types import TimestampType
epoch_seconds = unix_timestamp(spark_df.date ,'yyyy-MM-dd HH:mm:ss')
spark_df = spark_df.withColumn('unix_time', epoch_seconds.cast('timestamp'))
spark_df = spark_df.drop('date')

In [19]:
# Number the rows in 'unix_time' order.
# NOTE(review): the window has no partitionBy, so Spark will likely evaluate it
# on a single partition — confirm this is acceptable for the full data set.
window = Window.orderBy('unix_time')
row_index = row_number().over(window)
spark_df = spark_df.withColumn('row_num', row_index)

In [20]:
#FILL IN MISSING VALUES STARTS HERE
# For every column other than 'unix_time' and 'row_num', attach the value of
# the immediately preceding row (<name>_prev) and following row (<name>_next).
for col_name in spark_df.columns:
    if col_name in ("unix_time", "row_num"):
        continue
    spark_df = (
        spark_df
        .withColumn(f"{col_name}_prev", lag(col_name).over(window))
        .withColumn(f"{col_name}_next", lead(col_name).over(window))
    )

In [21]:
# Fill missing values for each measurement column with the midpoint of the
# nearest known value before and after the gap (in 'unix_time' order).
from pyspark.sql.functions import coalesce

# Explicit frames are required: with the default frame of an ordered window
# (start of data .. current row), first(..., ignorenulls=True) returns the first
# value of the whole series rather than the next known value after the gap.
_backward = window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
_forward = window.rowsBetween(Window.currentRow, Window.unboundedFollowing)

for col_name in spark_df.columns:
    # Skip bookkeeping columns and the *_prev / *_next helpers.
    if col_name in ("unix_time", "row_num") or col_name.endswith(("_prev", "_next")):
        continue
    prev_known = last(col_name, True).over(_backward)
    next_known = first(col_name, True).over(_forward)
    # coalesce() lets leading/trailing gaps (only one known neighbour) fall back
    # to that neighbour instead of staying null.
    midpoint = (coalesce(prev_known, next_known) + coalesce(next_known, prev_known)) / 2
    spark_df = spark_df.withColumn(
        col_name,
        when(col(col_name).isNull(), midpoint).otherwise(col(col_name)),
    )

In [22]:
# Remove every helper column whose name ends in "_prev" or "_next".
helper_columns = [name for name in spark_df.columns if name.endswith(("_prev", "_next"))]
if helper_columns:
    spark_df = spark_df.drop(*helper_columns)
#END FILL IN MISSING VALUES

In [24]:
#Outlier Handling and normalization in apache spark

from pyspark.ml.feature import RobustScaler
# Columns to clean/normalize: everything except the bookkeeping columns.
# Selecting by name (rather than slicing off the last two positions) keeps this
# correct even if 'row_num' was already dropped or helper columns are present.
pollutants = [
    name for name in spark_df.columns
    if name not in ("unix_time", "row_num") and not name.endswith(("_prev", "_next"))
]
# Maps pollutant name -> (lower_bound, upper_bound); filled in by the next cell.
outliers = {}


In [25]:
# Tukey fences (k=1.5) per pollutant, from approximate quartiles
# (relative error 0.05).
for pollutant in pollutants:
    quartiles = spark_df.approxQuantile(pollutant, [0.25, 0.75], 0.05)
    q1, q3 = quartiles[0], quartiles[1]
    spread = q3 - q1
    outliers[pollutant] = (q1 - 1.5 * spread, q3 + 1.5 * spread)

In [55]:
# Start the cleaned frame as a projection of every column of spark_df.
spark_df_clean = spark_df.select(*spark_df.columns)

In [56]:
# Replace outliers with null values
# Values inside [lower_bound, upper_bound] are kept; everything else becomes
# null (when() without otherwise() yields null).
# This is a single projection instead of rename/select/drop per column, and
# the former fillna(..., "null") is gone: a string fill value does not apply to
# double columns, so it never changed anything.
_passthrough = [name for name in spark_df_clean.columns if name not in pollutants]
_masked = [
    when(col(name).between(outliers[name][0], outliers[name][1]), col(name)).alias(name)
    for name in pollutants
]
# Column order matches the previous implementation: untouched columns first,
# then the pollutants in list order.
spark_df_clean = spark_df_clean.select(*_passthrough, *_masked)

In [57]:
#Interpolate->Fill null values of 'spark_df_clean'
# Re-number the rows of the cleaned frame in 'unix_time' order.
window_clean = Window.orderBy('unix_time')
row_index_clean = row_number().over(window_clean)
spark_df_clean = spark_df_clean.withColumn('row_num', row_index_clean)

In [58]:
# For every column other than 'unix_time' and 'row_num', attach the value of
# the immediately preceding row (<name>_prev) and following row (<name>_next).
for col_name in spark_df_clean.columns:
    if col_name in ("unix_time", "row_num"):
        continue
    spark_df_clean = (
        spark_df_clean
        .withColumn(f"{col_name}_prev", lag(col_name).over(window_clean))
        .withColumn(f"{col_name}_next", lead(col_name).over(window_clean))
    )

In [59]:
# Fill the nulls left by outlier masking with the midpoint of the nearest known
# value before and after the gap (in 'unix_time' order).
from pyspark.sql.functions import coalesce

# Explicit frames are required: with the default frame of an ordered window
# (start of data .. current row), first(..., ignorenulls=True) returns the first
# value of the whole series rather than the next known value after the gap.
_backward_clean = window_clean.rowsBetween(Window.unboundedPreceding, Window.currentRow)
_forward_clean = window_clean.rowsBetween(Window.currentRow, Window.unboundedFollowing)

for col_name in spark_df_clean.columns:
    # Skip bookkeeping columns and the *_prev / *_next helpers.
    if col_name in ("unix_time", "row_num") or col_name.endswith(("_prev", "_next")):
        continue
    prev_known = last(col_name, True).over(_backward_clean)
    next_known = first(col_name, True).over(_forward_clean)
    # coalesce() lets leading/trailing gaps (only one known neighbour) fall back
    # to that neighbour instead of staying null.
    midpoint = (coalesce(prev_known, next_known) + coalesce(next_known, prev_known)) / 2
    spark_df_clean = spark_df_clean.withColumn(
        col_name,
        when(col(col_name).isNull(), midpoint).otherwise(col(col_name)),
    )

In [60]:
# Remove every helper column whose name ends in "_prev" or "_next".
helper_columns_clean = [
    name for name in spark_df_clean.columns if name.endswith(("_prev", "_next"))
]
if helper_columns_clean:
    spark_df_clean = spark_df_clean.drop(*helper_columns_clean)


In [None]:
#Check if interpolation successful
from pyspark.sql.functions import isnan, when, count

# isnan() only accepts float/double input; applying it to the timestamp column
# 'unix_time' fails analysis, so NaN is only tested on floating-point columns.
_floating_cols = {
    name for name, dtype in spark_df_clean.dtypes if dtype in ("double", "float")
}

def _is_missing(name):
    """Return a boolean Column that is true where `name` is NULL (or NaN for floats)."""
    missing = spark_df_clean[name].isNull()
    if name in _floating_cols:
        missing = missing | isnan(name)
    return missing

# Count the number of null/NaN values in each column.
# when(cond, c) yields the literal column name (non-null) where cond holds, so
# count() tallies exactly the rows that are missing.
null_counts = spark_df_clean.select(
    [count(when(_is_missing(c), c)).alias(c) for c in spark_df_clean.columns]
)

# Show the results
null_counts.show()
#END FILL IN MISSING VALUES

In [46]:
#Normalization
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Pack the pollutant columns into one vector column named "features" and keep
# only that column.
assembler = VectorAssembler(outputCol="features", inputCols=pollutants)
assembled_with_inputs = assembler.transform(spark_df_clean)
spark_df_assembled = assembled_with_inputs.select("features")

In [48]:
# Standard scaler over "features" (centering and unit-variance scaling both
# enabled), writing to "scaledFeatures". Only the estimator is configured here;
# the fit/transform steps below are currently commented out.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)
#scaler_model = scaler.fit(spark_df_assembled)
#spark_df_normalized = scaler_model.transform(spark_df_assembled).select("scaledFeatures")

In [None]:
#Leave this area for debugging, printing rows,values,columns etc.

In [None]:
#sample spark_df to visualize data

# Draw ~20% of the rows without replacement. A fixed seed makes the sample
# (and therefore every plot built from it) reproducible across runs.
sampled_data = (
    spark_df
    .select('SO_2', 'NO_2', 'PM25', 'PM10', 'O_3')
    .sample(False, 0.2, seed=42)
)
# Collect the sample to the driver as a pandas DataFrame for plotting.
pandas_df = sampled_data.toPandas()

In [None]:
#convert data type of pandas_df
# Downcast float64 columns to float32 to halve their memory footprint.
# float32 (not float16) is used on purpose: float16 keeps only ~3 significant
# digits and overflows to inf above 65504, which distorts the histograms.
# The loop variable is deliberately not called `col`, so it does not shadow
# pyspark.sql.functions.col used by the Spark cells.
for column_name in pandas_df.columns:
    if pandas_df[column_name].dtype == np.float64:
        pandas_df[column_name] = pandas_df[column_name].astype(np.float32)

In [None]:
# Per-column memory footprint in bytes; deep=True also measures the contents of
# object-dtype columns.
memory_per_column = pandas_df.memory_usage(deep=True)

# get the total memory usage of the DataFrame
total_memory = memory_per_column.sum()

# Report the total in mebibytes (bytes / 1024**2).
print(f"Memory usage of DataFrame: {total_memory / 1024**2:.2f} MB")

In [None]:
#histograms for only selected pollutants
pandas_df = pandas_df.reset_index(drop=True)

# Create a 2x3 grid of subplots and walk it as a flat sequence of axes.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
flat_axes = axes.ravel()

# One histogram per column. Iterating the axes directly avoids temporary
# `row` / `col` variables; `col` in particular would overwrite
# pyspark.sql.functions.col and break the Spark cells on re-run.
for ax, pollutant in zip(flat_axes, pandas_df.columns):
    sns.histplot(data=pandas_df, x=pollutant, kde=True, color="purple",bins=500, ax=ax)
    ax.set_title(pollutant)

    # Range of the column, ignoring NaN and Inf.
    values = pandas_df[pollutant]
    finite_values = values[np.isfinite(values)]
    if finite_values.empty:
        # Nothing finite to derive limits from; leave the default limits.
        continue
    x_min = np.nanmin(finite_values)
    x_max = np.nanmax(finite_values)

    # Crop the long right tail: show only up to two thirds of the maximum.
    ax.set_xlim(x_min, x_max/1.5)

# Hide grid cells that did not receive a plot (e.g. the 6th cell for 5 columns).
for unused_ax in flat_axes[len(pandas_df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# The combined grid gets sluggish, so try plotting a histogram for each column
# separately instead; this one is for PM25.
pm25_ax = sns.histplot(data=pandas_df, x='PM25', kde=True, color='purple',bins=500)

# Fix the visible window on both axes.
pm25_ax.set_xlim(0, 60)
pm25_ax.set_ylim(0, 300000)

# show the plot
plt.show()

In [8]:
#Checking dataframe size and cores to optimize partitions
import psutil

# logical=False asks psutil for physical cores only (hyper-threads excluded).
num_cores = psutil.cpu_count(logical=False)
print("Number of physical CPU cores:", num_cores)

Number of physical CPU cores: 4


In [21]:
#Check number of partitions
# Read the current partition count from the DataFrame's underlying RDD.
num_partitions = spark_df.rdd.getNumPartitions()
print("Number of partitions:", num_partitions)

Number of partitions: 10


In [None]:
# Redistribute spark_df across a fixed number of partitions and print the
# resulting partition count as a check.
spark_df = spark_df.repartition(10)  # change 10 to the desired number of partitions
print(spark_df.rdd.getNumPartitions())

In [None]:
# Debug: list the column names currently present in spark_df.
print(spark_df.columns)


In [None]:
# Debug: list the column names currently present in spark_df_clean.
print(spark_df_clean.columns)

In [None]:
# Debug: show which columns are treated as pollutants.
print(pollutants)

In [None]:
from pyspark.sql.functions import isnan, col

# Count the number of null values in the 'TOL' column of 'spark_df'.
# isnan() alone only matches floating-point NaN and never matches SQL NULL
# (which is what empty CSV fields become), so both conditions are checked.
tol_null_count = spark_df.filter(col('TOL').isNull() | isnan(col('TOL'))).count()

print(f"The 'TOL' column of 'spark_df' has {tol_null_count} null values.")


In [None]:
# Debug: preview the first 10 rows of the cleaned DataFrame.
spark_df_clean.show(10)

In [None]:
from pyspark.sql.functions import isnull, count, when

# Count the number of null values in each column.
# Calling .count() on a select of boolean columns returns the number of rows,
# not the number of nulls, so the nulls are aggregated per column instead:
# when(isnull(c), c) yields the literal column name (non-null) for null cells
# and null otherwise, and count() tallies the non-null results.
spark_df_clean.select(
    [count(when(isnull(c), c)).alias(c) for c in spark_df_clean.columns]
).show()


In [None]:
# Debug: preview the first 10 rows of spark_df.
spark_df.show(10)


In [43]:
# Drop the helper 'row_num' column before exporting.
# Dropping by name is idempotent; the previous positional drop of the last
# column removed one more real data column every time the cell was re-run.
# NOTE(review): assumes the trailing column at this point is 'row_num' — confirm.
spark_df = spark_df.drop('row_num')

In [None]:
# Write the merged DataFrame as CSV with a header row, replacing any previous
# output at the target path. coalesce(1) collapses the data to one partition
# before writing.
merged_writer = (
    spark_df
    .coalesce(1)
    .write
    .format('csv')
    .option('header', True )
    .mode('overwrite')
)
merged_writer.save('C:\\Users\\eleni\\Documents\\Diplw\\Jupyter-Notebooks\\diplw\\csvs_per_year\\yuck')