In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import os
import sys

In [3]:
appName = "Big Data Analytics"
master = "local[*]"

# Build the Spark configuration for a local session that uses all cores.
# NOTE: getOrCreate() reuses an already-running session, in which case the
# memory settings below are not re-applied; restart the kernel to change them.
conf = (
    pyspark.SparkConf()
    .setAppName(appName)
    .setMaster(master)
    # Pin the driver to loopback so the session does not bind to an external interface.
    .set("spark.driver.host", "127.0.0.1")
    # Fixed key: this was "spark.drive.memory", which is not a Spark property,
    # so the 10g driver heap was never actually requested.
    .set("spark.driver.memory", "10g")
    .set("spark.executor.memory", "10g")
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# JDBC connection settings used by the PostgreSQL write/read cells below.
# Credentials and URL can be overridden through environment variables so real
# secrets do not have to live in the notebook; the fallbacks keep the original
# local-development values.
# NOTE(review): the default password is still visible in source — drop the
# fallback once MQTT_DB_PASSWORD is set wherever this notebook runs.
db_properties = {
    "username": os.environ.get("MQTT_DB_USER", "postgres"),
    "password": os.environ.get("MQTT_DB_PASSWORD", "systems"),
    "url": os.environ.get("MQTT_DB_URL", "jdbc:postgresql://localhost:5432/postgres"),
    # Passed as the JDBC "dbtable" option by the write/read cells.
    "table": "mqtt.mqtt",
    "driver": "org.postgresql.Driver",
}

In [None]:
# Load the train and test CSV splits. The first line of each file is used as
# the column names; no schema is supplied and inference is not requested, so
# columns are read as strings (matches the schema printed further down).
csv_with_header = spark.read.option("header", True)
train_raw = csv_with_header.csv('./Data/FINAL_CSV/train70.csv')
test_raw = spark.read.option("header", True).csv('./Data/FINAL_CSV/test30.csv')

In [7]:
# Tag every row with the split it came from ("train" / "test") via a constant
# "dataset_type" column.
train, test = (
    split_frame.withColumn("dataset_type", lit(split_label))
    for split_frame, split_label in ((train_raw, "train"), (test_raw, "test"))
)

In [8]:
# Stack the tagged train and test frames into one DataFrame.
# unionByName matches columns by name rather than by position, so a different
# header order between the two CSVs cannot silently shift values into the wrong
# columns; mismatched column names raise an error instead.
combined_df = train.unionByName(test)

In [None]:
# Persist the combined frame to PostgreSQL over JDBC.
# mode("overwrite") replaces any existing contents of the target table.
jdbc_write_options = {
    "url": db_properties['url'],
    "dbtable": db_properties['table'],
    "user": db_properties['username'],
    "password": db_properties['password'],
    "Driver": db_properties['driver'],
}

(
    combined_df.write
    .format("jdbc")
    .mode("overwrite")
    .options(**jdbc_write_options)
    .save()
)

In [5]:
# Read the table back from PostgreSQL to check the round trip.
# Each pair maps a JDBC option name to the db_properties key holding its value.
jdbc_reader = spark.read.format("jdbc")
for option_name, property_key in (
    ("url", 'url'),
    ("dbtable", 'table'),
    ("user", 'username'),
    ("password", 'password'),
    ("Driver", 'driver'),
):
    jdbc_reader = jdbc_reader.option(option_name, db_properties[property_key])

df_from_postgres = jdbc_reader.load()

In [6]:
# Print the column names and types of the frame read back over JDBC.
# In the recorded output every listed column is a nullable string.
df_from_postgres.printSchema()

root
 |-- tcp.flags: string (nullable = true)
 |-- tcp.time_delta: string (nullable = true)
 |-- tcp.len: string (nullable = true)
 |-- mqtt.conack.flags: string (nullable = true)
 |-- mqtt.conack.flags.reserved: string (nullable = true)
 |-- mqtt.conack.flags.sp: string (nullable = true)
 |-- mqtt.conack.val: string (nullable = true)
 |-- mqtt.conflag.cleansess: string (nullable = true)
 |-- mqtt.conflag.passwd: string (nullable = true)
 |-- mqtt.conflag.qos: string (nullable = true)
 |-- mqtt.conflag.reserved: string (nullable = true)
 |-- mqtt.conflag.retain: string (nullable = true)
 |-- mqtt.conflag.uname: string (nullable = true)
 |-- mqtt.conflag.willflag: string (nullable = true)
 |-- mqtt.conflags: string (nullable = true)
 |-- mqtt.dupflag: string (nullable = true)
 |-- mqtt.hdrflags: string (nullable = true)
 |-- mqtt.kalive: string (nullable = true)
 |-- mqtt.len: string (nullable = true)
 |-- mqtt.msg: string (nullable = true)
 |-- mqtt.msgid: string (nullable = true)
 |--

In [7]:
# Row count of the table read back from PostgreSQL (sanity check on the write).
# NOTE(review): this presumably scans the whole table through JDBC — expect it to be slow.
df_from_postgres.count()

12081189

In [8]:
# Preview a single row to eyeball the stored values.
df_from_postgres.limit(1).show()