In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate), running against a local master with the app name 'test'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [39]:
# First pass over the CSV: take column names from the header row and supply no
# schema, then print the resulting schema to see what types Spark assigned.
df = (
    spark.read
    .option("header", "true")
    .csv('dataset.csv')
)

df.printSchema()

root
 |-- accessed_date: string (nullable = true)
 |-- duration_(secs): string (nullable = true)
 |-- network_protocol: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- bytes: string (nullable = true)
 |-- accessed_Ffom: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- membership: string (nullable = true)
 |-- language: string (nullable = true)
 |-- sales: string (nullable = true)
 |-- returned: string (nullable = true)
 |-- returned_amount: string (nullable = true)
 |-- pay_method: string (nullable = true)



In [40]:
from pyspark.sql import types

# Explicit column types for dataset.csv as (column name, Spark type) pairs,
# listed in the same order as the columns appear in the file.
column_types = [
    ('accessed_date', types.TimestampType()),
    ('duration_(secs)', types.IntegerType()),
    ('network_protocol', types.StringType()),
    ('ip', types.StringType()),
    ('bytes', types.IntegerType()),
    ('accessed_Ffom', types.StringType()),
    ('age', types.IntegerType()),
    ('gender', types.StringType()),
    ('country', types.StringType()),
    ('membership', types.StringType()),
    ('language', types.StringType()),
    ('sales', types.DoubleType()),
    ('returned', types.StringType()),
    ('returned_amount', types.DoubleType()),
    ('pay_method', types.StringType()),
]

# Every field is declared nullable (third StructField argument is True).
schema = types.StructType(
    [types.StructField(name, dtype, True) for name, dtype in column_types]
)

# Read the CSV again, this time applying the typed schema.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('dataset.csv')
)

# Old column name -> new column name. Among other things this removes the
# parentheses from 'duration_(secs)' and fixes the 'accessed_Ffom' typo.
column_renames = {
    'duration_(secs)': 'duration_sec',
    'accessed_Ffom': 'accessed_from',
    'membership': 'account_type',
    'returned': 'refunded',
    'returned_amount': 'refunded_amount',
    'pay_method': 'payment_method',
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

df.printSchema()

root
 |-- accessed_date: timestamp (nullable = true)
 |-- duration_sec: integer (nullable = true)
 |-- network_protocol: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- bytes: integer (nullable = true)
 |-- accessed_from: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- language: string (nullable = true)
 |-- sales: double (nullable = true)
 |-- refunded: string (nullable = true)
 |-- refunded_amount: double (nullable = true)
 |-- payment_method: string (nullable = true)



In [41]:
# Redistribute the rows into 10 partitions before writing.
df = df.repartition(10)

# Write with overwrite mode: the default save mode raises an error when the
# target directory already exists, which made this cell fail on every run of
# the notebook after the first one.
# NOTE(review): run the notebook from the top. Re-running only this cell after
# `df` has been reassigned below would presumably ask Spark to overwrite the
# path it is reading from - confirm before relying on it.
df.write.mode('overwrite').parquet('dataset_repartitioned_parquet')

# Continue working from the Parquet copy instead of the CSV-backed frame.
df = spark.read.parquet('dataset_repartitioned_parquet')

df.show()

+--------------------+------------+----------------+---------------+-----+---------------+----+-------+-------+-------------+--------+-------+--------+---------------+--------------+
|       accessed_date|duration_sec|network_protocol|             ip|bytes|  accessed_from| age| gender|country| account_type|language|  sales|refunded|refunded_amount|payment_method|
+--------------------+------------+----------------+---------------+-----+---------------+----+-------+-------+-------------+--------+-------+--------+---------------+--------------+
|2017-03-15 05:42:...|        2020|            HTTP|    72.5.252.51| 2267|Mozilla Firefox|  51| Female|     PR|      Premium| Chinese| 224.75|      No|            0.0|          Cash|
|2017-03-16 08:18:...|        4155|           TCP  |   95.88.20.251|   46|    Android App|  37|   Male|     IT|      Premium| English|  58.65|      No|            0.0|   Credit Card|
|2017-03-16 00:05:...|        3980|           TCP  |    5.9.145.132|  556|    Android