In [7]:
from time import sleep

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, struct,udf
from pyspark.sql.types import DateType
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import date_format, avg, max, desc, min

# Creating the PySpark session

In [5]:
# Reuse an already-running session if there is one; otherwise start a new
# one registered under the application name "Titanic Analysis".
spark = SparkSession.builder.appName("Titanic Analysis").getOrCreate()

# Display the SparkContext that backs this session
print(spark.sparkContext)

<SparkContext master=local[*] appName=Titanic Analysis>


# Reading the CSV with inferSchema and printing the schema

In [6]:
# Load the Titanic CSV from the current directory: the first line supplies
# the column names and Spark infers each column's type from the data.
titanic_df = spark.read.csv("./titanic.csv", header=True, inferSchema=True)

# Print the inferred schema
titanic_df.printSchema()

# Preview the loaded rows
titanic_df.show()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Passengerclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- TimeStream: string (nullable = true)

+-----------+--------+--------------+--------------------+------+----+-----+-----+----------------+-------+-----+----------+
|PassengerId|Survived|Passengerclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|TimeStream|
+-----------+--------+--------------+--------------------+------+----+-----+-----+----------------+-------+-----+----------+
|          1|       0|             3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|         S|
|          2|       1|    

# For numerical columns, calculate minimum, maximum and average values.

In [7]:
# Collect the names of all columns whose Spark SQL type (as reported by
# DataFrame.dtypes) is numeric. Besides "int" and "double" this also accepts
# the other integer/float widths and decimals, so a column inferred as e.g.
# "bigint" is not silently skipped.
numerical_col = [
    col_name
    for col_name, col_type in titanic_df.dtypes
    if col_type in ("tinyint", "smallint", "int", "bigint", "float", "double")
    or col_type.startswith("decimal")
]

# PassengerId is a row identifier, not a measurement, so min/max/avg of it
# are meaningless. It is excluded by name rather than by position so that the
# selection stays correct even if the column order of the file changes.
numerical_df = titanic_df.select(
    *[col_name for col_name in numerical_col if col_name != "PassengerId"]
)
numerical_df.show()

+--------+--------------+----+-----+-----+-------+
|Survived|Passengerclass| Age|SibSp|Parch|   Fare|
+--------+--------------+----+-----+-----+-------+
|       0|             3|  22|    1|    0|   7.25|
|       1|             1|  38|    1|    0|71.2833|
|       1|             3|  26|    0|    0|  7.925|
|       1|             1|  35|    1|    0|   53.1|
|       0|             3|  35|    0|    0|   8.05|
|       0|             3|null|    0|    0| 8.4583|
|       0|             1|  54|    0|    0|51.8625|
|       0|             3|   2|    3|    1| 21.075|
|       1|             3|  27|    0|    2|11.1333|
|       1|             2|  14|    1|    0|30.0708|
|       1|             3|   4|    1|    1|   16.7|
|       1|             1|  58|    0|    0|  26.55|
|       0|             3|  20|    0|    0|   8.05|
|       0|             3|  39|    1|    5| 31.275|
|       0|             3|  14|    0|    0| 7.8542|
|       1|             2|  55|    0|    0|   16.0|
|       0|             3|   2| 

In [8]:
# Largest value of every numeric column, collected into a one-row DataFrame.
# NOTE: `max` here is pyspark.sql.functions.max (imported at the top of the
# notebook), not Python's builtin.
max_values = numerical_df.agg(
    *(max(name).alias(name) for name in numerical_df.columns)
)
max_values.show()

+--------+--------------+---+-----+-----+--------+
|Survived|Passengerclass|Age|SibSp|Parch|    Fare|
+--------+--------------+---+-----+-----+--------+
|       1|             3| 80|    8|    6|512.3292|
+--------+--------------+---+-----+-----+--------+



In [9]:
# Mean of every numeric column, collected into a one-row DataFrame.
# Each aggregate is aliased back to the source column name so the result
# keeps the same header as `numerical_df`.
avg_values = numerical_df.agg(
    *(avg(name).alias(name) for name in numerical_df.columns)
)
avg_values.show()

+------------------+-----------------+------------------+------------------+-------------------+----------------+
|          Survived|   Passengerclass|               Age|             SibSp|              Parch|            Fare|
+------------------+-----------------+------------------+------------------+-------------------+----------------+
|0.3838383838383838|2.308641975308642|29.679271708683473|0.5230078563411896|0.38159371492704824|32.2042079685746|
+------------------+-----------------+------------------+------------------+-------------------+----------------+



In [10]:
# Smallest value of every numeric column, collected into a one-row DataFrame.
# NOTE: `min` here is pyspark.sql.functions.min (imported at the top of the
# notebook), not Python's builtin.
min_values = numerical_df.agg(
    *(min(name).alias(name) for name in numerical_df.columns)
)
min_values.show()

+--------+--------------+---+-----+-----+----+
|Survived|Passengerclass|Age|SibSp|Parch|Fare|
+--------+--------------+---+-----+-----+----+
|       0|             1|  0|    0|    0| 0.0|
+--------+--------------+---+-----+-----+----+



# For categorical columns, create and apply UDF that will change the last letter of every word to “1”.

In [11]:
# Preview the passenger data before transforming the categorical columns
# (20 rows, long cell values truncated -- the defaults, spelled out).
titanic_df.show(n=20, truncate=True)

+-----------+--------+--------------+--------------------+------+----+-----+-----+----------------+-------+-----+----------+
|PassengerId|Survived|Passengerclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|TimeStream|
+-----------+--------+--------------+--------------------+------+----+-----+-----+----------------+-------+-----+----------+
|          1|       0|             3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|         S|
|          2|       1|             1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|         C|
|          3|       1|             3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|         S|
|          4|       1|             1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|         S|
|          5|       0|             3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|         S|


In [14]:
# In the output DataFrame, the first 5 columns are the original categorical columns and the next 5 are their transformed copies.
# As we can see, a value with only one character is replaced with "1", longer values have their last character replaced with "1", and null values remain null.

In [13]:
# Build the categorical frame from the already-loaded `titanic_df` instead of
# re-reading the CSV into the same name: rebinding `titanic_df` to an
# all-string copy would leave the numeric-statistics cells above with nothing
# to select if they were re-run afterwards.
# Every selected column is cast to string because the UDF applied below
# slices and concatenates text.
categorical_df = titanic_df.select(
    *[
        col(col_name).cast("string").alias(col_name)
        for col_name in ("Survived", "Passengerclass", "Sex", "TimeStream", "Cabin")
    ]
)

# Print the schema to confirm that every column is a string
categorical_df.printSchema()

# Function wrapped by the UDF below: replaces the last character of every
# word in a value with "1".
def change_last_character(value):
    """Return ``value`` with the last character of each word replaced by "1".

    Words are the pieces of ``value`` separated by single spaces; the spaces
    themselves are kept exactly as they are. A one-character word therefore
    becomes "1", and a multi-word value such as "C23 C25 C27" has every word
    changed rather than only the final one.

    Args:
        value: The text to transform, or None for a null cell.

    Returns:
        The transformed text, or None when ``value`` is None. An empty string
        contains no words and is returned unchanged.
    """
    # Null cells stay null
    if value is None:
        return value
    # Splitting on a literal space (not arbitrary whitespace) keeps runs of
    # spaces intact: they show up as empty pieces, which are passed through.
    return " ".join(
        word[:-1] + "1" if word else word
        for word in value.split(" ")
    )

# Wrap the Python function as a Spark UDF whose declared result type is string
change_last_character_udf = udf(change_last_character, returnType=StringType())

# Keep every original column and append, for each of them, a transformed copy
# named "<column>_up" that holds the UDF's result for that column.
categorical_df = categorical_df.select(
    "*",
    *[
        change_last_character_udf(categorical_df[name]).alias(name + "_up")
        for name in categorical_df.columns
    ]
)

print("")
categorical_df.show()

root
 |-- Survived: string (nullable = true)
 |-- Passengerclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- TimeStream: string (nullable = true)
 |-- Cabin: string (nullable = true)

+--------+--------------+------+----------+-----+-----------+-----------------+------+-------------+--------+
|Survived|Passengerclass|   Sex|TimeStream|Cabin|Survived_up|Passengerclass_up|Sex_up|TimeStream_up|Cabin_up|
+--------+--------------+------+----------+-----+-----------+-----------------+------+-------------+--------+
|       0|             3|  male|         S| null|          1|                1|  mal1|            1|    null|
|       1|             1|female|         C|  C85|          1|                1|femal1|            1|     C81|
|       1|             3|female|         S| null|          1|                1|femal1|            1|    null|
|       1|             1|female|         S| C123|          1|                1|femal1|            1|    C121|
|       0|             3

# Sort DataFrame by the first column and save the results to the Parquet file.

In [20]:
# The frame being sorted is the categorical DataFrame produced above
# (original columns plus their "_up" copies).

# Sort by the frame's first column ("Survived" for the frame built above),
# looked up by position so the cell keeps matching its heading if the column
# selection changes.
sorted_df = categorical_df.sort(categorical_df.columns[0])
sorted_df.show()

# Save to Parquet. Overwrite mode replaces the output of a previous run;
# the default mode raises an error when the path already exists, which
# would make this cell fail on every re-run of the notebook.
sorted_df.write.mode("overwrite").parquet("categorical_df.parquet")


+--------+--------------+------+----------+-----+-----------+-----------------+------+-------------+--------+
|Survived|Passengerclass|   Sex|TimeStream|Cabin|Survived_up|Passengerclass_up|Sex_up|TimeStream_up|Cabin_up|
+--------+--------------+------+----------+-----+-----------+-----------------+------+-------------+--------+
|       0|             3|female|         S| null|          1|                1|femal1|            1|    null|
|       0|             3|  male|         C| null|          1|                1|  mal1|            1|    null|
|       0|             3|  male|         C| null|          1|                1|  mal1|            1|    null|
|       0|             2|  male|         S| null|          1|                1|  mal1|            1|    null|
|       0|             1|  male|         C| null|          1|                1|  mal1|            1|    null|
|       0|             3|  male|         S| null|          1|                1|  mal1|            1|    null|
|       0|