In [86]:
#Importing all the necessary libraries

import findspark
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import *
from pyspark.ml.regression import LinearRegression

In [87]:
#Initiating Spark and reading titanic csv

# Location of the Titanic passenger CSV. Kept in one named constant so the
# path can be changed in a single place when the notebook runs elsewhere.
TITANIC_CSV_PATH = 'C:/Users/lemon/Downloads/titanic.csv'

# Create (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# The first line of the file is used as the header. No schema inference is
# requested, so every column is loaded as a string at this point.
data = spark.read.csv(TITANIC_CSV_PATH, header=True)

# Preview the first two rows. Only the last expression of a cell is
# displayed, so a single preview call is enough.
data.take(2)



[Row(Survived='0', Pclass='3', Name='Mr. Owen Harris Braund', Sex='male', Age='22', Siblings_Spouses_Aboard='1', Parents_Children_Aboard='0', Fare='7.25'),
 Row(Survived='1', Pclass='1', Name='Mrs. John Bradley (Florence Briggs Thayer) Cumings', Sex='female', Age='38', Siblings_Spouses_Aboard='1', Parents_Children_Aboard='0', Fare='71.2833')]

In [88]:
#Reordering the columns and casting them to their proper types

# Target column order and Spark type for every column. The dict's insertion
# order defines the column order of both `data` and `df`.
# NOTE(review): casting Age to an integer will not preserve fractional ages
# if the CSV contains any - confirm that this is intended.
column_types = {
    "Age": IntegerType(),
    "Fare": FloatType(),
    "Name": StringType(),
    "Parents_Children_Aboard": IntegerType(),
    "Pclass": IntegerType(),
    "Sex": StringType(),
    "Siblings_Spouses_Aboard": IntegerType(),
    "Survived": IntegerType(),
}

# Reorder the raw (all-string) columns; the column names stay the same.
data = data.select(list(column_types))

# Cast every column in a single projection. `alias` keeps the original
# column name, which a bare `cast` expression would otherwise replace.
df = data.select(
    [col(name).cast(dtype).alias(name) for name, dtype in column_types.items()]
)

# Display the first rows of the typed DataFrame.
df.show()


+---+-------+--------------------+-----------------------+------+------+-----------------------+--------+
|Age|   Fare|                Name|Parents_Children_Aboard|Pclass|   Sex|Siblings_Spouses_Aboard|Survived|
+---+-------+--------------------+-----------------------+------+------+-----------------------+--------+
| 22|   7.25|Mr. Owen Harris B...|                      0|     3|  male|                      1|       0|
| 38|71.2833|Mrs. John Bradley...|                      0|     1|female|                      1|       1|
| 26|  7.925|Miss. Laina Heikk...|                      0|     3|female|                      0|       1|
| 35|   53.1|Mrs. Jacques Heat...|                      0|     1|female|                      1|       1|
| 35|   8.05|Mr. William Henry...|                      0|     3|  male|                      0|       0|
| 27| 8.4583|     Mr. James Moran|                      0|     3|  male|                      0|       0|
| 54|51.8625|Mr. Timothy J McC...|            

In [89]:
# Counting how many survived based on sex and class and both

# Build the aggregated DataFrames first. `show()` only prints and returns
# None, so it must not be part of the assignment.
groupBySex = df.groupBy("Survived", "Sex").count()
groupByClass = df.groupBy("Survived", "Pclass").count()
groupByBoth = df.groupBy("Survived", "Sex", "Pclass").count()

# Print the three tables: by sex, by class, then by both.
groupBySex.show()
groupByClass.show()
groupByBoth.show()


+--------+------+-----+
|Survived|   Sex|count|
+--------+------+-----+
|       0|female|   81|
|       1|  male|  109|
|       1|female|  233|
|       0|  male|  464|
+--------+------+-----+

+--------+------+-----+
|Survived|Pclass|count|
+--------+------+-----+
|       1|     2|   87|
|       1|     1|  136|
|       1|     3|  119|
|       0|     1|   80|
|       0|     2|   97|
|       0|     3|  368|
+--------+------+-----+

+--------+------+------+-----+
|Survived|   Sex|Pclass|count|
+--------+------+------+-----+
|       0|female|     2|    6|
|       1|  male|     2|   17|
|       0|female|     3|   72|
|       0|  male|     2|   91|
|       0|female|     1|    3|
|       1|female|     3|   72|
|       1|  male|     3|   47|
|       1|female|     2|   70|
|       0|  male|     1|   77|
|       1|female|     1|   91|
|       1|  male|     1|   45|
|       0|  male|     3|  296|
+--------+------+------+-----+

