In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [140]:
import pandas as pd
from pyspark.rdd import RDD
from pyspark.sql import  SparkSession,DataFrame
from pyspark.sql.types import StructType,StructField, StringType,IntegerType

In [97]:
spark = SparkSession.builder.appName("Gaurav").getOrCreate()

In [98]:
rdd1=spark.read.csv('titanic.csv',inferSchema=True,header=True)

In [99]:
rdd1.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

# Print Schema


In [100]:
rdd1.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [101]:
rdd1.count() #No. of rows

891

In [102]:
len(rdd1.columns) #No. of columns

12

In [103]:
d1=pd.read_csv('titanic.csv')
df1=pd.DataFrame({
    'PassengerID':d1['PassengerId'],
    'Name':d1['Name'],
    'Age':d1['Age'],
    'sex':d1['Sex']
})
df2=spark.sparkContext.parallelize(df1)
deptSchema = StructType([       
    StructField('PassengerId', StringType(), True),
    StructField('Name', StringType(), True),
    StructField('Age', IntegerType(), True),
    StructField('Sex', StringType(), True)
])
rdd2=spark.createDataFrame(df2,schema=deptSchema)

In [104]:
rdd2.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)



In [125]:
#rdd2.show() Error

# Drop_Duplicate

In [127]:
print("Count "+str(rdd1.count()))
rdd3=rdd1.dropDuplicates()
rdd3.show()

Count 891
+-----------+--------+------+--------------------+------+----+-----+-----+------------------+-------+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|            Ticket|   Fare|  Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------------------+-------+-------+--------+
|        612|       0|     3|Jardin, Mr. Jose ...|  male|null|    0|    0|SOTON/O.Q. 3101305|   7.05|   null|       S|
|        666|       0|     2|  Hickman, Mr. Lewis|  male|32.0|    2|    0|      S.O.C. 14879|   73.5|   null|       S|
|        689|       0|     3|Fischer, Mr. Eber...|  male|18.0|    0|    0|            350036| 7.7958|   null|       S|
|        846|       0|     3| Abbing, Mr. Anthony|  male|42.0|    0|    0|         C.A. 5547|   7.55|   null|       S|
|         59|       1|     2|West, Miss. Const...|female| 5.0|    1|    2|        C.A. 34651|  27.75|   null|       S|
|         99|       1|     2|Doling, M

# dropping all records that have atleast one null/Nan values from the original dataset

In [131]:
rdd3.na.drop().show(20)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|        499|       0|     1|Allison, Mrs. Hud...|female|25.0|    1|    2|  113781|  151.55|    C22 C26|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1| PP 9549|    16.7|         G6|       S|
|        701|       1|     1|Astor, Mrs. John ...|female|18.0|    1|    0|PC 17757| 227.525|    C62 C64|       C|
|         55|       0|     1|Ostby, Mr. Engelh...|  male|65.0|    0|    1|  113509| 61.9792|        B30|       C|
|        332|       0|     1| Partner, Mr. Austen|  male|45.5|    0|    0|  113043|    28.5|       C124|       S|
|        646|       1|     1|Harper, Mr. Henry...|  male|48.0|    1|    0|PC 17572| 76.7

# dropping columns Ticket and Cabin from the original dataset

In [139]:
rdd4=rdd3.drop("Ticket","Cabin")

In [135]:
rdd4.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



# Sort the original dataframe on the basis of Fare in descending order and store it into a new dataframe

In [149]:
rdd5=rdd4.sort(rdd4.Fare.desc())
rdd5.show()

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+
|        738|       1|     1|Lesurer, Mr. Gust...|  male|35.0|    0|    0|512.3292|       C|
|        259|       1|     1|    Ward, Miss. Anna|female|35.0|    0|    0|512.3292|       C|
|        680|       1|     1|Cardeza, Mr. Thom...|  male|36.0|    0|    1|512.3292|       C|
|        342|       1|     1|Fortune, Miss. Al...|female|24.0|    3|    2|   263.0|       S|
|         28|       0|     1|Fortune, Mr. Char...|  male|19.0|    3|    2|   263.0|       S|
|        439|       0|     1|   Fortune, Mr. Mark|  male|64.0|    1|    4|   263.0|       S|
|         89|       1|     1|Fortune, Miss. Ma...|female|23.0|    3|    2|   263.0|       S|
|        312|       1|     1|Ryerson, Miss. Em...|female|18.0|    2|  

# contains only the passengers that survived

In [151]:
rdd6=rdd5.where("survived>0")
rdd6.show()

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+
|        738|       1|     1|Lesurer, Mr. Gust...|  male|35.0|    0|    0|512.3292|       C|
|        259|       1|     1|    Ward, Miss. Anna|female|35.0|    0|    0|512.3292|       C|
|        680|       1|     1|Cardeza, Mr. Thom...|  male|36.0|    0|    1|512.3292|       C|
|         89|       1|     1|Fortune, Miss. Ma...|female|23.0|    3|    2|   263.0|       S|
|        342|       1|     1|Fortune, Miss. Al...|female|24.0|    3|    2|   263.0|       S|
|        743|       1|     1|"Ryerson, Miss. S...|female|21.0|    2|    2| 262.375|       C|
|        312|       1|     1|Ryerson, Miss. Em...|female|18.0|    2|    2| 262.375|       C|
|        300|       1|     1|Baxter, Mrs. Jame...|female|50.0|    0|  