In [1]:
!pip install pyspark



In [2]:
import pyspark

In [3]:
import pandas as pd
# Load the CSV with pandas first, as a baseline to compare with the Spark readers used further down.
df = pd.read_csv('titanic.csv')
# Print the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build a SparkSession with the application name "pratice" (getOrCreate reuses an existing session if there is one).
spark = SparkSession.builder.appName("pratice").getOrCreate()

In [6]:
# Bare expression: display the session object as the cell output.
spark

In [7]:
# When the file has a header row, pass header=True so that row is used for the column names.
# If inferSchema is not given (it defaults to False), every value is imported as a string.
df_1 = spark.read.csv('titanic.csv', header=True)
df_1.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
# With inferSchema=True the column types are inferred from the data
# (integer / double / string) instead of everything being read as string.
df_1 = spark.read.csv('titanic.csv', header=True, inferSchema=True)
df_1.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



Outra forma de realizar a leitura, informando o cabeçalho por meio de `.option()`, é mostrada abaixo.

In [9]:
# Same read as before, but the header flag is supplied through .option() instead of a csv() keyword argument.
df_pyspark = spark.read.option('header', 'True').csv('titanic.csv', inferSchema=True)

In [10]:
# Type of the DataFrame object returned by the Spark reader.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [11]:
# On a pyspark.sql DataFrame, .head(x) returns the first x rows as a list of Row
# objects, each one carrying the value of every column.
df_pyspark.head(5)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S')]

In [12]:
# Similar to pandas' .info(): prints the type of each column.
df_pyspark.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [13]:
# dtypes is another way to see the type of each feature, as a list of (column name, type) pairs.
df_pyspark.dtypes

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Ticket', 'string'),
 ('Fare', 'double'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [14]:
# .columns gives the column names as a plain Python list.

nome_colunas = df_pyspark.columns
nome_colunas, type(nome_colunas)

(['PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked'],
 list)

In [15]:
# Selecting columns: select() returns a new DataFrame containing only the listed columns.
a = df_pyspark.select(['PassengerId', 'Survived'])

In [16]:
# show() with no argument prints only the top 20 rows.
a.show()

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|          1|       0|
|          2|       1|
|          3|       1|
|          4|       1|
|          5|       0|
|          6|       0|
|          7|       0|
|          8|       0|
|          9|       1|
|         10|       1|
|         11|       1|
|         12|       1|
|         13|       0|
|         14|       0|
|         15|       0|
|         16|       1|
|         17|       0|
|         18|       1|
|         19|       0|
|         20|       1|
+-----------+--------+
only showing top 20 rows



In [17]:
# Summary statistics per column (e.g. count and mean); the mean is null for text columns such as Name and Sex.
df_pyspark.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [18]:
# Adding a column with PySpark: withColumn returns a new DataFrame that also
# contains 'PassengerId_novo', computed as PassengerId + 2.

a = df_pyspark.withColumn('PassengerId_novo', df_pyspark['PassengerId']+2)

In [19]:
# Show the first 5 rows to check the new column.
a.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|PassengerId_novo|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|               3|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|               4|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|               5|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|               6|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|  

In [20]:
# Removing columns: drop() returns a new DataFrame without the named column.
# The column is dropped from `a` (the DataFrame that actually has
# 'PassengerId_novo'); the original cell called drop() on df_pyspark, which
# never had that column, so nothing was really being removed.

a = a.drop('PassengerId_novo')

In [21]:
# Show the first 5 rows to confirm 'PassengerId_novo' is no longer present.
a.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [22]:
# Drop the 'Name' column as well, rebinding `a` to the resulting DataFrame.
a = a.drop('Name')

In [23]:
# Show the first 5 rows to confirm 'Name' was removed.
a.show(5)

+-----------+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 5 rows



In [24]:
# Renaming columns: withColumnRenamed(old, new) returns a DataFrame with 'PassengerId' renamed to 'Id'.

a = a.withColumnRenamed('PassengerId', 'Id')

In [25]:
# Display the DataFrame to check the renamed 'Id' column.
a.show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|  6|       0|     3|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|  8|       0|     3|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|  9|       1|     3|female|27.0|    0|    2|         

In [27]:
# Drop rows with null values.
# With no arguments, any row that has a null in at least one column is removed.

a.na.drop().show()

+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
|  2|       1|     1|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|  4|       1|     1|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|  7|       0|     1|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
| 11|       1|     3|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
| 12|       1|     1|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
| 22|       1|     2|  male|34.0|    0|    0|     248698|    13.0|        D56|       S|
| 24|       1|     1|  male|28.0|    0|    0|     113788|    35.5|         A6|       S|
| 28|       0|     1|  male|19.0|    3|    2|      19950|   263.0|C23 C25 C27|       S|
| 53|       1|     1|female|49.0

In [29]:
# how='all': drop a row only when every one of its columns is null.

a.na.drop(how='all').show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|  6|       0|     3|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|  8|       0|     3|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|  9|       1|     3|female|27.0|    0|    2|         

In [30]:
# how='any' (the default): drop a row as soon as any of its columns is null.

a.na.drop(how='any').show()

+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
|  2|       1|     1|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|  4|       1|     1|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|  7|       0|     1|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
| 11|       1|     3|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
| 12|       1|     1|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
| 22|       1|     2|  male|34.0|    0|    0|     248698|    13.0|        D56|       S|
| 24|       1|     1|  male|28.0|    0|    0|     113788|    35.5|         A6|       S|
| 28|       0|     1|  male|19.0|    3|    2|      19950|   263.0|C23 C25 C27|       S|
| 53|       1|     1|female|49.0

In [33]:
# Drop rows based on a minimum number of non-null values.
# thresh=x is the number of non-null values a row must contain in order
# NOT to be dropped. When thresh is given it overrides `how`, so `how` is left
# out here (the original cell passed how='any', which had no effect).
# `a` has 11 columns, so thresh=10 keeps rows with at most one null.

a.na.drop(thresh=10).show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|  8|       0|     3|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|  9|       1|     3|female|27.0|    0|    2|          347742|11.1333| null|       S|
| 10|       1|     2|female|14.0|    1|    0|         

In [35]:
# Drop rows with null values, looking only at specific columns.
# subset restricts the null check to the listed columns: a row is dropped
# whenever it has a null in any of them (how='any').

a.na.drop(how='any', subset=['Cabin', 'Ticket']).show()

+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+---+--------+------+------+----+-----+-----+-----------+--------+-----------+--------+
|  2|       1|     1|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|  4|       1|     1|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|  7|       0|     1|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
| 11|       1|     3|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
| 12|       1|     1|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
| 22|       1|     2|  male|34.0|    0|    0|     248698|    13.0|        D56|       S|
| 24|       1|     1|  male|28.0|    0|    0|     113788|    35.5|         A6|       S|
| 28|       0|     1|  male|19.0|    3|    2|      19950|   263.0|C23 C25 C27|       S|
| 32|       1|     1|female|null

In [37]:
# Filling null values: replace nulls in the listed columns with the given value.

a.na.fill('PREENCHIDO', ['Cabin', 'Ticket']).show()

+---+--------+------+------+----+-----+-----+----------------+-------+----------+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|     Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+----------+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25|PREENCHIDO|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|       C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925|PREENCHIDO|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1|      C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05|PREENCHIDO|       S|
|  6|       0|     3|  male|null|    0|    0|          330877| 8.4583|PREENCHIDO|       Q|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|       E46|       S|
|  8|       0|     3|  male| 2.0|    3|    1|          349909| 21.075|PREENCHIDO|       S|

In [39]:
# Filling nulls with the column mean, using PySpark's built-in Imputer.
# The results are written to new columns named '<column>_imputed'.

from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols = ['Age', 'Fare'],
                 outputCols = ["{}_imputed".format(c) for c in ['Age', 'Fare']]).setStrategy("mean")

# The strategy can be "mean", "median" or "mode" (the name is "mode", not "modal";
# NOTE(review): "mode" needs a recent Spark release - confirm against the installed version).

In [40]:
# Fit the imputer on `a`, then apply it to `a` and display the result with the added *_imputed columns.
imputer.fit(a).transform(a).show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+-----------------+------------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|      Age_imputed|Fare_imputed|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+-----------------+------------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|             22.0|        7.25|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|             38.0|     71.2833|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|             26.0|       7.925|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|             35.0|        53.1|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|             35.0|        8.05|
|  6|       0|     3|  male|null|    0|    0|          330877| 8

In [41]:
# Filling nulls with the column mean, using PySpark's built-in Imputer.
# Here outputCols equals inputCols, so the imputed values replace 'Age' and
# 'Fare' instead of being added as extra columns.

from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols = ['Age', 'Fare'],
                 outputCols = ['Age', 'Fare']).setStrategy("mean")

# The strategy can be "mean", "median" or "mode" (the name is "mode", not "modal";
# NOTE(review): "mode" needs a recent Spark release - confirm against the installed version).

imputer.fit(a).transform(a).show()

+---+--------+------+------+-----------------+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|             22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|             35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|             35.0|    0|    0|          373450|   8.05| null|       S|
|  6|       0|     3|  male|29.69911764705882|    0|    0|          330877| 8.4583| null|       Q|
|  7|       0|     1|  male|             54.0|    0|    0|           17463|51.8625|  E46|       S|
|  8|     

In [51]:
# Filter options.
# The condition is given as a SQL-like string; the result keeps every column
# but only the rows that satisfy the condition.

a.filter('Age >= 22').show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|  9|       1|     3|female|27.0|    0|    2|          347742|11.1333| null|       S|
| 12|       1|     1|female|58.0|    0|    0|          113783|  26.55| C103|       S|
| 14|       0|     3|  male|39.0|    1|    5|         

In [52]:
# Filter options.
# Chain select() after filter() to show only the listed columns for the rows
# that satisfy the condition.

a.filter('Age >= 22').select(['Survived', 'Sex']).show()

+--------+------+
|Survived|   Sex|
+--------+------+
|       0|  male|
|       1|female|
|       1|female|
|       1|female|
|       0|  male|
|       0|  male|
|       1|female|
|       1|female|
|       0|  male|
|       1|female|
|       0|female|
|       0|  male|
|       1|  male|
|       1|  male|
|       1|female|
|       0|  male|
|       0|  male|
|       0|  male|
|       0|  male|
|       0|female|
+--------+------+
only showing top 20 rows



In [53]:
# Another way to state the condition: a column expression instead of a string.

a.filter(a['Sex'] == 'male').show()

+---+--------+------+----+----+-----+-----+----------+-------+-----------+--------+
| Id|Survived|Pclass| Sex| Age|SibSp|Parch|    Ticket|   Fare|      Cabin|Embarked|
+---+--------+------+----+----+-----+-----+----------+-------+-----------+--------+
|  1|       0|     3|male|22.0|    1|    0| A/5 21171|   7.25|       null|       S|
|  5|       0|     3|male|35.0|    0|    0|    373450|   8.05|       null|       S|
|  6|       0|     3|male|null|    0|    0|    330877| 8.4583|       null|       Q|
|  7|       0|     1|male|54.0|    0|    0|     17463|51.8625|        E46|       S|
|  8|       0|     3|male| 2.0|    3|    1|    349909| 21.075|       null|       S|
| 13|       0|     3|male|20.0|    0|    0| A/5. 2151|   8.05|       null|       S|
| 14|       0|     3|male|39.0|    1|    5|    347082| 31.275|       null|       S|
| 17|       0|     3|male| 2.0|    4|    1|    382652| 29.125|       null|       Q|
| 18|       1|     2|male|null|    0|    0|    244373|   13.0|       null|  

In [56]:
# AND condition. Each comparison must sit inside its own parentheses, because
# Python's & binds more tightly than ==.
a.filter((a['Sex'] == 'male') & (a['Survived'] == 1)).show()

+---+--------+------+----+----+-----+-----+--------+-------+-------+--------+
| Id|Survived|Pclass| Sex| Age|SibSp|Parch|  Ticket|   Fare|  Cabin|Embarked|
+---+--------+------+----+----+-----+-----+--------+-------+-------+--------+
| 18|       1|     2|male|null|    0|    0|  244373|   13.0|   null|       S|
| 22|       1|     2|male|34.0|    0|    0|  248698|   13.0|    D56|       S|
| 24|       1|     1|male|28.0|    0|    0|  113788|   35.5|     A6|       S|
| 37|       1|     3|male|null|    0|    0|    2677| 7.2292|   null|       C|
| 56|       1|     1|male|null|    0|    0|   19947|   35.5|    C52|       S|
| 66|       1|     3|male|null|    1|    1|    2661|15.2458|   null|       C|
| 75|       1|     3|male|32.0|    0|    0|    1601|56.4958|   null|       S|
| 79|       1|     2|male|0.83|    0|    2|  248738|   29.0|   null|       S|
| 82|       1|     3|male|29.0|    0|    0|  345779|    9.5|   null|       S|
| 98|       1|     1|male|23.0|    0|    1|PC 17759|63.3583|D10 

In [57]:
# OR condition. Each comparison must sit inside its own parentheses, because
# Python's | binds more tightly than ==.
a.filter((a['Sex'] == 'male') | (a['Survived'] == 1)).show()

+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+---+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|  1|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|  2|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|  4|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|  5|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|  6|       0|     3|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|  7|       0|     1|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|  8|       0|     3|  male| 2.0|    3|    1|          349909| 21.075| null|       S|
|  9|       1|     3|female|27.0|    0|    2|         

In [59]:
# NOT condition: ~ negates the expression, so here it selects Age < 22.
# Rows whose Age is null do not appear in the result (e.g. Id 6 is absent).
a.filter(~(a['Age'] >= 22)).show()

+---+--------+------+------+----+-----+-----+-------------+-------+-----------+--------+
| Id|Survived|Pclass|   Sex| Age|SibSp|Parch|       Ticket|   Fare|      Cabin|Embarked|
+---+--------+------+------+----+-----+-----+-------------+-------+-----------+--------+
|  8|       0|     3|  male| 2.0|    3|    1|       349909| 21.075|       null|       S|
| 10|       1|     2|female|14.0|    1|    0|       237736|30.0708|       null|       C|
| 11|       1|     3|female| 4.0|    1|    1|      PP 9549|   16.7|         G6|       S|
| 13|       0|     3|  male|20.0|    0|    0|    A/5. 2151|   8.05|       null|       S|
| 15|       0|     3|female|14.0|    0|    0|       350406| 7.8542|       null|       S|
| 17|       0|     3|  male| 2.0|    4|    1|       382652| 29.125|       null|       Q|
| 23|       1|     3|female|15.0|    0|    0|       330923| 8.0292|       null|       Q|
| 25|       0|     3|female| 8.0|    3|    1|       349909| 21.075|       null|       S|
| 28|       0|     1|