In [0]:
# Importando as bibliotecas que serão usadas no laboratório.

from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *


In [0]:
path = '/FileStore/tables/titanic.csv'
file_type = 'csv'

df_titanic = spark \
             .read.format(file_type) \
             .option("inferSchema" , "True") \
             .option("header" , "True") \
             .option("sep", ",") \
             .csv(path)

In [0]:
#Nome das Colunas
df_titanic.columns

Out[19]: ['PassengerId',
 'Survived',
 'Classe',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
#Exibindo o dataframe
df_titanic.show(5)
# df_titanic.head(5) - Pandas

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [0]:
# verificando o tipo de colunas
df_titanic.printSchema()
#df_titanic.info() - pandas

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
#Renomeando colunas
df_titanic = df_titanic.withColumnRenamed("Pclass" , "Classe")


In [0]:
#Mostrando dataframe com filtro
df_titanic.filter(df_titanic['Survived'] == 1).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+--------+-----+--------+
|PassengerId|Survived|Classe|                Name|   Sex| Age|SibSp|Parch|          Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+--------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599| 71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|   7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|    53.1| C123|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742| 11.1333| null|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|          237736| 30.0708| null|       C|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    

In [0]:
#Selecionando Colunas
df_titanic.select("Name").show()

+--------------------+
|                Name|
+--------------------+
|Braund, Mr. Owen ...|
|Cumings, Mrs. Joh...|
|Heikkinen, Miss. ...|
|Futrelle, Mrs. Ja...|
|Allen, Mr. Willia...|
|    Moran, Mr. James|
|McCarthy, Mr. Tim...|
|Palsson, Master. ...|
|Johnson, Mrs. Osc...|
|Nasser, Mrs. Nich...|
|Sandstrom, Miss. ...|
|Bonnell, Miss. El...|
|Saundercock, Mr. ...|
|Andersson, Mr. An...|
|Vestrom, Miss. Hu...|
|Hewlett, Mrs. (Ma...|
|Rice, Master. Eugene|
|Williams, Mr. Cha...|
|Vander Planke, Mr...|
|Masselmani, Mrs. ...|
+--------------------+
only showing top 20 rows



In [0]:
#Selecionar Colunas com Alias
df_titanic.select(df_titanic['Name'].alias('Nome')).show()

+--------------------+
|                Nome|
+--------------------+
|Braund, Mr. Owen ...|
|Cumings, Mrs. Joh...|
|Heikkinen, Miss. ...|
|Futrelle, Mrs. Ja...|
|Allen, Mr. Willia...|
|    Moran, Mr. James|
|McCarthy, Mr. Tim...|
|Palsson, Master. ...|
|Johnson, Mrs. Osc...|
|Nasser, Mrs. Nich...|
|Sandstrom, Miss. ...|
|Bonnell, Miss. El...|
|Saundercock, Mr. ...|
|Andersson, Mr. An...|
|Vestrom, Miss. Hu...|
|Hewlett, Mrs. (Ma...|
|Rice, Master. Eugene|
|Williams, Mr. Cha...|
|Vander Planke, Mr...|
|Masselmani, Mrs. ...|
+--------------------+
only showing top 20 rows



In [0]:
#Organizar as colunas
df_titanic.columns

Out[31]: ['PassengerId',
 'Survived',
 'Classe',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
#Organizar as colunas
df_titanic.select('Name','Survived','Classe',
                  'Sex', 'Age', 'SibSp', 'Parch', 
                  'Ticket', 'Fare', 'Cabin', 'Embarked').show()

+--------------------+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|                Name|Survived|Classe|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+--------------------+--------+------+------+----+-----+-----+----------------+-------+-----+--------+
|Braund, Mr. Owen ...|       0|     3|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|Cumings, Mrs. Joh...|       1|     1|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|Heikkinen, Miss. ...|       1|     3|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|Futrelle, Mrs. Ja...|       1|     1|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|Allen, Mr. Willia...|       0|     3|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|    Moran, Mr. James|       0|     3|  male|null|    0|    0|          330877| 8.4583| null|       Q|
|McCarthy, Mr. Tim...|       0|     1|  male|54.0|    0|    0|           

In [0]:
#Filtros
df_titanic.filter(df_titanic["Sex"] == "male").show()

+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----------+--------+
|PassengerId|Survived|Classe|                Name| Sex| Age|SibSp|Parch|    Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|male|22.0|    1|    0| A/5 21171|   7.25|       null|       S|
|          5|       0|     3|Allen, Mr. Willia...|male|35.0|    0|    0|    373450|   8.05|       null|       S|
|          6|       0|     3|    Moran, Mr. James|male|null|    0|    0|    330877| 8.4583|       null|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|male|54.0|    0|    0|     17463|51.8625|        E46|       S|
|          8|       0|     3|Palsson, Master. ...|male| 2.0|    3|    1|    349909| 21.075|       null|       S|
|         13|       0|     3|Saundercock, Mr. ...|male|20.0|    0|    0| A/5. 2151|   8.05|     

In [0]:
#Filtrar DF com 2 condições
df_titanic.filter((df_titanic["Sex"] == "male") & (df_titanic["Survived"] == "0")).show()

+-----------+--------+------+--------------------+----+----+-----+-----+---------------+-------+-----------+--------+
|PassengerId|Survived|Classe|                Name| Sex| Age|SibSp|Parch|         Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+---------------+-------+-----------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|male|22.0|    1|    0|      A/5 21171|   7.25|       null|       S|
|          5|       0|     3|Allen, Mr. Willia...|male|35.0|    0|    0|         373450|   8.05|       null|       S|
|          6|       0|     3|    Moran, Mr. James|male|null|    0|    0|         330877| 8.4583|       null|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|male|54.0|    0|    0|          17463|51.8625|        E46|       S|
|          8|       0|     3|Palsson, Master. ...|male| 2.0|    3|    1|         349909| 21.075|       null|       S|
|         13|       0|     3|Saundercock, Mr. ...|male|2

In [0]:
#Filtrando algumas colunas com filtro
df_titanic.select("PassengerId","Name","Survived","Sex") \
           .filter((df_titanic["Sex"] == "male") & (df_titanic["Survived"] == "0")).show()

+-----------+--------------------+--------+----+
|PassengerId|                Name|Survived| Sex|
+-----------+--------------------+--------+----+
|          1|Braund, Mr. Owen ...|       0|male|
|          5|Allen, Mr. Willia...|       0|male|
|          6|    Moran, Mr. James|       0|male|
|          7|McCarthy, Mr. Tim...|       0|male|
|          8|Palsson, Master. ...|       0|male|
|         13|Saundercock, Mr. ...|       0|male|
|         14|Andersson, Mr. An...|       0|male|
|         17|Rice, Master. Eugene|       0|male|
|         21|Fynney, Mr. Joseph J|       0|male|
|         27|Emir, Mr. Farred ...|       0|male|
|         28|Fortune, Mr. Char...|       0|male|
|         30| Todoroff, Mr. Lalio|       0|male|
|         31|Uruchurtu, Don. M...|       0|male|
|         34|Wheadon, Mr. Edwa...|       0|male|
|         35|Meyer, Mr. Edgar ...|       0|male|
|         36|Holverson, Mr. Al...|       0|male|
|         38|Cann, Mr. Ernest ...|       0|male|
|         43| Kraeff

In [0]:
#Criar novas colunas (usando a função lit)
df_titanic.withColumn('Nova_Coluna',lit(2018))

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-323532469132119>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m#Criar novas colunas (usando a função lit)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdf_titanic[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m'Nova_Coluna'[0m[0;34m,[0m[0mlit[0m[0;34m([0m[0;36m2018[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdf_titanic[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m'Nova_Coluna2'[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;36m10[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mTypeError[0m: withColumn() missing 1 required positional argument: 'col'

In [0]:
#Criar coluna condicional(usando função substring)
df_titanic.withColumn("Sub",substring('Sex',0,1)).show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---+
|PassengerId|Survived|Classe|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Sub|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|  m|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|  f|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|  f|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|  f|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|  m|
+-----------+--------+------+-------------------

In [0]:
#Criar coluna condicional (Concat /concat_ws)
df_titanic.withColumn("Concat", concat('Survived','Classe')).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------+
|PassengerId|Survived|Classe|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Concat|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|    03|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    11|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|    13|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    11|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|    03|
|          6|       0|  

In [0]:
#Criar coluna condicional (Concat /concat_ws)
df_titanic.withColumn("Concat1", concat_ws('-','Survived','Classe')).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Classe|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Concat1|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|    0-3|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    1-1|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|    1-3|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    1-1|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|    0-3|
|          6|   

In [0]:
#Mudando o tipo da coluna de double para int
df_titanic = df_titanic.withColumn('Age',col('Age').cast(StringType()))

In [0]:
df_titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Classe: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
#Criando função lambda
#Nesta função e possivel chamar pelo nome da variavel e passar um valor(coluna) para transformação
#Mais detalhes : https://www.youtube.com/watch?v=ycZZs4371us  minutos 50:00
fun_age_lambda = udf(lambda age: age.split('.')[0])

In [0]:
#Chamando a função lambda e filtrando o dataset
df_titanic.withColumn('Idade', fun_age_lambda('Age')).filter(df_titanic['Age'] != 'null').show()

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-----+
|PassengerId|Survived|Classe|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Idade|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+-----+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|   22|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|   38|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|   26|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|   35|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|   35|
|          7|       0|     1|McCarthy, M