In [71]:
# Importing 
from pyspark.sql import SparkSession
import pyspark.sql.functions as sql_func
from pyspark.ml.feature import Imputer

In [72]:
# Start (or reuse) the Spark session used by every later cell
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [73]:
# Load the sample CSV: the first row is the header and column types are inferred.
# NOTE(review): the recorded head()/printSchema() output suggests the values
# carry leading whitespace and Age ends up as string — consider
# ignoreLeadingWhiteSpace=True if numeric inference is wanted.
spark_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("./data/sample.csv")
)

In [74]:
# List the column names of the loaded DataFrame
spark_df.columns

['Name', 'Age', 'City']

In [75]:
# Preview the raw rows exactly as they were read from the CSV
spark_df.show()

+---------------+---+--------------+
|           Name|Age|          City|
+---------------+---+--------------+
|       John Doe| 25|      New York|
|     Jane Smith| 30|   Los Angeles|
|Michael Johnson| 35|       Chicago|
|    Emily Brown| 28| San Francisco|
|       John Doe| 25|      New York|
|     Jane Smith| 30|              |
|Michael Johnson|   |       Chicago|
|           null| 28| San Francisco|
+---------------+---+--------------+



In [76]:
# Fetch the first two rows as Row objects (the repr exposes the raw string values)
spark_df.head(2)

[Row(Name='John Doe', Age=' 25', City=' New York'),
 Row(Name='Jane Smith', Age=' 30', City=' Los Angeles')]

In [77]:
# Print the schema: column names, data types and nullability
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- City: string (nullable = true)



In [78]:
# Summary statistics (count, mean, stddev, min, max) for every column
spark_df.describe().show()

+-------+---------------+------------------+--------------+
|summary|           Name|               Age|          City|
+-------+---------------+------------------+--------------+
|  count|              7|                 8|             8|
|   mean|           null|28.714285714285715|          null|
| stddev|           null| 3.450327796711771|          null|
|    min|    Emily Brown|                  |              |
|    max|Michael Johnson|                35| San Francisco|
+-------+---------------+------------------+--------------+



In [79]:
# Select a single column (Name) and display it
spark_df.select("Name").show()

+---------------+
|           Name|
+---------------+
|       John Doe|
|     Jane Smith|
|Michael Johnson|
|    Emily Brown|
|       John Doe|
|     Jane Smith|
|Michael Johnson|
|           null|
+---------------+



In [80]:
# Total number of rows in the DataFrame
spark_df.count()

8

In [81]:
# Count nulls per column: each null is flagged as 1 and the flags are summed
na_counts = spark_df.select(
    *(
        sql_func.sum(sql_func.col(name).isNull().cast("integer")).alias(name)
        for name in spark_df.columns
    )
)
na_counts.show()

+----+---+----+
|Name|Age|City|
+----+---+----+
|   1|  0|   0|
+----+---+----+



In [82]:
# Converting blank strings to null
# Any value that is empty or made up only of spaces (not just exactly one
# space) is replaced with null; every other value is kept unchanged.
# A single select() rebuilds all columns in one projection instead of
# stacking one withColumn() call per column.
spark_df = spark_df.select(
    *[
        sql_func.when(sql_func.trim(sql_func.col(column)) == "", None)
        .otherwise(sql_func.col(column))
        .alias(column)
        for column in spark_df.columns
    ]
)

In [83]:
# Display the DataFrame again to check how blank values are now represented
spark_df.show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|  25|      New York|
|     Jane Smith|  30|   Los Angeles|
|Michael Johnson|  35|       Chicago|
|    Emily Brown|  28| San Francisco|
|       John Doe|  25|      New York|
|     Jane Smith|  30|          null|
|Michael Johnson|null|       Chicago|
|           null|  28| San Francisco|
+---------------+----+--------------+



In [84]:
# Count nulls per column again and display the result directly
spark_df.select(
    *[
        sql_func.sum(sql_func.col(name).isNull().cast("integer")).alias(name)
        for name in spark_df.columns
    ]
).show()

+----+---+----+
|Name|Age|City|
+----+---+----+
|   1|  1|   1|
+----+---+----+



In [87]:
# Keep only rows where both Name and City are present (Age may still be null)
spark_df_notna = spark_df.dropna(how="any", subset=["Name", "City"])

In [88]:
# Display the rows that survived the Name/City null filter
spark_df_notna.show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|  25|      New York|
|     Jane Smith|  30|   Los Angeles|
|Michael Johnson|  35|       Chicago|
|    Emily Brown|  28| San Francisco|
|       John Doe|  25|      New York|
|Michael Johnson|null|       Chicago|
+---------------+----+--------------+



In [89]:
# Apply the Name/City not-null filter inline and display it without storing the result
spark_df.where(
    sql_func.col("Name").isNotNull() & sql_func.col("City").isNotNull()
).show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|  25|      New York|
|     Jane Smith|  30|   Los Angeles|
|Michael Johnson|  35|       Chicago|
|    Emily Brown|  28| San Francisco|
|       John Doe|  25|      New York|
|Michael Johnson|null|       Chicago|
+---------------+----+--------------+



In [90]:
# Single-column DataFrame holding only Name
# NOTE(review): `names` is not referenced again in the visible cells.
names = spark_df.select(sql_func.col("Name"))

In [91]:
# Derive a new column Age_1 as Age + 2
# NOTE(review): Age is still a string column at this point, so Spark coerces it
# for the addition — the recorded output shows Age_1 as a double (27.0).
spark_df_2 = spark_df.withColumn(
    "Age_1",
    sql_func.col("Age") + sql_func.lit(2),
)

In [92]:
# Display the frame including the derived Age_1 column
spark_df_2.show()

+---------------+----+--------------+-----+
|           Name| Age|          City|Age_1|
+---------------+----+--------------+-----+
|       John Doe|  25|      New York| 27.0|
|     Jane Smith|  30|   Los Angeles| 32.0|
|Michael Johnson|  35|       Chicago| 37.0|
|    Emily Brown|  28| San Francisco| 30.0|
|       John Doe|  25|      New York| 27.0|
|     Jane Smith|  30|          null| 32.0|
|Michael Johnson|null|       Chicago| null|
|           null|  28| San Francisco| 30.0|
+---------------+----+--------------+-----+



In [93]:
# Check the column types of the filtered frame before casting Age
spark_df_notna.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- City: string (nullable = true)



In [94]:
# Cast Age to integer (replacing the existing Age column), then display the result
spark_df_age_int = spark_df_notna.withColumn(
    "Age",
    spark_df_notna["Age"].cast("integer"),
)
spark_df_age_int.show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|  25|      New York|
|     Jane Smith|  30|   Los Angeles|
|Michael Johnson|  35|       Chicago|
|    Emily Brown|  28| San Francisco|
|       John Doe|  25|      New York|
|Michael Johnson|null|       Chicago|
+---------------+----+--------------+



In [95]:
# Filling na with mean
# The Imputer is created with default settings; getStrategy() is called to
# inspect which strategy it will use (the recorded output is 'mean').
imputer = Imputer()
imputer.getStrategy()

'mean'

In [96]:
# Read from and write back to the same column, so Age is imputed in place
imputer.setInputCol("Age").setOutputCol("Age")

Imputer_ead73a349298

In [102]:
# Fit the imputer on the integer-typed frame to obtain the model used for filling Age
imputer_model = imputer.fit(spark_df_age_int)


In [105]:
# Apply the fitted imputer to fill the missing Age values.
# show() only prints and returns None, so the transformed DataFrame is bound
# first and displayed in a separate step; chaining .show() onto the assignment
# would leave spark_df_mean_age set to None.
spark_df_mean_age = imputer_model.transform(spark_df_age_int)
spark_df_mean_age.show()

+---------------+---+--------------+
|           Name|Age|          City|
+---------------+---+--------------+
|       John Doe| 25|      New York|
|     Jane Smith| 30|   Los Angeles|
|Michael Johnson| 35|       Chicago|
|    Emily Brown| 28| San Francisco|
|       John Doe| 25|      New York|
|Michael Johnson| 28|       Chicago|
+---------------+---+--------------+



In [98]:
# Display the input frame again: transform() returned a new DataFrame, so this one still has the null Age
spark_df_age_int.show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|  25|      New York|
|     Jane Smith|  30|   Los Angeles|
|Michael Johnson|  35|       Chicago|
|    Emily Brown|  28| San Francisco|
|       John Doe|  25|      New York|
|Michael Johnson|null|       Chicago|
+---------------+----+--------------+



In [99]:
# Single-column DataFrame holding only Name
# NOTE(review): `name` is not referenced again in the visible cells.
name = spark_df.select("Name")

In [100]:
# Concatenating a string suffix onto a column: Name_1 is Name followed by "_1"
# NOTE(review): this rebinds spark_df_2, replacing the earlier frame that held Age_1.
spark_df_2 = spark_df.withColumn(
    "Name_1",
    sql_func.concat(sql_func.col("Name"), sql_func.lit("_1")),
)

In [101]:
# Display the frame including the derived Name_1 column
spark_df_2.show()

+---------------+----+--------------+-----------------+
|           Name| Age|          City|           Name_1|
+---------------+----+--------------+-----------------+
|       John Doe|  25|      New York|       John Doe_1|
|     Jane Smith|  30|   Los Angeles|     Jane Smith_1|
|Michael Johnson|  35|       Chicago|Michael Johnson_1|
|    Emily Brown|  28| San Francisco|    Emily Brown_1|
|       John Doe|  25|      New York|       John Doe_1|
|     Jane Smith|  30|          null|     Jane Smith_1|
|Michael Johnson|null|       Chicago|Michael Johnson_1|
|           null|  28| San Francisco|             null|
+---------------+----+--------------+-----------------+



In [89]:
# Drop the derived columns and display what remains.
# NOTE(review): spark_df_2 as last assigned has Name_1 but no Age_1, and the
# recorded output (4 rows, Age shown as 25.0) together with the out-of-sequence
# In [89] counter suggests it came from a different kernel state — re-run to confirm.
spark_df_2.drop("Name_1", "Age_1").show()

+---------------+----+--------------+
|           Name| Age|          City|
+---------------+----+--------------+
|       John Doe|25.0|      New York|
|     Jane Smith|30.0|   Los Angeles|
|Michael Johnson|35.0|       Chicago|
|    Emily Brown|28.0| San Francisco|
+---------------+----+--------------+

