<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/pyspark_test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
try:
  import pyspark
except:
  !pip install pyspark

In [71]:
from pyspark.sql import SparkSession

In [72]:
# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): 'Pratice' looks like a typo for 'Practice'; kept unchanged
# because it is the application name reported at runtime.
spark = (
    SparkSession.builder
    .appName('Pratice')
    .getOrCreate()
)

In [73]:
# Bare last-line expression: the notebook renders the session object's repr.
spark

In [74]:
# Load the semicolon-separated CSV: the first line is treated as the header
# and column types are inferred from the data.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv(
        '/content/my_data/simple_data.csv',
        sep=";",
        inferSchema=True,
    )
)

In [75]:
# Print the schema as a tree: column name, type and nullability.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: double (nullable = true)



In [76]:
# List the DataFrame's column names.
df_pyspark.columns

['name', 'age', 'experience', 'salary']

In [77]:
# Selecting a list of columns returns a new DataFrame; print its type,
# then show the projected rows.
d = df_pyspark.select(['name', 'experience'])
print(type(d))
d.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+-----+----------+
| name|experience|
+-----+----------+
| Kris|         6|
| Adam|         5|
|Sunny|        14|
|  Tom|      NULL|
| John|         3|
| Mark|      NULL|
|  Bob|         7|
|Ellen|         7|
|  Jim|      NULL|
| NULL|        11|
| NULL|        17|
+-----+----------+



In [78]:
# Indexing by name yields a Column expression rather than data; since it is
# not the cell's last expression, nothing is displayed for it.
df_pyspark['name']
# (column name, type) pairs for every column.
df_pyspark.dtypes

[('name', 'string'),
 ('age', 'int'),
 ('experience', 'int'),
 ('salary', 'double')]

In [79]:
# Summary statistics per column: count, mean, stddev, min and max.
df_pyspark.describe().show()

+-------+----+-----------------+-----------------+------------------+
|summary|name|              age|       experience|            salary|
+-------+----+-----------------+-----------------+------------------+
|  count|   9|               10|                8|                10|
|   mean|NULL|             35.0|             8.75|2373.4710000000005|
| stddev|NULL|4.189935029992179|4.803272693844122| 833.8184660204321|
|    min|Adam|               28|                3|           1230.99|
|    max| Tom|               40|               17|           4280.45|
+-------+----+-----------------+-----------------+------------------+



In [80]:
# Add a derived column holding experience multiplied by 12.
# Rows whose experience is NULL get NULL in the new column as well.
# df_pyspark is rebound, so later cells see the extra column.
df_pyspark = df_pyspark.withColumn('monthly_experience', df_pyspark['experience']*12)
df_pyspark.show()

+-----+----+----------+-------+------------------+
| name| age|experience| salary|monthly_experience|
+-----+----+----------+-------+------------------+
| Kris|  31|         6| 2230.3|                72|
| Adam|  30|         5|2230.89|                60|
|Sunny|  39|        14|3230.21|               168|
|  Tom|  28|      NULL|2230.32|              NULL|
| John|  40|         3|   NULL|                36|
| Mark|  36|      NULL|1730.04|              NULL|
|  Bob|  35|         7|2130.05|                84|
|Ellen|  34|         7|2230.96|                84|
|  Jim|NULL|      NULL|1230.99|              NULL|
| NULL|  39|        11| 2210.5|               132|
| NULL|  38|        17|4280.45|               204|
+-----+----+----------+-------+------------------+



In [81]:
# Drop the 'monthly_experience' column and rebind df_pyspark to the result.
df_pyspark = df_pyspark.drop('monthly_experience')
df_pyspark.show()

+-----+----+----------+-------+
| name| age|experience| salary|
+-----+----+----------+-------+
| Kris|  31|         6| 2230.3|
| Adam|  30|         5|2230.89|
|Sunny|  39|        14|3230.21|
|  Tom|  28|      NULL|2230.32|
| John|  40|         3|   NULL|
| Mark|  36|      NULL|1730.04|
|  Bob|  35|         7|2130.05|
|Ellen|  34|         7|2230.96|
|  Jim|NULL|      NULL|1230.99|
| NULL|  39|        11| 2210.5|
| NULL|  38|        17|4280.45|
+-----+----+----------+-------+



In [82]:
# Rename column 'name' to 'first_name'; every later cell must use the new name.
df_pyspark = df_pyspark.withColumnRenamed('name', 'first_name')
df_pyspark.show()

+----------+----+----------+-------+
|first_name| age|experience| salary|
+----------+----+----------+-------+
|      Kris|  31|         6| 2230.3|
|      Adam|  30|         5|2230.89|
|     Sunny|  39|        14|3230.21|
|       Tom|  28|      NULL|2230.32|
|      John|  40|         3|   NULL|
|      Mark|  36|      NULL|1730.04|
|       Bob|  35|         7|2130.05|
|     Ellen|  34|         7|2230.96|
|       Jim|NULL|      NULL|1230.99|
|      NULL|  39|        11| 2210.5|
|      NULL|  38|        17|4280.45|
+----------+----+----------+-------+



In [83]:
# how='any': drop every row that contains at least one NULL value.
# The result is only shown; df_pyspark itself is not modified.
df_pyspark.na.drop(how='any').show()

+----------+---+----------+-------+
|first_name|age|experience| salary|
+----------+---+----------+-------+
|      Kris| 31|         6| 2230.3|
|      Adam| 30|         5|2230.89|
|     Sunny| 39|        14|3230.21|
|       Bob| 35|         7|2130.05|
|     Ellen| 34|         7|2230.96|
+----------+---+----------+-------+



In [82]:
# how='all': drop a row only when every one of its values is NULL
# (rows that are merely partially NULL are kept).
df_pyspark.na.drop(how='all').show()

In [91]:
# thresh=3: keep only rows that have at least 3 non-null values.
# `how` is deliberately omitted: when thresh is given it overrides `how`,
# so passing how='any' alongside it had no effect and was misleading.
df_pyspark.na.drop(thresh=3).show()

+----------+---+----------+-------+
|first_name|age|experience| salary|
+----------+---+----------+-------+
|      Kris| 31|         6| 2230.3|
|      Adam| 30|         5|2230.89|
|     Sunny| 39|        14|3230.21|
|       Tom| 28|      NULL|2230.32|
|      John| 40|         3|   NULL|
|      Mark| 36|      NULL|1730.04|
|       Bob| 35|         7|2130.05|
|     Ellen| 34|         7|2230.96|
|      NULL| 39|        11| 2210.5|
|      NULL| 38|        17|4280.45|
+----------+---+----------+-------+



In [92]:
# subset=['experience']: only the 'experience' column is checked, so rows
# with a NULL there are dropped while NULLs in other columns are tolerated.
df_pyspark.na.drop(how='any', subset=['experience']).show()

+----------+---+----------+-------+
|first_name|age|experience| salary|
+----------+---+----------+-------+
|      Kris| 31|         6| 2230.3|
|      Adam| 30|         5|2230.89|
|     Sunny| 39|        14|3230.21|
|      John| 40|         3|   NULL|
|       Bob| 35|         7|2130.05|
|     Ellen| 34|         7|2230.96|
|      NULL| 39|        11| 2210.5|
|      NULL| 38|        17|4280.45|
+----------+---+----------+-------+



In [98]:
# Fill missing values: replace NULLs in 'age' and 'experience' with 0.
# Other columns are left as they are.
df_pyspark.na.fill(0, subset=['age', 'experience']).show()

+----------+---+----------+-------+
|first_name|age|experience| salary|
+----------+---+----------+-------+
|      Kris| 31|         6| 2230.3|
|      Adam| 30|         5|2230.89|
|     Sunny| 39|        14|3230.21|
|       Tom| 28|         0|2230.32|
|      John| 40|         3|   NULL|
|      Mark| 36|         0|1730.04|
|       Bob| 35|         7|2130.05|
|     Ellen| 34|         7|2230.96|
|       Jim|  0|         0|1230.99|
|      NULL| 39|        11| 2210.5|
|      NULL| 38|        17|4280.45|
+----------+---+----------+-------+



In [96]:
# Replace NULLs in 'first_name' with the placeholder 'unknown'.
df_pyspark.na.fill('unknown', subset=['first_name']).show()

+----------+----+----------+-------+
|first_name| age|experience| salary|
+----------+----+----------+-------+
|      Kris|  31|         6| 2230.3|
|      Adam|  30|         5|2230.89|
|     Sunny|  39|        14|3230.21|
|       Tom|  28|      NULL|2230.32|
|      John|  40|         3|   NULL|
|      Mark|  36|      NULL|1730.04|
|       Bob|  35|         7|2130.05|
|     Ellen|  34|         7|2230.96|
|       Jim|NULL|      NULL|1230.99|
|   unknown|  39|        11| 2210.5|
|   unknown|  38|        17|4280.45|
+----------+----+----------+-------+



In [104]:
from pyspark.ml.feature import Imputer

# Fill NULLs in 'age' and 'salary' with each column's median, writing the
# result to new '<column>_imputed' columns next to the originals.
# Available strategies: mode / mean / median.
imputer = Imputer(
    strategy='median',
    inputCols=['age', 'salary'],
    outputCols=[f'{col_name}_imputed' for col_name in ['age', 'salary']],
)

imputer.fit(df_pyspark).transform(df_pyspark).show()

+----------+----+----------+-------+-----------+--------------+
|first_name| age|experience| salary|age_imputed|salary_imputed|
+----------+----+----------+-------+-----------+--------------+
|      Kris|  31|         6| 2230.3|         31|        2230.3|
|      Adam|  30|         5|2230.89|         30|       2230.89|
|     Sunny|  39|        14|3230.21|         39|       3230.21|
|       Tom|  28|      NULL|2230.32|         28|       2230.32|
|      John|  40|         3|   NULL|         40|        2230.3|
|      Mark|  36|      NULL|1730.04|         36|       1730.04|
|       Bob|  35|         7|2130.05|         35|       2130.05|
|     Ellen|  34|         7|2230.96|         34|       2230.96|
|       Jim|NULL|      NULL|1230.99|         35|       1230.99|
|      NULL|  39|        11| 2210.5|         39|        2210.5|
|      NULL|  38|        17|4280.45|         38|       4280.45|
+----------+----+----------+-------+-----------+--------------+

