In [43]:
pip install pyspark




In [44]:
import pyspark

In [45]:
import pandas as pd
# Quick look at the raw CSV with pandas before loading the same file into Spark.
pd.read_csv('Test.csv')

Unnamed: 0,Name,Salary
0,Himanshu,100000
1,Shekhar,60000
2,Om,100000000


In [46]:
from pyspark.sql import SparkSession

In [47]:
# Create (or reuse) the Spark session for this notebook under the app name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [48]:
# Plain read with no options: as the outputs below show, the header line ('Name', 'Salary')
# is treated as a data row and the columns get the names _c0 and _c1.
df_pyspark = spark.read.csv('Test.csv')

In [49]:
# Evaluating the DataFrame itself displays only its column names and types, not the rows.
df_pyspark

DataFrame[_c0: string, _c1: string]

In [50]:
# Print the rows as a text table.
df_pyspark.show()

+--------+---------+
|     _c0|      _c1|
+--------+---------+
|    Name|   Salary|
|Himanshu|   100000|
| Shekhar|    60000|
|      Om|100000000|
+--------+---------+



In [51]:
# Re-read the file, this time using its first line as the column names.
df_pyspark = spark.read.csv('Test.csv', header=True)

In [52]:
# Confirm the reader returned a Spark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [53]:
# Print column names and types; with only the header option set, both columns are read as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)



# PySpark DataFrames
#### Reading the dataset
#### Checking the data types of the columns (schema)
#### Selecting columns and indexing
#### Using the describe option (similar to pandas)
#### Adding columns
#### Dropping columns


In [54]:
from pyspark.sql import SparkSession

In [55]:
# Obtain the Spark session used for the DataFrame section.
# NOTE(review): a session named 'Practice' was already created earlier in this notebook;
# getOrCreate() presumably returns that existing session, so confirm whether the
# 'PracticeDataframe' app name actually takes effect.
spark = SparkSession.builder.appName('PracticeDataframe').getOrCreate()

In [56]:
# Display the session object.
spark

In [57]:
 # Read the dataset: the first line is the header and column types are inferred from the data.
df_spark = spark.read.csv('Test.csv', header=True, inferSchema=True)

In [58]:
# Check the schema: with schema inference enabled, Salary is now reported as integer.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [59]:
# Confirm the object is a Spark DataFrame.
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [60]:
# List the column names.
df_spark.columns

['Name', 'Salary']

In [61]:
# Fetch the first three rows and pick the third one (index 2); the result is a single Row.
df_spark.head(3)[2]

Row(Name='Om', Salary=100000000)

In [62]:
# Select columns by name and print the resulting DataFrame.
df_spark.select('Name', 'Salary').show()

+--------+---------+
|    Name|   Salary|
+--------+---------+
|Himanshu|   100000|
| Shekhar|    60000|
|      Om|100000000|
+--------+---------+



In [63]:
# List (column name, type) pairs.
df_spark.dtypes

[('Name', 'string'), ('Salary', 'int')]

In [64]:
# Summary statistics per column (count, mean, stddev, min, max); mean and stddev are NULL for the string column.
df_spark.describe().show()

+-------+--------+--------------------+
|summary|    Name|              Salary|
+-------+--------+--------------------+
|  count|       3|                   3|
|   mean|    NULL|3.3386666666666668E7|
| stddev|    NULL| 5.768884236430241E7|
|    min|Himanshu|               60000|
|    max| Shekhar|           100000000|
+-------+--------+--------------------+



In [65]:
# Add a derived column: every salary increased by a flat 10000.
df_spark = df_spark.withColumn(
    'Salary after 2 Years',
    df_spark['Salary'] + 10000,
)

In [66]:
# Print the DataFrame, now including the 'Salary after 2 Years' column.
df_spark.show()

+--------+---------+--------------------+
|    Name|   Salary|Salary after 2 Years|
+--------+---------+--------------------+
|Himanshu|   100000|              110000|
| Shekhar|    60000|               70000|
|      Om|100000000|           100010000|
+--------+---------+--------------------+



In [67]:
# Drop the derived column again; the result is assigned back, so df_spark no longer has it.
df_spark = df_spark.drop('Salary after 2 Years')

In [68]:
# Print the DataFrame to confirm only Name and Salary remain.
df_spark.show()

+--------+---------+
|    Name|   Salary|
+--------+---------+
|Himanshu|   100000|
| Shekhar|    60000|
|      Om|100000000|
+--------+---------+



In [69]:
 # Renaming columns: show the data with 'Name' renamed to 'Name_1'.
 # The result is only displayed, not assigned, so df_spark itself keeps the column name 'Name'.

df_spark.withColumnRenamed('Name', 'Name_1').show()

+--------+---------+
|  Name_1|   Salary|
+--------+---------+
|Himanshu|   100000|
| Shekhar|    60000|
|      Om|100000000|
+--------+---------+



In [70]:
# Handling missing values

In [72]:
# Load the dataset that contains missing values, with a header row and inferred column types.
df_pyspark_mv = spark.read.csv(
    'Practice_MissingValues.csv',
    header=True,
    inferSchema=True,
)

In [73]:
# Print the rows; missing entries appear as NULL.
df_pyspark_mv.show()

+--------+-------+----------+----+
|    Name| Salary|Experience| Age|
+--------+-------+----------+----+
|Himanshu| 100000|         3|  25|
| Shekhar|  60000|         2|  29|
|      Om|   NULL|         7|  33|
|    Himm| 105000|         5|  12|
|  Sumeet|  87000|         3|NULL|
|   Manoj|1223112|         9|  42|
+--------+-------+----------+----+



In [75]:
# Show the data without the Age column; the result is not assigned, so df_pyspark_mv keeps all its columns.
df_pyspark_mv.drop('Age').show()

+--------+-------+----------+
|    Name| Salary|Experience|
+--------+-------+----------+
|Himanshu| 100000|         3|
| Shekhar|  60000|         2|
|      Om|   NULL|         7|
|    Himm| 105000|         5|
|  Sumeet|  87000|         3|
|   Manoj|1223112|         9|
+--------+-------+----------+



In [76]:
# Print the DataFrame again: the Age column is still present.
df_pyspark_mv.show()

+--------+-------+----------+----+
|    Name| Salary|Experience| Age|
+--------+-------+----------+----+
|Himanshu| 100000|         3|  25|
| Shekhar|  60000|         2|  29|
|      Om|   NULL|         7|  33|
|    Himm| 105000|         5|  12|
|  Sumeet|  87000|         3|NULL|
|   Manoj|1223112|         9|  42|
+--------+-------+----------+----+



In [77]:
# Drop rows containing nulls with the default settings; in the output, the rows with a NULL Salary or Age are gone.
df_pyspark_mv.na.drop().show()

+--------+-------+----------+---+
|    Name| Salary|Experience|Age|
+--------+-------+----------+---+
|Himanshu| 100000|         3| 25|
| Shekhar|  60000|         2| 29|
|    Himm| 105000|         5| 12|
|   Manoj|1223112|         9| 42|
+--------+-------+----------+---+



In [79]:
# Keep only rows that have at least `thresh` (here 1) non-null values among the
# `subset` columns, i.e. drop rows whose Age is null.
# `how` is deliberately not passed: when `thresh` is given it overrides `how`,
# so specifying both is redundant and misleading.
df_pyspark_mv.na.drop(thresh=1, subset=['Age']).show()

+--------+-------+----------+---+
|    Name| Salary|Experience|Age|
+--------+-------+----------+---+
|Himanshu| 100000|         3| 25|
| Shekhar|  60000|         2| 29|
|      Om|   NULL|         7| 33|
|    Himm| 105000|         5| 12|
|   Manoj|1223112|         9| 42|
+--------+-------+----------+---+



In [80]:
# Filling the missing values: