In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the SparkSession for this notebook (an existing session is reused if
# one is already running); the application is registered as "practice".
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

In [4]:
# Load test.csv: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as string.
pyspark_df = spark.read.csv(
    "test.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Print the loaded rows as a text table to eyeball the data (note the NULLs).
pyspark_df.show()

+-------+----+----+------+
|   name| age| exp|salary|
+-------+----+----+------+
|karthik|  20|   1| 10000|
|  gokul|  35|  10|  NULL|
|   subu|  25|   4| 20000|
|   ram |NULL|   1| 10000|
| muthu |  45|  20| 80000|
|kishore|  32|  10| 25000|
|   john|  27|NULL| 15000|
|aravind|  34|  10| 25000|
|   arun|  26|   5| 10000|
+-------+----+----+------+



In [6]:
# Confirm that the reader returned a Spark DataFrame.
type(pyspark_df)

pyspark.sql.dataframe.DataFrame

In [7]:
# Show each column's name, inferred type and nullability.
pyspark_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [8]:
# Column names as a plain Python list.
pyspark_df.columns

['name', 'age', 'exp', 'salary']

In [9]:
# Project a single column; select() returns a new DataFrame, not a list.
pyspark_df.select('name').show()

+-------+
|   name|
+-------+
|karthik|
|  gokul|
|   subu|
|   ram |
| muthu |
|kishore|
|   john|
|aravind|
|   arun|
+-------+



In [10]:
# Project several columns by passing each column name as its own argument.
pyspark_df.select('name', 'age').show()

+-------+----+
|   name| age|
+-------+----+
|karthik|  20|
|  gokul|  35|
|   subu|  25|
|   ram |NULL|
| muthu |  45|
|kishore|  32|
|   john|  27|
|aravind|  34|
|   arun|  26|
+-------+----+



In [11]:
# (column name, type name) pairs for every column.
pyspark_df.dtypes

[('name', 'string'), ('age', 'int'), ('exp', 'int'), ('salary', 'int')]

In [12]:
# Summary statistics for every column. `count` only counts non-NULL values,
# so it is lower than the row count for columns that contain NULLs. For the
# string column `name`, mean/stddev are NULL and min/max compare the strings.
pyspark_df.describe().show()

+-------+-------+-----------------+-----------------+-----------------+
|summary|   name|              age|              exp|           salary|
+-------+-------+-----------------+-----------------+-----------------+
|  count|      9|                8|                8|                8|
|   mean|   NULL|             30.5|            7.625|          24375.0|
| stddev|   NULL|7.727501906456298|6.300510183423925|23366.26078038895|
|    min|aravind|               20|                1|            10000|
|    max|   subu|               45|               20|            80000|
+-------+-------+-----------------+-----------------+-----------------+



In [13]:
# Append a derived column holding age + 2. Rows whose age is NULL produce
# NULL in the new column as well.
pyspark_df = pyspark_df.withColumn(
    'Age after 2 yrs',
    pyspark_df['age'] + 2,
)

In [14]:
# Verify that the derived column was appended.
pyspark_df.show()

+-------+----+----+------+---------------+
|   name| age| exp|salary|Age after 2 yrs|
+-------+----+----+------+---------------+
|karthik|  20|   1| 10000|             22|
|  gokul|  35|  10|  NULL|             37|
|   subu|  25|   4| 20000|             27|
|   ram |NULL|   1| 10000|           NULL|
| muthu |  45|  20| 80000|             47|
|kishore|  32|  10| 25000|             34|
|   john|  27|NULL| 15000|             29|
|aravind|  34|  10| 25000|             36|
|   arun|  26|   5| 10000|             28|
+-------+----+----+------+---------------+



In [15]:
# Remove the derived column again; drop() returns a new DataFrame, so the
# result is assigned back to keep the change.
pyspark_df = pyspark_df.drop('Age after 2 yrs')

In [16]:
# Verify that only the original four columns remain.
pyspark_df.show()

+-------+----+----+------+
|   name| age| exp|salary|
+-------+----+----+------+
|karthik|  20|   1| 10000|
|  gokul|  35|  10|  NULL|
|   subu|  25|   4| 20000|
|   ram |NULL|   1| 10000|
| muthu |  45|  20| 80000|
|kishore|  32|  10| 25000|
|   john|  27|NULL| 15000|
|aravind|  34|  10| 25000|
|   arun|  26|   5| 10000|
+-------+----+----+------+



In [17]:
# Preview a rename of `name` to `full name`. The result is only shown, not
# assigned, so `pyspark_df` itself still has the column called `name`.
pyspark_df.withColumnRenamed('name', 'full name').show()

+---------+----+----+------+
|full name| age| exp|salary|
+---------+----+----+------+
|  karthik|  20|   1| 10000|
|    gokul|  35|  10|  NULL|
|     subu|  25|   4| 20000|
|     ram |NULL|   1| 10000|
|   muthu |  45|  20| 80000|
|  kishore|  32|  10| 25000|
|     john|  27|NULL| 15000|
|  aravind|  34|  10| 25000|
|     arun|  26|   5| 10000|
+---------+----+----+------+



In [18]:
# Confirm that the original DataFrame was not modified by the rename above.
pyspark_df.show()

+-------+----+----+------+
|   name| age| exp|salary|
+-------+----+----+------+
|karthik|  20|   1| 10000|
|  gokul|  35|  10|  NULL|
|   subu|  25|   4| 20000|
|   ram |NULL|   1| 10000|
| muthu |  45|  20| 80000|
|kishore|  32|  10| 25000|
|   john|  27|NULL| 15000|
|aravind|  34|  10| 25000|
|   arun|  26|   5| 10000|
+-------+----+----+------+



In [19]:
# Shorter alias for the same DataFrame object (plain assignment, no copy).
df = pyspark_df

In [20]:
# Show the data with every row that contains at least one NULL removed.
# The result is not assigned, so `df` still holds all rows.
df.na.drop().show()

+-------+---+---+------+
|   name|age|exp|salary|
+-------+---+---+------+
|karthik| 20|  1| 10000|
|   subu| 25|  4| 20000|
| muthu | 45| 20| 80000|
|kishore| 32| 10| 25000|
|aravind| 34| 10| 25000|
|   arun| 26|  5| 10000|
+-------+---+---+------+



In [21]:
# Summary statistics for the `exp` column only.
# NOTE(review): without a trailing .show() this cell displays only the
# resulting DataFrame's repr (its column names and types), not the
# statistics themselves - confirm whether .show() was intended here.
df.select('exp').describe()

DataFrame[summary: string, exp: string]

In [26]:
# Distribute a small in-memory list of (name, number) tuples as an RDD.
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)

# Printing an RDD shows its description, not its elements.
print(rdd)

ParallelCollectionRDD[54] at readRDDFromFile at PythonRDD.scala:289
