<h1>PySpark install and intro</h1>

In [4]:
import pyspark


In [1]:
pip install pyspark



In [1]:
import pyspark

In [2]:
import pandas as pd
df = pd.read_csv('test1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        3 non-null      object
 1   Age         3 non-null      int64 
 2   Experience  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('practise').getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv('test1.csv')
df

DataFrame[_c0: string, _c1: string, _c2: string]

In [6]:
df = spark.read.option('header', 'true').csv('test1.csv')
df

DataFrame[Name: string, Age: string, Experience: string]

In [8]:
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|   Naveen| 24|        10|
|  Bhargav| 22|        10|
|Chennamma| 46|        20|
+---------+---+----------+



In [9]:
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
df.head(3)

[Row(Name='Naveen', Age='24', Experience='10'),
 Row(Name='Bhargav', Age='22', Experience='10'),
 Row(Name='Chennamma', Age='46', Experience='20')]

In [11]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [12]:
spark.stop()

<h1>PySpark Dataframes</h1>

In [13]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [14]:
spark

In [15]:
df = spark.read.option('header', 'true').csv('test1.csv')
df.printSchema()
df.head(3)

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



[Row(Name='Naveen', Age='24', Experience='10'),
 Row(Name='Bhargav', Age='22', Experience='10'),
 Row(Name='Chennamma', Age='46', Experience='20')]

In [16]:
df = spark.read.csv('test1.csv', header = True, inferSchema=True)
df.printSchema()
df.head(3)

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



[Row(Name='Naveen', Age=24, Experience=10),
 Row(Name='Bhargav', Age=22, Experience=10),
 Row(Name='Chennamma', Age=46, Experience=20)]

In [17]:
type(df)

pyspark.sql.dataframe.DataFrame

In [18]:
df.columns

['Name', 'Age', 'Experience']

In [19]:
print(df)

DataFrame[Name: string, Age: int, Experience: int]


In [20]:
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|   Naveen| 24|        10|
|  Bhargav| 22|        10|
|Chennamma| 46|        20|
+---------+---+----------+



In [21]:
df.select(['Name', 'Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|   Naveen|        10|
|  Bhargav|        10|
|Chennamma|        20|
+---------+----------+



In [22]:
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [23]:
df.describe().show()

+-------+-------+------------------+------------------+
|summary|   Name|               Age|        Experience|
+-------+-------+------------------+------------------+
|  count|      3|                 3|                 3|
|   mean|   null|30.666666666666668|13.333333333333334|
| stddev|   null|13.316656236958785| 5.773502691896257|
|    min|Bhargav|                22|                10|
|    max| Naveen|                46|                20|
+-------+-------+------------------+------------------+



In [24]:
#add a column
df = df.withColumn('Experience after 2 years', df['Experience']+2)
df.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience after 2 years|
+---------+---+----------+------------------------+
|   Naveen| 24|        10|                      12|
|  Bhargav| 22|        10|                      12|
|Chennamma| 46|        20|                      22|
+---------+---+----------+------------------------+



In [25]:
#delete a column
df = df.drop('Experience after 2 years')
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|   Naveen| 24|        10|
|  Bhargav| 22|        10|
|Chennamma| 46|        20|
+---------+---+----------+



In [26]:
#rename column
df = df.withColumnRenamed('Name', 'New Name')
df.show()

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|   Naveen| 24|        10|
|  Bhargav| 22|        10|
|Chennamma| 46|        20|
+---------+---+----------+



In [27]:
df = df.withColumn('Salary', df['Experience']*10).withColumn('Bonus', df['Experience']*10/2)
df.show()

+---------+---+----------+------+-----+
| New Name|Age|Experience|Salary|Bonus|
+---------+---+----------+------+-----+
|   Naveen| 24|        10|   100| 50.0|
|  Bhargav| 22|        10|   100| 50.0|
|Chennamma| 46|        20|   200|100.0|
+---------+---+----------+------+-----+



In [31]:
df = df.select('*', (df.Experience*100).alias('Col1'), (df['New Name']).alias('Full Name' ))
df.show()


+---------+---+----------+------+-----+----+---------+----+---------+----+---------+
| New Name|Age|Experience|Salary|Bonus|Col1|Full Name|Col1|Full Name|Col1|Full Name|
+---------+---+----------+------+-----+----+---------+----+---------+----+---------+
|   Naveen| 24|        10|   100| 50.0|1000|     null|1000|   Naveen|1000|   Naveen|
|  Bhargav| 22|        10|   100| 50.0|1000|     null|1000|  Bhargav|1000|  Bhargav|
|Chennamma| 46|        20|   200|100.0|2000|     null|2000|Chennamma|2000|Chennamma|
+---------+---+----------+------+-----+----+---------+----+---------+----+---------+

