In [4]:
# Import required libraries
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import databricks.koalas as ks

# Object Creation

In [10]:
# Build a pandas Series; the np.nan entry is a missing value and makes the dtype float64
pos = pd.Series([1, 2, 3, np.nan, 5, 6])
print('Pandas Series:', pos)

Pandas Series: 0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
dtype: float64


In [13]:
# Build the same Series with Koalas; the np.nan entry is a missing value
kos = ks.Series([1, 2, 3, np.nan, 5, 6])
print('Koalas Series:', kos)

Koalas Series: 0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
Name: 0, dtype: float64


In [16]:
# Build a small employee table as a pandas DataFrame, with row labels 10, 20, ..., 60
pdf = pd.DataFrame(
    data=dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006],
        Name=['Arvind', 'John', 'Yogesh', 'Ramesh', 'Kumar', 'Parker'],
        Designation=['Developer', 'Analyst', 'Manager', 'Support', 'Admin', 'Developer'],
    ),
    index=list(range(10, 70, 10)),
)
display(pdf)

Unnamed: 0,id,Name,Designation
10,1001,Arvind,Developer
20,1002,John,Analyst
30,1003,Yogesh,Manager
40,1004,Ramesh,Support
50,1005,Kumar,Admin
60,1006,Parker,Developer


In [21]:
# Build the same employee table as a Koalas DataFrame, with row labels 10, 20, ..., 60
kdf = ks.DataFrame(
    data=dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006],
        Name=['Arvind', 'John', 'Yogesh', 'Ramesh', 'Kumar', 'Parker'],
        Designation=['Developer', 'Analyst', 'Manager', 'Support', 'Admin', 'Developer'],
    ),
    index=list(range(10, 70, 10)),
)
display(kdf)

Unnamed: 0,id,Name,Designation
10,1001,Arvind,Developer
20,1002,John,Analyst
30,1003,Yogesh,Manager
40,1004,Ramesh,Support
50,1005,Kumar,Admin
60,1006,Parker,Developer


# Viewing Data

In [53]:
# Build a 10x5 Koalas DataFrame of standard-normal samples with columns 'A'..'E'.
# The samples come from a private RandomState with a fixed seed, so a fresh
# Restart & Run All yields the same values every time and NumPy's global random
# state is left untouched.
kdf = ks.DataFrame(np.random.RandomState(42).randn(10, 5), columns=list('ABCDE'))

In [32]:
# Inspect the row index of the Koalas DataFrame
kdf.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [33]:
# Preview the first rows of the frame
kdf.head()

Unnamed: 0,A,B,C,D,E
0,1.811715,0.397929,0.569789,1.213236,-1.028035
1,1.908569,0.505519,-1.047089,0.91228,-0.869791
2,-1.43354,-1.341217,-0.235642,0.122826,-1.931499
3,0.283139,0.771255,1.539187,0.407914,-1.182549
4,-0.975535,0.557154,0.714998,-0.31355,-0.286099


In [34]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
kdf.describe()

Unnamed: 0,A,B,C,D,E
count,10.0,10.0,10.0,10.0,10.0
mean,-0.018363,0.238562,-0.033902,0.555922,-0.705352
std,1.214557,0.717819,1.027933,0.610336,0.948985
min,-1.43354,-1.341217,-2.08142,-0.31355,-2.244375
25%,-0.975535,0.003782,-0.563313,0.122826,-1.182549
50%,-0.267302,0.397929,-0.093553,0.5167,-0.869791
75%,0.842637,0.771255,0.691609,0.91228,-0.126875
max,1.908569,1.042174,1.539187,1.69651,0.672736


In [35]:
# Inspect the column labels of the frame
kdf.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [36]:
# Transpose the frame: columns A..E become the rows, row labels become the columns
kdf.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
A,1.811715,1.908569,-1.43354,0.283139,-0.975535,-0.881947,0.842637,-0.176171,-0.267302,-1.295194
B,0.397929,0.505519,-1.341217,0.771255,0.557154,0.793953,1.042174,0.003782,-0.567744,0.222817
C,0.569789,-1.047089,-0.235642,1.539187,0.714998,0.691609,0.166415,-0.563313,-0.093553,-2.08142
D,1.213236,0.91228,0.122826,0.407914,-0.31355,1.69651,0.5167,0.575101,-0.167701,0.595905
E,-1.028035,-0.869791,-1.931499,-1.182549,-0.286099,-0.552201,0.672736,-0.126875,0.495171,-2.244375


# Data Cleaning

In [69]:
# Build a Koalas DataFrame that needs cleaning: the last row repeats id 1003 / 'Yogesh'
# and its Designation is missing (np.nan)
kdf = ks.DataFrame(
    data=dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006, 1003],
        Name=['Arvind', 'John', 'Yogesh', 'Ramesh', 'Kumar', 'Parker', 'Yogesh'],
        Designation=['Developer', 'Analyst', 'Manager', 'Support', 'Admin', 'Developer', np.nan],
    ),
    index=list(range(7)),
)
display(kdf)

Unnamed: 0,id,Name,Designation
0,1001,Arvind,Developer
1,1002,John,Analyst
2,1003,Yogesh,Manager
3,1004,Ramesh,Support
4,1005,Kumar,Admin
5,1006,Parker,Developer
6,1003,Yogesh,


In [71]:
# Drop every row that contains at least one missing value; kdf itself is left unchanged
kdf1 = kdf.dropna(axis=0, how='any')
display(kdf1)

Unnamed: 0,id,Name,Designation
0,1001,Arvind,Developer
1,1002,John,Analyst
2,1003,Yogesh,Manager
3,1004,Ramesh,Support
4,1005,Kumar,Admin
5,1006,Parker,Developer


In [73]:
# Fill missing values in the 'Designation' column only. Passing the bare scalar
# value='Manager' would not be limited to this column, so a missing entry in
# another column (e.g. Name) could also be filled with the job title.
# kdf itself is left unchanged; the filled copy is bound to kdf2.
kdf2 = kdf.fillna(value={'Designation': 'Manager'})
display(kdf2)

Unnamed: 0,id,Name,Designation
0,1001,Arvind,Developer
1,1002,John,Analyst
2,1003,Yogesh,Manager
3,1004,Ramesh,Support
4,1005,Kumar,Admin
5,1006,Parker,Developer
6,1003,Yogesh,Manager


# Converting a pandas DataFrame to a PySpark DataFrame using Koalas

In [81]:
# Round trip: pandas DataFrame -> Koalas DataFrame -> PySpark DataFrame

# Step 1: start from an ordinary pandas DataFrame with row labels 10, 20, ..., 60
pdf = pd.DataFrame(
    data=dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006],
        Name=['Arvind', 'John', 'Yogesh', 'Ramesh', 'Kumar', 'Parker'],
        Designation=['Developer', 'Analyst', 'Manager', 'Support', 'Admin', 'Developer'],
    ),
    index=list(range(10, 70, 10)),
)
display(pdf)

# Step 2: wrap the pandas frame as a Koalas DataFrame
kdf = ks.from_pandas(pdf)
display(kdf)

# Step 3: convert the Koalas frame to a PySpark DataFrame and print it as a text table
sdf = kdf.to_spark()
sdf.show()

Unnamed: 0,id,Name,Designation
10,1001,Arvind,Developer
20,1002,John,Analyst
30,1003,Yogesh,Manager
40,1004,Ramesh,Support
50,1005,Kumar,Admin
60,1006,Parker,Developer


Unnamed: 0,id,Name,Designation
10,1001,Arvind,Developer
20,1002,John,Analyst
30,1003,Yogesh,Manager
40,1004,Ramesh,Support
50,1005,Kumar,Admin
60,1006,Parker,Developer


+----+------+-----------+
|  id|  Name|Designation|
+----+------+-----------+
|1001|Arvind|  Developer|
|1002|  John|    Analyst|
|1003|Yogesh|    Manager|
|1004|Ramesh|    Support|
|1005| Kumar|      Admin|
|1006|Parker|  Developer|
+----+------+-----------+



None