In [1]:
from datafusion import SessionContext

In [2]:
# Session context: the entry point used in the cells below to build
# DataFrames and to run SQL queries.
ctx = SessionContext()

## Create a DataFrame

In [3]:
# Build a two-column DataFrame from an in-memory dict and make it available
# to SQL queries under the name "my_table".
df = ctx.from_pydict(
    {
        "a": [1, 2, 3, 1],
        "b": [4, 5, 6, 7],
    },
    name="my_table",
)

In [4]:
# Query the table named "my_table" with SQL and display every row.
ctx.sql("select * from my_table")

a,b
1,4
2,5
3,6
1,7


In [5]:
# Restrict the displayed result to two rows.
df.limit(2)

a,b
1,4
2,5


## Query the DataFrame with SQL

In [6]:
# Select both columns plus a computed column `sum_a_b` that holds a + b.
ctx.sql("select a, b, a + b as sum_a_b from my_table")

a,b,sum_a_b
1,4,5
2,5,7
3,6,9
1,7,8


## Query the DataFrame with Python

In [7]:
from datafusion import col

In [9]:
# Expression-API version of the projection: both columns plus their sum,
# with the computed column named `sum_a_b` via alias().
df.select(
    col("a"),
    col("b"),
    (col("a") + col("b")).alias("sum_a_b"),
)

a,b,sum_a_b
1,4,5
2,5,7
3,6,9
1,7,8


## Cleanup

In [26]:
# Remove "my_table" from the session context; the statement yields an empty
# DataFrame.
ctx.sql("drop table my_table")

DataFrame()
++
++

In [10]:
import datafusion
from datafusion import col
import pyarrow

# Start from a fresh session context.
ctx = datafusion.SessionContext()

# Assemble a RecordBatch with the integer columns "a" and "b".
batch = pyarrow.RecordBatch.from_arrays(
    [pyarrow.array(values) for values in ([1, 2, 3], [4, 5, 6])],
    names=["a", "b"],
)

# Wrap the batch in a DataFrame and project the sum and the difference
# of the two columns in one chained statement.
df = ctx.create_dataframe([[batch]]).select(
    col("a") + col("b"),
    col("a") - col("b"),
)

# Execute the plan and keep the first (and only) resulting batch.
result = df.collect()[0]

In [11]:
# Display the collected RecordBatch (schema followed by the column values).
result

pyarrow.RecordBatch
c6bd0a833d6fe4973b5094552d716b481.a + c6bd0a833d6fe4973b5094552d716b481.b: int64
c6bd0a833d6fe4973b5094552d716b481.a - c6bd0a833d6fe4973b5094552d716b481.b: int64
----
c6bd0a833d6fe4973b5094552d716b481.a + c6bd0a833d6fe4973b5094552d716b481.b: [5,7,9]
c6bd0a833d6fe4973b5094552d716b481.a - c6bd0a833d6fe4973b5094552d716b481.b: [-3,-3,-3]

## Basic operations

In [12]:
from datafusion import SessionContext

import random

ctx = SessionContext()

# Small demo table for the basic-operations section.
# The "random" column is drawn from a generator with a fixed seed so that every
# run of the notebook produces the same values; an unseeded random.sample()
# would yield different numbers after each kernel restart. A dedicated
# random.Random instance is used so the module-level generator is untouched.
df = ctx.from_pydict({
    "nrs": [1, 2, 3, 4, 5],
    "names": ["python", "ruby", "java", "haskell", "go"],
    "random": random.Random(42).sample(range(1000), 5),
    "groups": ["A", "A", "B", "C", "B"],
})

In [13]:
# Display the DataFrame.
df

nrs,names,random,groups
1,python,442,A
2,ruby,348,A
3,java,622,B
4,haskell,0,C
5,go,123,B


In [14]:
# Restrict the displayed result to two rows.
df.limit(2)

nrs,names,random,groups
1,python,442,A
2,ruby,348,A


In [15]:
# Show each column's name and data type.
df.schema()

nrs: int64
names: string
random: int64
groups: string

In [16]:
# Convert the DataFrame to a pandas DataFrame and display it.
df.to_pandas()

Unnamed: 0,nrs,names,random,groups
0,1,python,442,A
1,2,ruby,348,A
2,3,java,622,B
3,4,haskell,0,C
4,5,go,123,B


In [17]:
# Per-column summary statistics: count, null_count, mean, std, min, max, median.
df.describe()

describe,nrs,names,random,groups
count,5.0,5,5.0,5
null_count,0.0,0,0.0,0
mean,3.0,,307.0,
std,1.5811388300841898,,248.6141588888292,
min,1.0,go,0.0,A
max,5.0,ruby,622.0,C
median,3.0,,348.0,


## Column selections

In [18]:
import urllib.request
from pathlib import Path

from datafusion import SessionContext

# Remote source and local cache location of the yellow_tripdata_2021-01
# parquet file used in this section.
TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet"
TRIP_DATA_PATH = Path("yellow_trip_data.parquet")

# Download only when no local copy exists, so re-running the notebook does not
# fetch the file again. The transfer is written to a temporary ".part" name and
# renamed afterwards, so an interrupted download never leaves a truncated file
# under the final name.
if not TRIP_DATA_PATH.exists():
    partial_path = TRIP_DATA_PATH.with_name(TRIP_DATA_PATH.name + ".part")
    urllib.request.urlretrieve(TRIP_DATA_URL, partial_path)
    partial_path.replace(TRIP_DATA_PATH)

ctx = SessionContext()

# Pass a plain string so the call matches the original argument type.
df = ctx.read_parquet(str(TRIP_DATA_PATH))

In [19]:
# Pick two columns by name.
# NOTE(review): newer datafusion-python releases may deprecate select_columns()
# in favour of select() — confirm against the installed version.
df.select_columns("trip_distance", "passenger_count")

trip_distance,passenger_count
2.1,1.0
0.2,1.0
14.7,1.0
10.6,0.0
4.94,1.0
1.6,1.0
4.1,1.0
5.7,1.0
9.1,1.0
2.7,2.0


In [20]:
from datafusion import col, lit

In [21]:
# Add two columns row by row and expose the result as `tips_plus_tolls`.
df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls"))

tips_plus_tolls
0.0
0.0
8.65
6.05
4.06
2.35
0.0
0.0
0.0
3.15


In [22]:
# The column name contains upper-case letters, so it is wrapped in double
# quotes inside the string passed to col().
# NOTE(review): presumably unquoted identifiers are normalised to lower case
# and would not match "VendorID" — confirm against the datafusion docs.
df.select(col('"VendorID"'))

VendorID
1
1
1
1
2
1
1
1
1
1


In [23]:
# Boolean expression: true for rows whose trip_distance is greater than 5.0.
large_trip_distance = col("trip_distance") > lit(5.0)

In [24]:
# Boolean expression: true for rows whose passenger_count is less than 4.
low_passenger_count = col("passenger_count") < lit(4)

In [25]:
# Combine the two conditions with a logical AND and display the resulting
# boolean column under the name `lonely_trips`.
df.select((large_trip_distance & low_passenger_count).alias("lonely_trips"))

lonely_trips
False
False
True
True
False
False
False
True
True
False
