In [1]:
# импортируем polars
import polars as pl
# импортируем класс datetime из модуля datetime
from datetime import datetime
# импортируем функцию display()
from IPython.display import display

In [2]:
# describe the demo frame as a mapping of column name -> column values,
# then build the DataFrame from it
demo_columns = {
    'id': [1, 2, 3],
    'color': ['blue', 'red', 'green'],
    'size': ['small', 'medium', 'large'],
}
df = pl.DataFrame(demo_columns)
print(df)

shape: (3, 3)
┌─────┬───────┬────────┐
│ id  ┆ color ┆ size   │
│ --- ┆ ---   ┆ ---    │
│ i64 ┆ str   ┆ str    │
╞═════╪═══════╪════════╡
│ 1   ┆ blue  ┆ small  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2   ┆ red   ┆ medium │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3   ┆ green ┆ large  │
└─────┴───────┴────────┘


In [3]:
# keep only the rows whose id is at most 2 (the first two rows of this frame)
id_at_most_two = pl.col('id') <= 2
filter_df = df.filter(id_at_most_two)
print(filter_df)

shape: (2, 3)
┌─────┬───────┬────────┐
│ id  ┆ color ┆ size   │
│ --- ┆ ---   ┆ ---    │
│ i64 ┆ str   ┆ str    │
╞═════╪═══════╪════════╡
│ 1   ┆ blue  ┆ small  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2   ┆ red   ┆ medium │
└─────┴───────┴────────┘


In [4]:
# keep the rows where id is less than or equal to 2
# AND size equals 'small'; each condition is built as its own
# expression and the two are combined with the element-wise & operator
id_at_most_two = pl.col('id') <= 2
size_is_small = pl.col('size') == 'small'
multi_filter_df = df.filter(id_at_most_two & size_is_small)
print(multi_filter_df)

shape: (1, 3)
┌─────┬───────┬───────┐
│ id  ┆ color ┆ size  │
│ --- ┆ ---   ┆ ---   │
│ i64 ┆ str   ┆ str   │
╞═════╪═══════╪═══════╡
│ 1   ┆ blue  ┆ small │
└─────┴───────┴───────┘


In [5]:
# project the frame down to a single column, addressed by name
id_column = 'id'
single_select_df = df.select(id_column)
print(single_select_df)

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ i64 │
╞═════╡
│ 1   │
├╌╌╌╌╌┤
│ 2   │
├╌╌╌╌╌┤
│ 3   │
└─────┘


In [6]:
# select several columns at once by passing a list of their names
wanted_columns = ['id', 'color']
list_select_df = df.select(wanted_columns)
print(list_select_df)

shape: (3, 2)
┌─────┬───────┐
│ id  ┆ color │
│ --- ┆ ---   │
│ i64 ┆ str   │
╞═════╪═══════╡
│ 1   ┆ blue  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2   ┆ red   │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3   ┆ green │
└─────┴───────┘


In [7]:
# select columns through an expression: the name pattern below is
# anchored with ^ and $ and matches column names starting with "col"
# (only 'color' in this frame)
name_starts_with_col = pl.col('^col.*$')
condition_select_df = df.select(name_starts_with_col)
print(condition_select_df)

shape: (3, 1)
┌───────┐
│ color │
│ ---   │
│ str   │
╞═══════╡
│ blue  │
├╌╌╌╌╌╌╌┤
│ red   │
├╌╌╌╌╌╌╌┤
│ green │
└───────┘


In [8]:
# select columns by data type: passing a dtype to pl.col picks every
# column of that type (only the Int64 column 'id' in this frame)
int64_columns = pl.col(pl.Int64)
dtype_select_df = df.select(int64_columns)
print(dtype_select_df)

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ i64 │
╞═════╡
│ 1   │
├╌╌╌╌╌┤
│ 2   │
├╌╌╌╌╌┤
│ 3   │
└─────┘


In [9]:
# restrict the rows first, then the columns
rows_kept = df.filter(pl.col('id') <= 2)
expression_df = rows_kept.select(['id', 'color'])
print(expression_df)

shape: (2, 2)
┌─────┬───────┐
│ id  ┆ color │
│ --- ┆ ---   │
│ i64 ┆ str   │
╞═════╪═══════╡
│ 1   ┆ blue  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2   ┆ red   │
└─────┴───────┘


In [10]:
# load the Apple stock prices; parse_dates=True asks the CSV reader to
# parse date columns (the 'Date' column comes back with the date dtype)
csv_path = 'Data/applestock.csv'
df = pl.read_csv(csv_path, parse_dates=True)
print(df)

shape: (8933, 7)
┌────────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ Date       ┆ Open      ┆ High      ┆ Low       ┆ Close     ┆ Volume    ┆ Adj Close │
│ ---        ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ date       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ i64       ┆ f64       │
╞════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2016-05-16 ┆ 92.389999 ┆ 94.389999 ┆ 91.650002 ┆ 93.879997 ┆ 61140600  ┆ 93.879997 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-13 ┆ 90.0      ┆ 91.669998 ┆ 90.0      ┆ 90.519997 ┆ 44188200  ┆ 90.519997 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-12 ┆ 92.720001 ┆ 92.779999 ┆ 89.470001 ┆ 90.339996 ┆ 76109800  ┆ 90.339996 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-11 ┆ 93.480003 ┆

In [11]:
# load the data again, this time leaving the dates as plain strings
df = pl.read_csv('Data/applestock.csv', parse_dates=False)
# convert the 'Date' strings to the Date dtype using an explicit format.
# The plural with_columns (already used elsewhere in this notebook) is
# called with a list, because the singular with_column is deprecated
# in polars.
df = df.with_columns([
    pl.col('Date').str.strptime(pl.Date, fmt='%Y-%m-%d'),
])
print(df)

shape: (8933, 7)
┌────────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ Date       ┆ Open      ┆ High      ┆ Low       ┆ Close     ┆ Volume    ┆ Adj Close │
│ ---        ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ date       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ i64       ┆ f64       │
╞════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2016-05-16 ┆ 92.389999 ┆ 94.389999 ┆ 91.650002 ┆ 93.879997 ┆ 61140600  ┆ 93.879997 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-13 ┆ 90.0      ┆ 91.669998 ┆ 90.0      ┆ 90.519997 ┆ 44188200  ┆ 90.519997 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-12 ┆ 92.720001 ┆ 92.779999 ┆ 89.470001 ┆ 90.339996 ┆ 76109800  ┆ 90.339996 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2016-05-11 ┆ 93.480003 ┆

In [12]:
# derive calendar features from the 'Date' column
df = df.with_columns([
    pl.col('Date').dt.year().alias('year'),
    pl.col('Date').dt.quarter().alias('quarter'),
    pl.col('Date').dt.month().alias('month'),
    pl.col('Date').dt.week().alias('week'),
    pl.col('Date').dt.weekday().alias('weekday'),
    pl.col('Date').dt.day().alias('dayofmonth'),
    pl.col('Date').dt.ordinal_day().alias('dayofyear'),
])
# weekend flag: 1 for Saturday/Sunday, 0 otherwise.
# The numbering returned by dt.weekday() is release-dependent (in the
# output recorded for this notebook Monday is 0, so Saturday is 5 and a
# "weekday > 5" test would flag Sunday only). To stay correct on any
# release, the flag is derived from the ISO weekday produced by the
# '%u' format specifier: 1 = Monday ... 6 = Saturday, 7 = Sunday.
iso_weekday = pl.col('Date').dt.strftime('%u').cast(pl.Int32)
df = df.with_columns([
    pl.when(iso_weekday >= 6).then(1).otherwise(0).alias('weekend'),
])
df

Date,Open,High,Low,Close,Volume,Adj Close,year,quarter,month,week,weekday,dayofmonth,dayofyear,weekend
date,f64,f64,f64,f64,i64,f64,i32,u32,u32,u32,u32,u32,u32,i64
2016-05-16,92.389999,94.389999,91.650002,93.879997,61140600,93.879997,2016,2,5,20,0,16,137,0
2016-05-13,90.0,91.669998,90.0,90.519997,44188200,90.519997,2016,2,5,19,4,13,134,0
2016-05-12,92.720001,92.779999,89.470001,90.339996,76109800,90.339996,2016,2,5,19,3,12,133,0
2016-05-11,93.480003,93.57,92.459999,92.510002,28539900,92.510002,2016,2,5,19,2,11,132,0
2016-05-10,93.330002,93.57,92.110001,93.419998,33592500,93.419998,2016,2,5,19,1,10,131,0
2016-05-09,93.0,93.769997,92.589996,92.790001,32855300,92.790001,2016,2,5,19,0,9,130,0
2016-05-06,93.370003,93.449997,91.849998,92.720001,43458200,92.720001,2016,2,5,18,4,6,127,0
2016-05-05,94.0,94.07,92.68,93.239998,35890500,93.239998,2016,2,5,18,3,5,126,0
2016-05-04,95.199997,95.900002,93.82,94.190002,41025500,93.620002,2016,2,5,18,2,4,125,0
2016-05-03,94.199997,95.739998,93.68,95.18,56831300,94.604009,2016,2,5,18,1,3,124,0


In [13]:
# keep the rows whose date lies between 1 July 1995 and 1 November 1995
# NOTE(review): whether the two end dates themselves are kept depends on
# the bound-inclusion default of is_between in the installed polars
# release - confirm against the version in use
range_start = datetime(1995, 7, 1)
range_end = datetime(1995, 11, 1)
date_in_range = pl.col('Date').is_between(range_start, range_end)
filtered_range_df = df.filter(date_in_range)
filtered_range_df

Date,Open,High,Low,Close,Volume,Adj Close,year,quarter,month,week,weekday,dayofmonth,dayofyear,weekend
date,f64,f64,f64,f64,i64,f64,i32,u32,u32,u32,u32,u32,u32,i64
1995-10-31,35.249999,36.625,35.125,36.3125,72304400,1.190041,1995,4,10,44,1,31,304,0
1995-10-30,34.875,35.249999,34.625001,35.249999,43909600,1.155221,1995,4,10,44,0,30,303,0
1995-10-27,34.875,34.875,34.125,34.750001,38553200,1.138835,1995,4,10,43,4,27,300,0
1995-10-26,34.875,35.0,34.499999,34.875,31466400,1.142931,1995,4,10,43,3,26,299,0
1995-10-25,35.249999,35.374999,34.750001,34.750001,33325600,1.138835,1995,4,10,43,2,25,298,0
1995-10-24,35.500001,35.500001,34.875,35.125,53373600,1.151124,1995,4,10,43,1,24,297,0
1995-10-23,35.125,35.125,34.750001,35.125,49450800,1.151124,1995,4,10,43,0,23,296,0
1995-10-20,35.249999,35.249999,34.625001,35.125,96583200,1.151124,1995,4,10,42,4,20,293,0
1995-10-19,35.875,36.124999,34.750001,34.750001,236224800,1.138835,1995,4,10,42,3,19,292,0
1995-10-18,36.999999,39.562501,36.75,37.375001,128100000,1.224862,1995,4,10,42,2,18,291,0


In [14]:
# pick out the rows for one specific date
target_day = datetime(1995, 10, 16)
filtered_df = df.filter(pl.col('Date') == target_day)
filtered_df

Date,Open,High,Low,Close,Volume,Adj Close,year,quarter,month,week,weekday,dayofmonth,dayofyear,weekend
date,f64,f64,f64,f64,i64,f64,i32,u32,u32,u32,u32,u32,u32,i64
1995-10-16,36.249999,36.999999,35.875,36.124999,45516800,1.183897,1995,4,10,42,0,16,289,0


In [15]:
# vertical concatenation: two frames with the same columns ('a', 'b')
# are stacked on top of each other
df_v1 = pl.DataFrame({'a': [1], 'b': [3]})
df_v2 = pl.DataFrame({'a': [2], 'b': [4]})

frames_to_stack = [df_v1, df_v2]
df_vertical_concat = pl.concat(frames_to_stack, how='vertical')

# show both inputs followed by the stacked result
display(df_v1, df_v2, df_vertical_concat)

a,b
i64,i64
1,3


a,b
i64,i64
2,4


a,b
i64,i64
1,3
2,4


In [16]:
# horizontal concatenation: two frames with the same number of rows and
# distinct column names are placed side by side
df_h1 = pl.DataFrame({'l1': [1, 2], 'l2': [3, 4]})
df_h2 = pl.DataFrame({'r1': [5, 6], 'r2': [7, 8], 'r3': [9, 10]})

frames_side_by_side = [df_h1, df_h2]
df_horizontal_concat = pl.concat(frames_side_by_side, how='horizontal')

# show both inputs followed by the combined result
display(df_h1, df_h2, df_horizontal_concat)

l1,l2
i64,i64
1,3
2,4


r1,r2,r3
i64,i64,i64
5,7,9
6,8,10


l1,l2,r1,r2,r3
i64,i64,i64,i64,i64
1,3,5,7,9
2,4,6,8,10


In [17]:
# diagonal concatenation: the frames share column 'a' only; the result
# carries the union of the columns ('a', 'b', 'd') and leaves the cells
# a frame has no value for empty
df_d1 = pl.DataFrame({'a': [1], 'b': [3]})
df_d2 = pl.DataFrame({'a': [2], 'd': [4]})

frames_to_merge = [df_d1, df_d2]
df_diagonal_concat = pl.concat(frames_to_merge, how='diagonal')

# show both inputs followed by the combined result
display(df_d1, df_d2, df_diagonal_concat)

a,b
i64,i64
1,3


a,d
i64,i64
2,4


a,b,d
i64,i64,i64
1,3.0,
2,,4.0


In [18]:
# cars table: an identifier and the make of the car
car_columns = {
    'id': ['a', 'b', 'c'],
    'make': ['ford', 'toyota', 'bmw'],
}
df_cars = pl.DataFrame(car_columns)
print(df_cars)

shape: (3, 2)
┌─────┬────────┐
│ id  ┆ make   │
│ --- ┆ ---    │
│ str ┆ str    │
╞═════╪════════╡
│ a   ┆ ford   │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ b   ┆ toyota │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ c   ┆ bmw    │
└─────┴────────┘


In [19]:
# repairs table: a car identifier and the cost of a repair
# (car 'c' appears twice, i.e. it has two repairs)
repair_columns = {
    'id': ['c', 'c'],
    'cost': [100, 200],
}
df_repairs = pl.DataFrame(repair_columns)
print(df_repairs)

shape: (2, 2)
┌─────┬──────┐
│ id  ┆ cost │
│ --- ┆ ---  │
│ str ┆ i64  │
╞═════╪══════╡
│ c   ┆ 100  │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ c   ┆ 200  │
└─────┴──────┘


In [20]:
# inner join: only ids present in both tables survive, one output row
# per matching pair (car 'c' matches both of its repairs)
df_inner_join = df_cars.join(
    df_repairs,
    how='inner',
    on='id',
)
print(df_inner_join)

shape: (2, 3)
┌─────┬──────┬──────┐
│ id  ┆ make ┆ cost │
│ --- ┆ ---  ┆ ---  │
│ str ┆ str  ┆ i64  │
╞═════╪══════╪══════╡
│ c   ┆ bmw  ┆ 100  │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ c   ┆ bmw  ┆ 200  │
└─────┴──────┴──────┘


In [21]:
# outer join: every car is kept; cars without a repair get a null cost
df_outer_join = df_cars.join(
    df_repairs,
    how='outer',
    on='id',
)
print(df_outer_join)

shape: (4, 3)
┌─────┬────────┬──────┐
│ id  ┆ make   ┆ cost │
│ --- ┆ ---    ┆ ---  │
│ str ┆ str    ┆ i64  │
╞═════╪════════╪══════╡
│ a   ┆ ford   ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ b   ┆ toyota ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ c   ┆ bmw    ┆ 100  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ c   ┆ bmw    ┆ 200  │
└─────┴────────┴──────┘


In [22]:
# semi join: keep the cars that have at least one repair; only the
# columns of df_cars are returned and each car appears once
df_semi_join = df_cars.join(
    df_repairs,
    how='semi',
    on='id',
)
print(df_semi_join)

shape: (1, 2)
┌─────┬──────┐
│ id  ┆ make │
│ --- ┆ ---  │
│ str ┆ str  │
╞═════╪══════╡
│ c   ┆ bmw  │
└─────┴──────┘


In [23]:
# anti join: keep the cars that have no repair at all; only the
# columns of df_cars are returned
df_anti_join = df_cars.join(
    df_repairs,
    how='anti',
    on='id',
)
print(df_anti_join)

shape: (2, 2)
┌─────┬────────┐
│ id  ┆ make   │
│ --- ┆ ---    │
│ str ┆ str    │
╞═════╪════════╡
│ a   ┆ ford   │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ b   ┆ toyota │
└─────┴────────┘
