# Почему Polars?
# Библиотека Polars построена на Arrow
# Polars работает с табличными данными
# Объекты DataFrame и Series

In [1]:
# импортируем polars и pandas
import polars as pl
import pandas as pd

from IPython.display import display

In [2]:
# Build a small two-column DataFrame from a dict that maps
# column names to lists of values.
df = pl.DataFrame({'Empl': [10, 20], 
                   'Age': [30, 40]})
df

Empl,Age
i64,i64
10,30
20,40


In [3]:
# Mean of each column: axis=0 aggregates down the rows,
# so the result holds one value per column.
df.mean(axis=0)

Empl,Age
f64,f64
15.0,35.0


In [4]:
# Mean of each row: axis=1 aggregates across the columns,
# so the result holds one value per row.
df.mean(axis=1)

shape: (2,)
Series: 'Empl' [f64]
[
	20.0
	30.0
]

In [5]:
%%time

# Read the CSV file into a pandas DataFrame (timed so it can be
# compared with the Polars readers in the following cells).
pandas_df = pd.read_csv('Data/train_data.csv')

CPU times: user 1min 58s, sys: 33.5 s, total: 2min 32s
Wall time: 2min 46s


In [6]:
%%time

# Read the same CSV file into a Polars DataFrame.
polars_df = pl.read_csv('Data/train_data.csv')

CPU times: user 1min 22s, sys: 1min 13s, total: 2min 35s
Wall time: 25.2 s


In [7]:
%%time

# Read the file with Polars first, then convert the result
# to a pandas DataFrame. Note: this rebinds pandas_df.
pandas_df = pl.read_csv('Data/train_data.csv', rechunk=False).to_pandas()

CPU times: user 1min 29s, sys: 2min 15s, total: 3min 45s
Wall time: 45.7 s


In [8]:
# Show the first 5 rows of the Polars DataFrame.
polars_df.head()

customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,...,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,...,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,str,str,f64,f64,f64,str,f64,f64,f64
"""0000099d6bd597...","""2017-03-09""",0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,,0.00063,0.080986,0.708906,0.1706,0.006204,0.358587,0.525351,0.255736,,0.063902,0.059416,0.006466,0.148698,1.335856,0.008207,0.001423,0.207334,0.736463,0.096219,,0.023381,0.002768,0.008322,1.001519,0.008298,...,0.676922,0.007871,1.0,0.23825,0.0,4.0,0.23212,0.236266,0.0,0.70228,0.434345,0.003057,0.686516,0.00874,1.0,1.003319,1.007819,1.00008,0.006805,,0.002052,0.005972,,0.004345,0.001535,,,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
"""0000099d6bd597...","""2017-04-07""",0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,,,0.002526,0.069419,0.712795,0.113239,0.006206,0.35363,0.521311,0.223329,,0.065261,0.057744,0.001614,0.149723,1.339794,0.008373,0.001984,0.202778,0.720886,0.099804,,0.030599,0.002749,0.002482,1.009033,0.005136,...,0.822281,0.003444,1.0,0.247217,0.0,4.0,0.243532,0.241885,0.0,0.707017,0.430501,0.001306,0.686414,0.000755,1.0,1.008394,1.004333,1.008344,0.004407,,0.001034,0.004838,,0.007495,0.004931,,,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
"""0000099d6bd597...","""2017-05-28""",0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,,0.007605,0.068839,0.720884,0.060492,0.003259,0.33465,0.524568,0.189424,,0.066982,0.056647,0.005126,0.151955,1.337179,0.009355,0.007426,0.206629,0.738044,0.134073,,0.048367,0.010077,0.00053,1.009184,0.006961,...,0.853498,0.003269,1.0,0.239867,0.0,4.0,0.240768,0.23971,0.0,0.704843,0.434409,0.003954,0.690101,0.009617,1.0,1.009307,1.007831,1.006878,0.003221,,0.005681,0.005497,,0.009227,0.009123,,,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
"""0000099d6bd597...","""2017-06-13""",0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,,,0.006406,0.05563,0.723997,0.166782,0.009918,0.323271,0.530929,0.135586,,0.08372,0.049253,0.001418,0.151219,1.339909,0.006782,0.003515,0.208214,0.741813,0.134437,,0.030063,0.009667,0.000783,1.007456,0.008706,...,0.844667,5.3e-05,1.0,0.24091,0.0,4.0,0.2394,0.240727,0.0,0.711546,0.436903,0.005135,0.687779,0.004649,1.0,1.001671,1.00346,1.007573,0.007703,,0.007108,0.008261,,0.007206,0.002409,,,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
"""0000099d6bd597...","""2017-07-16""",0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,,,0.007731,0.038862,0.720619,0.14363,0.006667,0.231009,0.529305,,,0.0759,0.048918,0.001199,0.154026,1.341735,0.000519,0.001362,0.205468,0.691986,0.121518,,0.054221,0.009484,0.006698,1.003738,0.003846,...,0.811199,0.008724,1.0,0.247939,0.0,4.0,0.244199,0.242325,0.0,0.705343,0.437433,0.002849,0.688774,9.7e-05,1.0,1.009886,1.005053,1.008132,0.009823,,0.00968,0.004848,,0.006312,0.004462,,,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


# Режим Eager и Lazy

In [9]:
%%time

# Read the iris CSV from a URL into a DataFrame.
# Note: this rebinds df, and the timing includes the download.
df = pl.read_csv('https://j.mp/iriscsv')
# Eager mode: keep rows with sepal_length > 5, group by species
# and sum every remaining column.
print(df.filter(pl.col('sepal_length') > 5)
      .groupby('species')
      .agg(pl.all().sum())
)

shape: (3, 5)
┌────────────┬──────────────┬─────────────┬──────────────┬─────────────┐
│ species    ┆ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width │
│ ---        ┆ ---          ┆ ---         ┆ ---          ┆ ---         │
│ str        ┆ f64          ┆ f64         ┆ f64          ┆ f64         │
╞════════════╪══════════════╪═════════════╪══════════════╪═════════════╡
│ virginica  ┆ 324.5        ┆ 146.2       ┆ 273.1        ┆ 99.6        │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ versicolor ┆ 281.9        ┆ 131.8       ┆ 202.9        ┆ 63.3        │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ setosa     ┆ 116.9        ┆ 81.7        ┆ 33.2         ┆ 6.1         │
└────────────┴──────────────┴─────────────┴──────────────┴─────────────┘
CPU times: user 36.6 ms, sys: 32.3 ms, total: 69 ms
Wall time: 703 ms


In [10]:
%%time

# Lazy mode: .lazy() turns the DataFrame into a lazy query, and the
# same filter / group-by / sum steps are executed by .collect().
# NOTE(review): the file itself is still read by pl.read_csv before
# .lazy() is called.
print(
    pl.read_csv('https://j.mp/iriscsv')
    .lazy()
    .filter(pl.col('sepal_length') > 5)
    .groupby('species')
    .agg(pl.all().sum())
    .collect()
)

shape: (3, 5)
┌────────────┬──────────────┬─────────────┬──────────────┬─────────────┐
│ species    ┆ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width │
│ ---        ┆ ---          ┆ ---         ┆ ---          ┆ ---         │
│ str        ┆ f64          ┆ f64         ┆ f64          ┆ f64         │
╞════════════╪══════════════╪═════════════╪══════════════╪═════════════╡
│ virginica  ┆ 324.5        ┆ 146.2       ┆ 273.1        ┆ 99.6        │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ versicolor ┆ 281.9        ┆ 131.8       ┆ 202.9        ┆ 63.3        │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ setosa     ┆ 116.9        ┆ 81.7        ┆ 33.2         ┆ 6.1         │
└────────────┴──────────────┴─────────────┴──────────────┴─────────────┘
CPU times: user 27.4 ms, sys: 21 ms, total: 48.4 ms
Wall time: 454 ms


In [11]:
# Switch the already loaded eager DataFrame to lazy mode, run the
# same query and materialize the result with .collect().
(
    df.lazy()
    .filter(pl.col('sepal_length') > 5)
    .groupby('species')
    .agg(pl.all().sum())
    .collect()
)

species,sepal_length,sepal_width,petal_length,petal_width
str,f64,f64,f64,f64
"""virginica""",324.5,146.2,273.1,99.6
"""setosa""",116.9,81.7,33.2,6.1
"""versicolor""",281.9,131.8,202.9,63.3


# Задачи, выполняемые Polars
# Кратко о типах данных
# Представление пропусков
# Какую версию Polars использовать?

In [12]:
# Check the installed Polars version.
pl.__version__

'0.14.18'

# Главные отличия Polars от pandas

# Подробно знакомимся с типами данных
## Типы данных для работы с числами и логическими значениями
### Тип данных Int (тип для целых чисел, целочисленный тип)

In [13]:
# Create a Series of integers (the inferred dtype is Int64).
s_int = pl.Series([10, 35, 130]) 
s_int

shape: (3,)
Series: '' [i64]
[
	10
	35
	130
]

In [14]:
# Try to cast to Int8. The value 130 does not fit into Int8, so the
# cast raises ComputeError. The exception is the point of this cell,
# so it is caught and printed instead of being left to interrupt
# "Restart & Run All".
try:
    display(s_int.cast(pl.Int8))
except pl.exceptions.ComputeError as exc:
    print(f'{type(exc).__name__}: {exc}')

ComputeError: Strict conversion from Int64 to Int8 failed for values [130]. If you were trying to cast Utf8 to Date, Time, or Datetime, consider using `strptime`.

In [15]:
# Create a Series with a missing value, stating the Int64 dtype explicitly.
pl.Series([10, 35, 130, None], dtype=pl.Int64)

shape: (4,)
Series: '' [i64]
[
	10
	35
	130
	null
]

In [16]:
# Without an explicit dtype, integers together with a missing
# value are still inferred as Int64.
pl.Series([10, 35, 130, None])

shape: (4,)
Series: '' [i64]
[
	10
	35
	130
	null
]

### Тип данных UInt (тип для целых чисел без знака)

In [17]:
# Cast to UInt8 (unsigned 8-bit integer); unlike Int8, it can hold 130.
s_int.cast(pl.UInt8)

shape: (3,)
Series: '' [u8]
[
	10
	35
	130
]

### Тип данных Float (тип для чисел с плавающей точкой)

In [18]:
# Create a Series of floats with a missing value (inferred dtype: Float64).
s_float = pl.Series([5.26, 1234.56789, None])
s_float

shape: (3,)
Series: '' [f64]
[
	5.26
	1234.56789
	null
]

In [19]:
# Cast to Float32; the output shows the reduced precision
# (1234.56789 is displayed as 1234.567871).
s_float.cast(pl.Float32)

shape: (3,)
Series: '' [f32]
[
	5.26
	1234.567871
	null
]

In [20]:
# Cast Float64 to Int64; the fractional part is dropped (5.26 -> 5).
# Note: this rebinds s_int, which previously held [10, 35, 130].
s_int = s_float.cast(pl.Int64)
s_int

shape: (3,)
Series: '' [i64]
[
	5
	1234
	null
]

In [21]:
# Cast Int64 back to Float64.
# Note: this rebinds s_float with the truncated values (5.0, 1234.0).
s_float = s_int.cast(pl.Float64)
s_float

shape: (3,)
Series: '' [f64]
[
	5.0
	1234.0
	null
]

### Тип данных Boolean (логический тип, булев тип)

In [22]:
# Create a Series of boolean values.
s_bool = pl.Series([True, False]) 
s_bool

shape: (2,)
Series: '' [bool]
[
	true
	false
]

In [23]:
# Create an integer Series that contains zero, positive and negative values.
s = pl.Series([0, 1, 59, -35])

In [24]:
# Cast to Boolean: 0 becomes false, every non-zero value becomes true.
s.cast(pl.Boolean)

shape: (4,)
Series: '' [bool]
[
	false
	true
	true
	true
]

In [25]:
# Create a Series of floating-point values, including zero.
s = pl.Series([0, 0.0001, -3.99])

In [26]:
# Cast to Boolean: again only zero maps to false.
s.cast(pl.Boolean)

shape: (3,)
Series: '' [bool]
[
	false
	true
	true
]

In [27]:
# Cast Boolean to Int64: true -> 1, false -> 0.
s_bool.cast(pl.Int64)

shape: (2,)
Series: '' [i64]
[
	1
	0
]

In [28]:
# Create a Boolean Series with a missing value.
s = pl.Series([True, False, None], dtype=pl.Boolean)
s

shape: (3,)
Series: '' [bool]
[
	true
	false
	null
]

## Типы данных для работы со строками
### Тип данных Object (объектный тип)

In [29]:
# Create a Series of strings, but force the Object dtype
# by passing it explicitly.
s_object = pl.Series(['some', 'strings'], dtype=pl.Object) 
s_object

shape: (2,)
Series: '' [o][object]
[
	some
	strings
]

In [30]:
# Check the dtype of the Series.
s_object.dtype

polars.datatypes.Object

In [31]:
# Try to cast a Boolean Series to Object.
# NOTE: in the recorded run this cell fails on purpose with
# PanicException ("cannot convert object to arrow").
s = pl.Series([True, False])
s.cast(pl.Object)

thread '<unnamed>' panicked at 'cannot convert object to arrow', /Users/runner/work/polars/polars/polars/polars-core/src/datatypes/dtype.rs:204:26
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: cannot convert object to arrow

In [32]:
# Try to cast a Series of floats to Object.
# NOTE: in the recorded run this cell fails on purpose with
# PanicException ("cannot convert object to arrow").
s = pl.Series([5.2, 10.3])
s.cast(pl.Object)

thread '<unnamed>' panicked at 'cannot convert object to arrow', /Users/runner/work/polars/polars/polars/polars-core/src/datatypes/dtype.rs:204:26


PanicException: cannot convert object to arrow

In [33]:
# Try to cast a Series of integers to Object.
# NOTE: in the recorded run this cell fails on purpose with
# PanicException ("cannot convert object to arrow").
s = pl.Series([5, 10])
s.cast(pl.Object)

thread '<unnamed>' panicked at 'cannot convert object to arrow', /Users/runner/work/polars/polars/polars/polars-core/src/datatypes/dtype.rs:204:26


PanicException: cannot convert object to arrow

In [34]:
# An Object Series can hold arbitrary Python objects:
# here a list, a bool, a str, a float and a dict.
garbage_series = pl.Series([[1,2], True, 'some string', 
                            4.5, {'key': 'value'}])
garbage_series

shape: (5,)
Series: '' [o][object]
[
	[1, 2]
	True
	some string
	4.5
	{'key': 'value'}
]

In [35]:
# Every element of an Object Series keeps its own Python type:
# print the type of each of the five elements by position.
for position in range(5):
    print(type(garbage_series[position]))

<class 'list'>
<class 'bool'>
<class 'str'>
<class 'float'>
<class 'dict'>


### Тип данных Categorical (категориальный тип)

In [36]:
# Read the CSV file into a DataFrame (the time columns stay strings here).
bikes = pl.read_csv('Data/bikes.csv')
# Show the first 5 rows of the DataFrame.
bikes.head()

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,str,str,i64,str,f64,str,f64,f64,f64,str
"""Male""","""2013-06-28 19:...","""2013-06-28 19:...",993,"""Lake Shore Dr ...",11.0,"""Michigan Ave &...",15.0,73.9,12.7,"""mostlycloudy"""
"""Male""","""2013-06-28 22:...","""2013-06-28 23:...",623,"""Clinton St & W...",31.0,"""Wells St & Wal...",19.0,69.1,6.9,"""partlycloudy"""
"""Male""","""2013-06-30 14:...","""2013-06-30 15:...",1040,"""Sheffield Ave ...",15.0,"""Dearborn St & ...",23.0,73.0,16.1,"""mostlycloudy"""
"""Male""","""2013-07-01 10:...","""2013-07-01 10:...",667,"""Carpenter St &...",19.0,"""Clark St & Ran...",31.0,72.0,16.1,"""mostlycloudy"""
"""Male""","""2013-07-01 11:...","""2013-07-01 11:...",130,"""Damen Ave & Pi...",19.0,"""Damen Ave & Pi...",19.0,73.0,17.3,"""partlycloudy"""


In [37]:
# Take the events column as a Series and count
# how often each category occurs.
events = bikes['events']
events.value_counts()

events,counts
str,u32
"""fog""",122
"""hazy""",348
"""cloudy""",12075
"""sleet""",16
"""snow""",466
"""unknown""",4
"""rain""",1828
"""clear""",2818
"""mostlycloudy""",15096
"""partlycloudy""",16998


In [38]:
# Cast the string Series to Categorical and preview its first values.
events_cat = events.cast(pl.Categorical)
events_cat.head()

shape: (10,)
Series: 'events' [cat]
[
	"mostlycloudy"
	"partlycloudy"
	"mostlycloudy"
	"mostlycloudy"
	"partlycloudy"
	"mostlycloudy"
	"cloudy"
	"cloudy"
	"cloudy"
	"mostlycloudy"
]

In [39]:
# Check the dtype of the Series.
events_cat.dtype

polars.datatypes.Categorical

### Тип данных Utf8 (строковый тип)

In [40]:
# Create a Utf8 (string) Series with a missing value.
s_string = pl.Series(['Python', 'Java', 'Scala', None], 
                     dtype=pl.Utf8)
s_string

shape: (4,)
Series: '' [str]
[
	"Python"
	"Java"
	"Scala"
	null
]

In [41]:
# Convert the strings to upper case; the missing value stays null.
s_string.str.to_uppercase()

shape: (4,)
Series: '' [str]
[
	"PYTHON"
	"JAVA"
	"SCALA"
	null
]

In [42]:
# Cast a Series of integers to Utf8.
s = pl.Series([10, 20, 99])
s.cast(pl.Utf8)

shape: (3,)
Series: '' [str]
[
	"10"
	"20"
	"99"
]

In [43]:
# Cast a Series of floating-point numbers to Utf8.
s = pl.Series([10.5, 20.3, 99.1])
s.cast(pl.Utf8)

shape: (3,)
Series: '' [str]
[
	"10.5"
	"20.3"
	"99.1"
]

In [44]:
# Create a Series of strings that look like integers.
s = pl.Series(['0', '1'])
s

shape: (2,)
Series: '' [str]
[
	"0"
	"1"
]

In [45]:
# Cast the strings to Int64.
s.cast(pl.Int64)

shape: (2,)
Series: '' [i64]
[
	0
	1
]

In [46]:
# Create a Series of strings that look like floating-point numbers.
s = pl.Series(['4.5', '3.19'])
s

shape: (2,)
Series: '' [str]
[
	"4.5"
	"3.19"
]

In [47]:
# Cast the strings to Float64.
s.cast(pl.Float64)

shape: (2,)
Series: '' [f64]
[
	4.5
	3.19
]

In [48]:
# Create a Series of strings in which one value ('NO ANSWER')
# is not numeric.
s = pl.Series(['4.5', '3.19', 'NO ANSWER'])
s

shape: (3,)
Series: '' [str]
[
	"4.5"
	"3.19"
	"NO ANSWER"
]

In [49]:
# Try to cast to Float64. 'NO ANSWER' cannot be parsed as a number,
# so the cast raises ComputeError. The exception is the point of this
# cell, so it is caught and printed instead of being left to
# interrupt "Restart & Run All".
try:
    display(s.cast(pl.Float64))
except pl.exceptions.ComputeError as exc:
    print(f'{type(exc).__name__}: {exc}')

ComputeError: Strict conversion from Utf8 to Float64 failed for values ["NO ANSWER"]. If you were trying to cast Utf8 to Date, Time, or Datetime, consider using `strptime`.

## Типы данных для работы с моментами времени

### Типы данных Date и Datetime

In [50]:
# Load the data with date parsing enabled: the Date column is read
# with the date dtype. Note: this rebinds df.
df = pl.read_csv('Data/applestock.csv', parse_dates=True)
df.head()

Date,Open,High,Low,Close,Volume,Adj Close
date,f64,f64,f64,f64,i64,f64
2016-05-16,92.389999,94.389999,91.650002,93.879997,61140600,93.879997
2016-05-13,90.0,91.669998,90.0,90.519997,44188200,90.519997
2016-05-12,92.720001,92.779999,89.470001,90.339996,76109800,90.339996
2016-05-11,93.480003,93.57,92.459999,92.510002,28539900,92.510002
2016-05-10,93.330002,93.57,92.110001,93.419998,33592500,93.419998


In [51]:
# Load the data with date parsing disabled: the Date column
# stays a string column.
df = pl.read_csv('Data/applestock.csv', parse_dates=False)
df.head()

Date,Open,High,Low,Close,Volume,Adj Close
str,f64,f64,f64,f64,i64,f64
"""2016-05-16""",92.389999,94.389999,91.650002,93.879997,61140600,93.879997
"""2016-05-13""",90.0,91.669998,90.0,90.519997,44188200,90.519997
"""2016-05-12""",92.720001,92.779999,89.470001,90.339996,76109800,90.339996
"""2016-05-11""",93.480003,93.57,92.459999,92.510002,28539900,92.510002
"""2016-05-10""",93.330002,93.57,92.110001,93.419998,33592500,93.419998


In [52]:
# Parse the Date strings into the Date dtype using an explicit
# year-month-day format; the parsed column replaces the original one.
df = df.with_column(pl.col('Date').str.strptime(
    pl.Date, fmt='%Y-%m-%d'))
df.head()

Date,Open,High,Low,Close,Volume,Adj Close
date,f64,f64,f64,f64,i64,f64
2016-05-16,92.389999,94.389999,91.650002,93.879997,61140600,93.879997
2016-05-13,90.0,91.669998,90.0,90.519997,44188200,90.519997
2016-05-12,92.720001,92.779999,89.470001,90.339996,76109800,90.339996
2016-05-11,93.480003,93.57,92.459999,92.510002,28539900,92.510002
2016-05-10,93.330002,93.57,92.110001,93.419998,33592500,93.419998


In [53]:
# Read a CSV file that holds date-and-time strings; parsing is
# switched off, so the column stays a string column.
df = pl.read_csv('Data/pickup_dates.csv', parse_dates=False)
df

pickup_datetime
str
"""14.03.2016 17:..."
"""12.06.2016 0:4..."
"""19.01.2016 11:..."
"""06.04.2016 19:..."
"""26.03.2016 13:..."
"""30.01.2016 22:..."


In [54]:
# Parse the date-and-time strings into the Datetime dtype
# (format: day.month.year hour:minute).
df = df.with_column(
    pl.col('pickup_datetime').str.strptime(
        pl.Datetime, fmt='%d.%m.%Y %H:%M'))
df

pickup_datetime
datetime[μs]
2016-03-14 17:24:00
2016-06-12 00:43:00
2016-01-19 11:35:00
2016-04-06 19:32:00
2016-03-26 13:30:00
2016-01-30 22:01:00


### Тип данных Duration

In [55]:
# Subtract the first row from the second one; the difference
# of two Datetime values has the Duration dtype.
df[1] - df[0]

pickup_datetime
duration[μs]
89d 7h 19m


# Чтение данных

In [56]:
# Load the bikes data again, this time with date parsing enabled.
# Note: this rebinds bikes.
bikes = pl.read_csv('Data/bikes.csv', parse_dates=True)

In [57]:
# Show the first 3 rows.
bikes.head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-06-28 19:01:00,2013-06-28 19:17:00,993,"""Lake Shore Dr ...",11.0,"""Michigan Ave &...",15.0,73.9,12.7,"""mostlycloudy"""
"""Male""",2013-06-28 22:53:00,2013-06-28 23:03:00,623,"""Clinton St & W...",31.0,"""Wells St & Wal...",19.0,69.1,6.9,"""partlycloudy"""
"""Male""",2013-06-30 14:43:00,2013-06-30 15:01:00,1040,"""Sheffield Ave ...",15.0,"""Dearborn St & ...",23.0,73.0,16.1,"""mostlycloudy"""


In [58]:
# Show the last 3 rows.
bikes.tail(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2017-12-30 13:34:00,2017-12-30 13:48:00,824,"""Kingsbury St &...",31.0,"""Halsted St & B...",20.0,5.0,16.1,"""partlycloudy"""
"""Female""",2017-12-31 09:30:00,2017-12-31 09:33:00,178,"""Clinton St & L...",23.0,"""Kingsbury St &...",31.0,7.0,11.5,"""partlycloudy"""
"""Male""",2017-12-31 15:22:00,2017-12-31 15:26:00,214,"""Clarendon Ave ...",15.0,"""Clifton Ave & ...",15.0,10.9,15.0,"""partlycloudy"""


# Изменение настроек вывода 

In [59]:
# Optional display limits: the maximum number of rows and columns
# shown for a table. Both calls are left disabled on purpose;
# uncomment them to apply the limits.
# pl.Config.set_tbl_rows(100)  
# pl.Config.set_tbl_cols(30) 

# Получение общей информации о датафрейме

In [60]:
# Print the shape: (number of rows, number of columns).
print(bikes.shape)

(50089, 11)


In [61]:
# len() of a DataFrame gives the number of rows.
print(len(bikes))

50089


In [62]:
# Print the column names.
print(bikes.columns)

['gender', 'starttime', 'stoptime', 'tripduration', 'from_station_name', 'start_capacity', 'to_station_name', 'end_capacity', 'temperature', 'wind_speed', 'events']


In [63]:
# Map each column name to its dtype.
var_types = dict(zip(bikes.columns, bikes.dtypes))
var_types

{'gender': polars.datatypes.Utf8,
 'starttime': datetime[μs],
 'stoptime': datetime[μs],
 'tripduration': polars.datatypes.Int64,
 'from_station_name': polars.datatypes.Utf8,
 'start_capacity': polars.datatypes.Float64,
 'to_station_name': polars.datatypes.Utf8,
 'end_capacity': polars.datatypes.Float64,
 'temperature': polars.datatypes.Float64,
 'wind_speed': polars.datatypes.Float64,
 'events': polars.datatypes.Utf8}

In [64]:
# Alternative: iterate over the columns as Series and print
# each column's name together with its dtype.
for col in bikes.get_columns():
    print(f"{col.name} - {col.dtype}")

gender - <class 'polars.datatypes.Utf8'>
starttime - datetime[μs]
stoptime - datetime[μs]
tripduration - <class 'polars.datatypes.Int64'>
from_station_name - <class 'polars.datatypes.Utf8'>
start_capacity - <class 'polars.datatypes.Float64'>
to_station_name - <class 'polars.datatypes.Utf8'>
end_capacity - <class 'polars.datatypes.Float64'>
temperature - <class 'polars.datatypes.Float64'>
wind_speed - <class 'polars.datatypes.Float64'>
events - <class 'polars.datatypes.Utf8'>


In [65]:
# Print the value counts of selected categorical columns.
# Note: lst is reused by the following cell.
lst = ['gender', 'events']
for col in lst:
    print(bikes[col].value_counts())

shape: (2, 2)
┌────────┬────────┐
│ gender ┆ counts │
│ ---    ┆ ---    │
│ str    ┆ u32    │
╞════════╪════════╡
│ Female ┆ 12435  │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ Male   ┆ 37654  │
└────────┴────────┘
shape: (11, 2)
┌──────────────┬────────┐
│ events       ┆ counts │
│ ---          ┆ ---    │
│ str          ┆ u32    │
╞══════════════╪════════╡
│ partlycloudy ┆ 16998  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ rain         ┆ 1828   │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ hazy         ┆ 348    │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ cloudy       ┆ 12075  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ ...          ┆ ...    │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ unknown      ┆ 4      │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ sleet        ┆ 16     │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ tstorms      ┆ 318    │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ fog          ┆ 122    │
└──────────────┴────────┘


In [66]:
# Print the unique values of the same categorical columns
# (relies on lst defined in the previous cell).
for col in lst:
    print(bikes.select([col]).unique())

shape: (2, 1)
┌────────┐
│ gender │
│ ---    │
│ str    │
╞════════╡
│ Male   │
├╌╌╌╌╌╌╌╌┤
│ Female │
└────────┘
shape: (11, 1)
┌──────────────┐
│ events       │
│ ---          │
│ str          │
╞══════════════╡
│ mostlycloudy │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ partlycloudy │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cloudy       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ clear        │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...          │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ fog          │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ snow         │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ sleet        │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ unknown      │
└──────────────┘


# Отбор данных
## Отбор с помощью индексирования

In [67]:
# Select a single column by name; the result is a Series.
bikes['gender']

shape: (50089,)
Series: 'gender' [str]
[
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Female"
	"Male"
	"Male"
	...
	"Male"
	"Female"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Male"
	"Female"
	"Male"
]

In [68]:
# Select several columns by passing a list of names to [];
# the result is a DataFrame.
bikes[['gender', 'tripduration']]

gender,tripduration
str,i64
"""Male""",993
"""Male""",623
"""Male""",1040
"""Male""",667
"""Male""",130
"""Male""",660
"""Male""",565
"""Male""",505
"""Male""",1300
"""Female""",922


In [69]:
# Select the first two rows of start_capacity and tripduration:
# [list of row positions, list of column names].
bikes[[0, 1], ['start_capacity', 'tripduration']]

start_capacity,tripduration
f64,i64
11.0,993
31.0,623


In [70]:
# Select the first four rows of the columns gender through
# tripduration: [row slice, column-name slice]. The row slice
# excludes its stop (rows 0-3); the column slice includes tripduration.
bikes[0:4, 'gender':'tripduration']

gender,starttime,stoptime,tripduration
str,datetime[μs],datetime[μs],i64
"""Male""",2013-06-28 19:01:00,2013-06-28 19:17:00,993
"""Male""",2013-06-28 22:53:00,2013-06-28 23:03:00,623
"""Male""",2013-06-30 14:43:00,2013-06-30 15:01:00,1040
"""Male""",2013-07-01 10:05:00,2013-07-01 10:16:00,667


In [71]:
# Select every 2nd row of every 2nd column by giving
# both slices a step of 2.
bikes[0::2, 'gender':'events':2]

gender,stoptime,from_station_name,to_station_name,temperature,events
str,datetime[μs],str,str,f64,str
"""Male""",2013-06-28 19:17:00,"""Lake Shore Dr ...","""Michigan Ave &...",73.9,"""mostlycloudy"""
"""Male""",2013-06-30 15:01:00,"""Sheffield Ave ...","""Dearborn St & ...",73.0,"""mostlycloudy"""
"""Male""",2013-07-01 11:18:00,"""Damen Ave & Pi...","""Damen Ave & Pi...",73.0,"""partlycloudy"""
"""Male""",2013-07-02 17:56:00,"""Clark St & Ran...","""Ravenswood Ave...",66.0,"""cloudy"""
"""Male""",2013-07-03 15:42:00,"""Clinton St & W...","""Wood St & Divi...",71.1,"""cloudy"""
"""Male""",2013-07-04 17:42:00,"""Morgan St & 18...","""Damen Ave & Pi...",79.0,"""mostlycloudy"""
"""Male""",2013-07-05 10:40:00,"""Jefferson St &...","""Jefferson St &...",79.0,"""partlycloudy"""
"""Female""",2013-07-06 12:49:00,"""Morgan St & La...","""Aberdeen St & ...",82.0,"""mostlycloudy"""
"""Male""",2013-07-08 17:14:00,"""Clinton St & T...","""State St & Har...",84.0,"""mostlycloudy"""
"""Male""",2013-07-09 14:42:00,"""Canal St & Jac...","""Millennium Par...",79.0,"""cloudy"""


In [72]:
# Alternative: give the row slice an explicit stop. The stop is taken
# from len(bikes) instead of a hard-coded row count, so the cell still
# covers every row if the data file changes.
bikes[0:len(bikes):2, 'gender':'events':2]

gender,stoptime,from_station_name,to_station_name,temperature,events
str,datetime[μs],str,str,f64,str
"""Male""",2013-06-28 19:17:00,"""Lake Shore Dr ...","""Michigan Ave &...",73.9,"""mostlycloudy"""
"""Male""",2013-06-30 15:01:00,"""Sheffield Ave ...","""Dearborn St & ...",73.0,"""mostlycloudy"""
"""Male""",2013-07-01 11:18:00,"""Damen Ave & Pi...","""Damen Ave & Pi...",73.0,"""partlycloudy"""
"""Male""",2013-07-02 17:56:00,"""Clark St & Ran...","""Ravenswood Ave...",66.0,"""cloudy"""
"""Male""",2013-07-03 15:42:00,"""Clinton St & W...","""Wood St & Divi...",71.1,"""cloudy"""
"""Male""",2013-07-04 17:42:00,"""Morgan St & 18...","""Damen Ave & Pi...",79.0,"""mostlycloudy"""
"""Male""",2013-07-05 10:40:00,"""Jefferson St &...","""Jefferson St &...",79.0,"""partlycloudy"""
"""Female""",2013-07-06 12:49:00,"""Morgan St & La...","""Aberdeen St & ...",82.0,"""mostlycloudy"""
"""Male""",2013-07-08 17:14:00,"""Clinton St & T...","""State St & Har...",84.0,"""mostlycloudy"""
"""Male""",2013-07-09 14:42:00,"""Canal St & Jac...","""Millennium Par...",79.0,"""cloudy"""


In [73]:
# Select from the fifth row (position 4) onwards and from column
# from_station_name to the last column, using open-ended slices.
bikes[4:, 'from_station_name':]

from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,f64,str,f64,f64,f64,str
"""Damen Ave & Pi...",19.0,"""Damen Ave & Pi...",19.0,73.0,17.3,"""partlycloudy"""
"""California Ave...",15.0,"""Clark St & Wri...",15.0,73.0,17.3,"""mostlycloudy"""
"""Clark St & Ran...",31.0,"""Ravenswood Ave...",19.0,66.0,15.0,"""cloudy"""
"""State St & Van...",27.0,"""Franklin St & ...",27.0,64.0,5.8,"""cloudy"""
"""Clinton St & W...",31.0,"""Wood St & Divi...",15.0,71.1,0.0,"""cloudy"""
"""Lakeview Ave &...",19.0,"""Racine Ave & C...",19.0,81.0,12.7,"""mostlycloudy"""
"""Morgan St & 18...",15.0,"""Damen Ave & Pi...",19.0,79.0,9.2,"""mostlycloudy"""
"""Ashland Ave & ...",15.0,"""Lincoln Ave & ...",19.0,79.0,10.4,"""mostlycloudy"""
"""Jefferson St &...",19.0,"""Jefferson St &...",19.0,79.0,0.0,"""partlycloudy"""
"""May St & Rando...",15.0,"""Millennium Par...",35.0,78.1,5.8,"""partlycloudy"""


In [74]:
# Select all rows (":") of start_capacity and tripduration
# by passing a list of column names after the comma.
bikes[:, ['start_capacity', 'tripduration']]

start_capacity,tripduration
f64,i64
11.0,993
31.0,623
15.0,1040
19.0,667
19.0,130
15.0,660
31.0,565
27.0,505
31.0,1300
19.0,922


In [75]:
# Select all rows of a range of columns given by name.
bikes[:, 'gender':'tripduration']

gender,starttime,stoptime,tripduration
str,datetime[μs],datetime[μs],i64
"""Male""",2013-06-28 19:01:00,2013-06-28 19:17:00,993
"""Male""",2013-06-28 22:53:00,2013-06-28 23:03:00,623
"""Male""",2013-06-30 14:43:00,2013-06-30 15:01:00,1040
"""Male""",2013-07-01 10:05:00,2013-07-01 10:16:00,667
"""Male""",2013-07-01 11:16:00,2013-07-01 11:18:00,130
"""Male""",2013-07-01 12:37:00,2013-07-01 12:48:00,660
"""Male""",2013-07-02 17:47:00,2013-07-02 17:56:00,565
"""Male""",2013-07-03 09:07:00,2013-07-03 09:16:00,505
"""Male""",2013-07-03 15:21:00,2013-07-03 15:42:00,1300
"""Female""",2013-07-04 15:00:00,2013-07-04 15:16:00,922


In [76]:
# Select the rows at positions 1, 5 and 6 (all columns).
bikes[[1, 5, 6], :]

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-06-28 22:53:00,2013-06-28 23:03:00,623,"""Clinton St & W...",31.0,"""Wells St & Wal...",19.0,69.1,6.9,"""partlycloudy"""
"""Male""",2013-07-01 12:37:00,2013-07-01 12:48:00,660,"""California Ave...",15.0,"""Clark St & Wri...",15.0,73.0,17.3,"""mostlycloudy"""
"""Male""",2013-07-02 17:47:00,2013-07-02 17:56:00,565,"""Clark St & Ran...",31.0,"""Ravenswood Ave...",19.0,66.0,15.0,"""cloudy"""


In [77]:
# Select every 10th row, starting with the first one.
bikes[0::10, :]

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-06-28 19:01:00,2013-06-28 19:17:00,993,"""Lake Shore Dr ...",11.0,"""Michigan Ave &...",15.0,73.9,12.7,"""mostlycloudy"""
"""Male""",2013-07-04 17:17:00,2013-07-04 17:42:00,1523,"""Morgan St & 18...",15.0,"""Damen Ave & Pi...",19.0,79.0,9.2,"""mostlycloudy"""
"""Female""",2013-07-09 17:39:00,2013-07-09 17:55:00,943,"""State St & Van...",27.0,"""State St & 16t...",15.0,82.9,9.2,"""mostlycloudy"""
"""Male""",2013-07-12 12:32:00,2013-07-12 12:41:00,512,"""Jefferson St &...",19.0,"""Morgan St & La...",15.0,81.0,4.6,"""partlycloudy"""
"""Female""",2013-07-14 14:08:00,2013-07-14 15:53:00,6274,"""Wabash Ave & R...",19.0,"""Lake Shore Dr ...",11.0,87.1,8.1,"""partlycloudy"""
"""Female""",2013-07-16 08:22:00,2013-07-16 08:42:00,1189,"""Noble St & Mil...",15.0,"""Canal St & Jac...",35.0,80.1,5.8,"""partlycloudy"""
"""Male""",2013-07-17 10:23:00,2013-07-17 10:40:00,1024,"""Clinton St & W...",31.0,"""Larrabee St & ...",15.0,88.0,5.8,"""partlycloudy"""
"""Male""",2013-07-18 17:35:00,2013-07-18 17:47:00,678,"""McClurg Ct & I...",23.0,"""Wells St & Eri...",19.0,88.0,6.9,"""partlycloudy"""
"""Female""",2013-07-22 07:59:00,2013-07-22 08:19:00,1224,"""Lincoln Ave & ...",19.0,"""Dearborn St & ...",19.0,73.4,0.0,"""cloudy"""
"""Male""",2013-07-23 14:55:00,2013-07-23 15:08:00,782,"""LaSalle St & I...",19.0,"""Canal St & Jac...",35.0,77.0,9.2,"""mostlycloudy"""


In [78]:
# Select the rows at positions 2 and 3 and the columns at
# positions 2 and 3 (the slice stop 4 is excluded).
bikes[2:4, 2:4]

stoptime,tripduration
datetime[μs],i64
2013-06-30 15:01:00,1040
2013-07-01 10:16:00,667


In [79]:
# Select the columns at positions 3 and 5: ":" for all rows,
# then a list of column positions.
bikes[:, [3, 5]]

tripduration,start_capacity
i64,f64
993,11.0
623,31.0
1040,15.0
667,19.0
130,19.0
660,15.0
565,31.0
505,27.0
1300,31.0
922,19.0


In [80]:
# Select the rows at positions 3 and 5: a list of row positions,
# then ":" for all columns.
bikes[[3, 5], :]

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-07-01 10:05:00,2013-07-01 10:16:00,667,"""Carpenter St &...",19.0,"""Clark St & Ran...",31.0,72.0,16.1,"""mostlycloudy"""
"""Male""",2013-07-01 12:37:00,2013-07-01 12:48:00,660,"""California Ave...",15.0,"""Clark St & Wri...",15.0,73.0,17.3,"""mostlycloudy"""


In [81]:
# Select a single value: the row at position 3
# in the tripduration column.
bikes[3, 'tripduration']

667

In [82]:
# The same value addressed by position only:
# row position 3, column position 3.
bikes[3, 3]

667

## Отбор с помощью выражений

In [83]:
# Filter rows by one condition: build the expression first,
# then pass it to filter().
filt = pl.col('tripduration') > 5000
bikes.filter(filt).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-07-09 13:12:00,2013-07-09 14:42:00,5396,"""Canal St & Jac...",35.0,"""Millennium Par...",35.0,79.0,13.8,"""cloudy"""
"""Female""",2013-07-14 14:08:00,2013-07-14 15:53:00,6274,"""Wabash Ave & R...",19.0,"""Lake Shore Dr ...",11.0,87.1,8.1,"""partlycloudy"""
"""Female""",2013-07-21 11:35:00,2013-07-21 13:54:00,8299,"""State St & 19t...",15.0,"""Sheffield Ave ...",15.0,82.9,5.8,"""mostlycloudy"""


In [84]:
# the same filter with the expression written inline
bikes.filter(pl.col('tripduration') > 5000).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-07-09 13:12:00,2013-07-09 14:42:00,5396,"""Canal St & Jac...",35.0,"""Millennium Par...",35.0,79.0,13.8,"""cloudy"""
"""Female""",2013-07-14 14:08:00,2013-07-14 15:53:00,6274,"""Wabash Ave & R...",19.0,"""Lake Shore Dr ...",11.0,87.1,8.1,"""partlycloudy"""
"""Female""",2013-07-21 11:35:00,2013-07-21 13:54:00,8299,"""State St & 19t...",15.0,"""Sheffield Ave ...",15.0,82.9,5.8,"""mostlycloudy"""


In [85]:
# filter rows by two conditions that must both hold (&)
filt1 = pl.col('tripduration') > 5000
filt2 = pl.col('gender') == 'Female'
filt = filt1 & filt2
bikes.filter(filt).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Female""",2013-07-14 14:08:00,2013-07-14 15:53:00,6274,"""Wabash Ave & R...",19.0,"""Lake Shore Dr ...",11.0,87.1,8.1,"""partlycloudy"""
"""Female""",2013-07-21 11:35:00,2013-07-21 13:54:00,8299,"""State St & 19t...",15.0,"""Sheffield Ave ...",15.0,82.9,5.8,"""mostlycloudy"""
"""Female""",2013-12-28 11:37:00,2013-12-28 13:34:00,7050,"""LaSalle St & W...",15.0,"""Theater on the...",15.0,44.1,12.7,"""clear"""


In [86]:
# the same two-condition filter written inline; each
# comparison is wrapped in parentheses before applying &
bikes.filter((pl.col('tripduration') > 5000) & 
             (pl.col('gender') == 'Female')).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Female""",2013-07-14 14:08:00,2013-07-14 15:53:00,6274,"""Wabash Ave & R...",19.0,"""Lake Shore Dr ...",11.0,87.1,8.1,"""partlycloudy"""
"""Female""",2013-07-21 11:35:00,2013-07-21 13:54:00,8299,"""State St & 19t...",15.0,"""Sheffield Ave ...",15.0,82.9,5.8,"""mostlycloudy"""
"""Female""",2013-12-28 11:37:00,2013-12-28 13:34:00,7050,"""LaSalle St & W...",15.0,"""Theater on the...",15.0,44.1,12.7,"""clear"""


In [87]:
# at least one of the conditions is true: | is a logical OR,
# so rows satisfying both conditions are kept as well
# (filt1 and filt2 come from the two-condition cell above)
filt = filt1 | filt2
bikes.filter(filt).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Female""",2013-07-04 15:00:00,2013-07-04 15:16:00,922,"""Lakeview Ave &...",19.0,"""Racine Ave & C...",19.0,81.0,12.7,"""mostlycloudy"""
"""Female""",2013-07-06 12:39:00,2013-07-06 12:49:00,610,"""Morgan St & La...",15.0,"""Aberdeen St & ...",15.0,82.0,5.8,"""mostlycloudy"""
"""Male""",2013-07-09 13:12:00,2013-07-09 14:42:00,5396,"""Canal St & Jac...",35.0,"""Millennium Par...",35.0,79.0,13.8,"""cloudy"""


In [88]:
# several alternative conditions on the same column events:
# keep a row when events equals any of the listed values
filt = ((pl.col('events') == 'rain') |
        (pl.col('events') == 'snow') |
        (pl.col('events') == 'tstorms') |
        (pl.col('events') == 'sleet'))
bikes.filter(filt).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Male""",2013-07-15 16:43:00,2013-07-15 16:55:00,727,"""Greenwood Ave ...",15.0,"""State St & Har...",19.0,82.9,5.8,"""rain"""
"""Male""",2013-07-21 16:35:00,2013-07-21 17:06:00,1809,"""Michigan Ave &...",23.0,"""Millennium Par...",35.0,82.4,11.5,"""tstorms"""
"""Male""",2013-07-21 16:47:00,2013-07-21 17:03:00,999,"""Carpenter St &...",19.0,"""Carpenter St &...",19.0,82.4,11.5,"""tstorms"""


In [89]:
# combine two filters: the weather condition (filt1) and
# a trip duration threshold (filt2) must both hold;
# note that filt1 and filt2 are reassigned here
filt1 = ((pl.col('events') == 'rain') |
         (pl.col('events') == 'snow') |
         (pl.col('events') == 'tstorms') |
         (pl.col('events') == 'sleet'))
filt2 = pl.col('tripduration') > 2000
filt = filt1 & filt2
bikes.filter(filt).head(3)

gender,starttime,stoptime,tripduration,from_station_name,start_capacity,to_station_name,end_capacity,temperature,wind_speed,events
str,datetime[μs],datetime[μs],i64,str,f64,str,f64,f64,f64,str
"""Female""",2014-03-19 07:23:00,2014-03-19 08:00:00,2181,"""Seeley Ave & R...",11.0,"""Franklin St & ...",23.0,43.0,6.9,"""rain"""
"""Male""",2014-09-12 14:20:00,2014-09-12 14:57:00,2213,"""Damen Ave & Pi...",19.0,"""California Ave...",15.0,52.0,12.7,"""rain"""
"""Male""",2014-09-30 08:21:00,2014-09-30 08:58:00,2246,"""Damen Ave & Me...",11.0,"""Wood St & Tayl...",15.0,46.9,11.5,"""rain"""


In [90]:
# select three columns for the trips made
# while it was snowing or raining
cols = ['starttime', 'temperature', 'events']
bikes.filter((pl.col('events') == 'snow') | 
             (pl.col('events') == 'rain'))[cols].head(3)

starttime,temperature,events
datetime[μs],f64,str
2013-07-15 16:43:00,82.9,"""rain"""
2013-07-26 19:10:00,66.9,"""rain"""
2013-07-30 18:53:00,69.1,"""rain"""


In [91]:
# select a single column by name
single_select_df = bikes.select('events')
single_select_df.head(10)

events
str
"""mostlycloudy"""
"""partlycloudy"""
"""mostlycloudy"""
"""mostlycloudy"""
"""partlycloudy"""
"""mostlycloudy"""
"""cloudy"""
"""cloudy"""
"""cloudy"""
"""mostlycloudy"""


In [92]:
# select several columns by passing a list of names
list_select_df = bikes.select(['tripduration', 'events'])
list_select_df.head(10)

tripduration,events
i64,str
993,"""mostlycloudy"""
623,"""partlycloudy"""
1040,"""mostlycloudy"""
667,"""mostlycloudy"""
130,"""partlycloudy"""
660,"""mostlycloudy"""
565,"""cloudy"""
505,"""cloudy"""
1300,"""cloudy"""
922,"""mostlycloudy"""


In [93]:
# select columns by data type: every Int64 column
dtype_select_df = bikes.select(pl.col(pl.Int64))
dtype_select_df.head(10)

tripduration
i64
993
623
1040
667
130
660
565
505
1300
922


# Агрегирование данных

## Группировка и агрегирование с помощью одного столбца

In [94]:
# mean trip duration for every kind of weather
# observed during the trip
(
    bikes
    .groupby('events')
    .agg(pl.col('tripduration').mean().alias('avg_tripduration'))
    .sort(by='events')
)

events,avg_tripduration
str,f64
"""clear""",767.71824
"""cloudy""",690.291346
"""fog""",570.557377
"""hazy""",691.301724
"""mostlycloudy""",736.609963
"""partlycloudy""",725.389928
"""rain""",633.748906
"""sleet""",541.25
"""snow""",592.860515
"""tstorms""",636.160377


## Группировка и агрегирование с помощью нескольких столбцов

In [95]:
# mean trip duration for every combination of
# gender and weather observed during the trip
(
    bikes
    .groupby(['gender', 'events'])
    .agg(pl.col('tripduration').mean().alias('avg_tripduration'))
    .sort(by='events')
)

gender,events,avg_tripduration
str,str,f64
"""Female""","""clear""",889.229955
"""Male""","""clear""",730.481688
"""Male""","""cloudy""",667.281823
"""Female""","""cloudy""",764.428671
"""Female""","""fog""",698.933333
"""Male""","""fog""",528.695652
"""Female""","""hazy""",797.823529
"""Male""","""hazy""",656.874525
"""Female""","""mostlycloudy""",819.058638
"""Male""","""mostlycloudy""",708.844771


In [96]:
# mean trip duration and mean temperature
# for every kind of weather event
(
    bikes
    .groupby('events')
    .agg(
        [
            pl.col('tripduration').mean().alias('avg_tripduration'),
            pl.col('temperature').mean().alias('avg_temp'),
        ]
    )
    .sort(by='events')
)

events,avg_tripduration,avg_temp
str,f64,f64
"""clear""",767.71824,59.531476
"""cloudy""",690.291346,56.621143
"""fog""",570.557377,50.235246
"""hazy""",691.301724,55.594253
"""mostlycloudy""",736.609963,67.278551
"""partlycloudy""",725.389928,65.444558
"""rain""",633.748906,57.066247
"""sleet""",541.25,31.24375
"""snow""",592.860515,26.654506
"""tstorms""",636.160377,74.200943


In [97]:
# compute the mean trip duration, the mean temperature and the
# maximum trip duration for every kind of weather; all three
# aggregations are evaluated in one groupby pass, so the columns
# are aligned by construction and no horizontal concat of two
# independently sorted results is needed
bikes.groupby('events').agg(
    [
        pl.col('tripduration').mean().alias('avg_tripduration'),
        pl.col('temperature').mean().alias('avg_temp'),
        pl.col('tripduration').max().alias('max_tripduration'),
    ]
).sort(by='events')

events,avg_tripduration,avg_temp,max_tripduration
str,f64,f64,i64
"""clear""",767.71824,59.531476,73591
"""cloudy""",690.291346,56.621143,86188
"""fog""",570.557377,50.235246,1776
"""hazy""",691.301724,55.594253,7739
"""mostlycloudy""",736.609963,67.278551,63155
"""partlycloudy""",725.389928,65.444558,85442
"""rain""",633.748906,57.066247,28994
"""sleet""",541.25,31.24375,1257
"""snow""",592.860515,26.654506,8309
"""tstorms""",636.160377,74.200943,2868


## Группировка с помощью сводных таблиц

In [98]:
# load the data; the file uses ';' as the field separator
ins = pl.read_csv('Data/StateFarm_missing.csv', sep=';')
ins.head()

Customer Lifetime Value,Coverage,Education,EmploymentStatus,Gender,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
f64,str,str,str,str,i64,i64,i64,i64,i64,i64,str
2763.519279,"""Basic""","""Bachelor""","""Employed""","""F""",56274,,32.0,5.0,,1.0,"""No"""
,,"""Bachelor""","""Unemployed""","""F""",0,,13.0,42.0,,,"""No"""
,,,"""Employed""","""F""",48767,108.0,,38.0,0.0,,"""No"""
7645.861827,"""Basic""","""Bachelor""",,,0,106.0,18.0,,,7.0,"""No"""
2813.692575,"""Basic""","""Bachelor""",,"""M""",43836,73.0,12.0,,,1.0,"""No"""


In [99]:
# see how the mean customer income varies across combinations
# of education (rows) and gender (columns)
ins.pivot(index='Education', 
          columns='Gender', 
          values='Income',
          aggregate_fn='mean')

Education,F,null,M
str,f64,f64,f64
"""Bachelor""",37972.366023,0.0,37040.25
,48767.0,,
"""College""",37740.598055,0.0,36781.846543
"""Master""",44329.25419,77026.0,45259.22
"""High School or...",36211.804149,,35891.237416
"""Doctor""",45731.160256,,39342.279503


In [100]:
# see how the mean customer income varies across combinations
# of education (rows) and gender (columns), then convert the
# values to integers (the cast drops the fractional part:
# 37740.598 becomes 37740)
res = ins.pivot(
    index='Education', 
    columns='Gender', 
    values='Income',
    aggregate_fn='mean')

# cast every pivoted column without spelling out the generated
# column names (one of them is literally 'null'), so the cell
# keeps working if the set of Gender values changes
res.select(
    [
        pl.col('Education'),
        pl.exclude('Education').cast(pl.Int32),
    ]
)

Education,F,null,M
str,i32,i32,i32
"""Bachelor""",37972,0.0,37040.0
,48767,,
"""College""",37740,0.0,36781.0
"""Master""",44329,77026.0,45259.0
"""High School or...",36211,,35891.0
"""Doctor""",45731,,39342.0


In [101]:
# the same numbers obtained in long form
# through a groupby aggregation
res = (
    ins
    .groupby(['Gender', 'Education'])
    .agg(pl.col('Income').mean().alias('mean_salary'))
)
res.select(
    [
        pl.col(['Gender', 'Education']),
        pl.col('mean_salary').cast(pl.Int32),
    ]
)

Gender,Education,mean_salary
str,str,i32
"""M""","""College""",36781.0
,"""College""",0.0
"""M""","""Bachelor""",37040.0
"""M""","""Master""",45259.0
"""M""","""High School or...",35891.0
"""M""","""Doctor""",39342.0
"""F""","""Doctor""",45731.0
"""F""","""Master""",44329.0
"""F""",,48767.0
,"""Master""",77026.0


In [102]:
# maximum customer income across combinations of
# education (rows) and gender (columns), sorted by education
ins.pivot(index='Education', 
          columns='Gender', 
          values='Income', 
          aggregate_fn='max').sort(by='Education')

Education,F,null,M
str,i64,i64,i64
,48767,,
"""Bachelor""",99803,0.0,99981.0
"""College""",99961,0.0,99816.0
"""Doctor""",98912,,99443.0
"""High School or...",99841,,99874.0
"""Master""",99875,77026.0,99960.0


In [103]:
# maximum customer income across combinations of gender and
# education with the roles swapped: gender is now the row
# index and the education levels become the columns
ins.pivot(index='Gender', 
          columns='Education', 
          values='Income', 
          aggregate_fn='max')

Gender,Bachelor,null,College,Master,High School or Below,Doctor
str,i64,i64,i64,i64,i64,i64
"""F""",99803,48767.0,99961,99875,99841.0,98912.0
,0,,0,77026,,
"""M""",99981,,99816,99960,99874.0,99443.0


In [104]:
# mean customer lifetime value across combinations of
# employment status (rows) and education (columns);
# the results are converted to integers
emp_edu_mean_clv = ins.pivot(
    index='EmploymentStatus', 
    columns='Education', 
    values='Customer Lifetime Value', 
    aggregate_fn='mean')

# cast every pivoted column without listing the generated
# column names by hand (one of them is literally 'null'),
# so the cell does not depend on the set of Education values
emp_edu_mean_clv.select(
    [
        pl.col('EmploymentStatus'),
        pl.exclude('EmploymentStatus').cast(pl.Int32),
    ]
).sort(by='EmploymentStatus')

EmploymentStatus,Bachelor,null,College,Master,High School or Below,Doctor
str,i32,i32,i32,i32,i32,i32
,5229,7388.0,5384,,,
"""Disabled""",6729,,6756,7964.0,9983.0,7272.0
"""Employed""",8224,,8047,8042.0,8566.0,7538.0
"""Medical Leave""",7708,,6952,7834.0,8235.0,11690.0
"""Retired""",7846,,7051,12442.0,5860.0,4518.0
"""Unemployed""",7103,,7681,8807.0,7739.0,6966.0


In [105]:
# compute the size of each unique combination of employment
# status and education: the Education values themselves are
# counted within every cell of the pivot
ins.pivot(index='EmploymentStatus', 
          columns='Education',
          values='Education',
          aggregate_fn='count').sort(by='EmploymentStatus')

EmploymentStatus,Bachelor,null,College,Master,High School or Below,Doctor
str,u32,u32,u32,u32,u32,u32
,2,2.0,1,,,
"""Disabled""",114,,83,37.0,107.0,21.0
"""Employed""",1553,1.0,1499,492.0,1412.0,230.0
"""Medical Leave""",113,,133,25.0,106.0,15.0
"""Retired""",79,,91,19.0,62.0,1.0
"""Unemployed""",635,,614,86.0,710.0,50.0


In [106]:
# use two columns for the vertical grouping
# (employment status, gender) and one column for the
# horizontal grouping (education)
ins.pivot(index=['EmploymentStatus', 'Gender'], 
          columns='Education',
          values='Income', 
          aggregate_fn='max').sort(by=['EmploymentStatus', 'Gender'])

EmploymentStatus,Gender,Bachelor,null,College,Master,High School or Below,Doctor
str,str,i64,i64,i64,i64,i64,i64
,,0.0,,,,,
,"""F""",,,,,,
,"""M""",43836.0,,83689.0,,,
"""Disabled""","""F""",29633.0,,29958.0,29981.0,28672.0,29950.0
"""Disabled""","""M""",28898.0,,28617.0,28245.0,29295.0,29808.0
"""Employed""",,,,,77026.0,,
"""Employed""","""F""",99803.0,48767.0,99961.0,99875.0,99841.0,98912.0
"""Employed""","""M""",99981.0,,99816.0,99960.0,99874.0,99443.0
"""Medical Leave""","""F""",29957.0,,29539.0,27229.0,29658.0,26463.0
"""Medical Leave""","""M""",29926.0,,29723.0,26840.0,29664.0,23053.0


In [107]:
# use one column for the vertical grouping (employment status)
# and two columns for the horizontal grouping (gender and
# education)
# NOTE: in the output shown below each listed column is pivoted
# on its own (F/null/M followed by the education levels) rather
# than as gender x education combinations
ins.pivot(index='EmploymentStatus', 
          columns=['Gender', 'Education'],
          values='Income', 
          aggregate_fn='max').sort(by='EmploymentStatus')

EmploymentStatus,F,null,M,Bachelor,null,College,Master,High School or Below,Doctor
str,i64,i64,i64,i64,i64.1,i64,i64,i64,i64
,,0.0,83689,43836,,83689,,,
"""Disabled""",29981.0,,29808,29633,,29958,29981.0,29295.0,29950.0
"""Employed""",99961.0,77026.0,99981,99981,48767.0,99961,99960.0,99874.0,99443.0
"""Medical Leave""",29957.0,,29926,29957,,29723,27229.0,29664.0,26463.0
"""Retired""",28321.0,,29692,29465,,29692,26161.0,28140.0,19186.0
"""Unemployed""",0.0,0.0,0,0,,0,0.0,0.0,0.0


In [108]:
# aggregate two value columns at once
# (income and customer lifetime value)
# NOTE: in the output shown below the gender columns appear
# twice under the same names (F, null, M, F, null, M): first
# for Income, then for Customer Lifetime Value
ins.pivot(
    index='Education', 
    columns='Gender',
    values=['Income', 'Customer Lifetime Value'],
    aggregate_fn='mean').sort(by='Education')

Education,F,null,M,F,null,M
str,f64,f64,f64,f64.1,f64.1,f64.1
,48767.0,,,,7388.178085,
"""Bachelor""",37972.366023,0.0,37040.25,7980.275871,7645.861827,7675.774393
"""College""",37740.598055,0.0,36781.846543,7757.306941,,7868.564624
"""Doctor""",45731.160256,,39342.279503,7530.432422,,7702.351623
"""High School or...",36211.804149,,35891.237416,8504.800424,,8093.913831
"""Master""",44329.25419,77026.0,45259.22,8457.905556,8798.797003,8015.928886


# Импутация пропусков и замена значений

In [109]:
# load the data; note that the name df is reused here
# for a different dataset than earlier in the notebook
df = pl.read_csv('Data/titanic_train.csv')
df.head(10)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow...","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. ...","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis...","""female""",26.0,0,0,"""STON/O2. 31012...",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs....","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil...","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. Jam...","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. ...","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Maste...","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. ...","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N...","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


In [110]:
# show the number of missing values in every column
df.select(
    pl.col(df.columns).is_null().sum()  
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,177,0,0,0,0,687,2


In [111]:
# the same counts obtained by looping over the columns
# (each one is a Series) and printing one line per column
for col in df.get_columns():
    print(f"{col.name} - {col.is_null().sum()}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


In [112]:
# missing values can be replaced with the column mean
# (df itself is not modified, only the result is shown)
df.select(
    pl.col('Age').fill_null(strategy='mean')
).head(10)

Age
f64
22.0
38.0
26.0
35.0
35.0
29.699118
54.0
2.0
27.0
14.0


In [113]:
# missing values can be replaced with zero
# (df itself is not modified, only the result is shown)
df.select(
    pl.col('Age').fill_null(strategy='zero')
).head(10)

Age
f64
22.0
38.0
26.0
35.0
35.0
0.0
54.0
2.0
27.0
14.0


In [114]:
# missing values can be replaced with an
# out-of-range sentinel value, here -999
df.select(
    pl.col('Age').fill_null(value=-999)
).head(10)

Age
f64
22.0
38.0
26.0
35.0
35.0
-999.0
54.0
2.0
27.0
14.0


In [115]:
# impute the missing values and store the result back in df:
# Cabin gets the constant 'C85', Age the sentinel -999 and
# Embarked the most frequent value of the column
df = df.with_columns(
    [
        pl.col('Cabin').fill_null(value='C85'),
        pl.col('Age').fill_null(value=-999),
        # mode() can return several values when there is a tie;
        # first() reduces it to a single fill value
        pl.col('Embarked').fill_null(
            value=pl.col('Embarked').mode().first())
    ]
)

In [116]:
# show the number of missing values in every column
# again, now after the imputation
df.select(
    pl.col(df.columns).is_null().sum()  
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0


In [117]:
# take a look at the Age variable after the imputation
df.select(pl.col('Age')).head(10)

Age
f64
22.0
38.0
26.0
35.0
35.0
-999.0
54.0
2.0
27.0
14.0


In [118]:
# list of values that have to be replaced
from_ = [35, -999]
# list of replacement values, paired with from_ by position
to_ = [25, 0]

In [119]:
# define a helper that replaces the listed values of a column
def replace(column, from_, to_):
    """Build an expression that swaps selected values of ``column``.

    Parameters
    ----------
    column : str
        Name of the column to transform.
    from_ : iterable
        Values to look for.
    to_ : iterable
        Replacement values, paired with ``from_`` by position via
        ``zip`` (surplus items of the longer iterable are ignored).

    Returns
    -------
    An expression aliased to ``column`` in which every value equal to
    the i-th item of ``from_`` becomes the i-th item of ``to_``; all
    other values are kept as they are.
    """
    pairs = list(zip(from_, to_))

    # nothing to replace: return the column unchanged instead of
    # failing on from_[0]
    if not pairs:
        return pl.col(column).alias(column)

    # seed the when/then chain with the first pair and extend it with
    # the remaining pairs only, so that no pair is tested twice
    (first_from, first_to), *rest = pairs
    branch = pl.when(pl.col(column) == first_from).then(first_to)

    for from_value, to_value in rest:
        branch = branch.when(pl.col(column) == from_value).then(to_value)

    return branch.otherwise(pl.col(column)).alias(column)

In [120]:
# illustrate the replacements (df is not modified here)
df.select(replace('Age', from_, to_)).head(10)

Age
f64
22.0
38.0
26.0
25.0
25.0
0.0
54.0
2.0
27.0
14.0


In [121]:
# store the replacements in df; with_columns (the same method
# the imputation cell uses) replaces with_column, which is
# deprecated and removed in later polars releases
df = df.with_columns([replace('Age', from_, to_)])

# Манипуляции с датафреймами

In [122]:
# vertical concatenation: stack the rows of two
# frames that share the same columns
df_v1 = pl.DataFrame({'a': [1], 'b': [3]})
df_v2 = pl.DataFrame({'a': [2], 'b': [4]})

df_vertical_concat = pl.concat([df_v1, df_v2], how='vertical')

display(df_v1, df_v2, df_vertical_concat)

a,b
i64,i64
1,3


a,b
i64,i64
2,4


a,b
i64,i64
1,3
2,4


In [123]:
# horizontal concatenation: put the columns of two
# frames of equal height side by side
df_h1 = pl.DataFrame({'l1': [1, 2], 'l2': [3, 4]})
df_h2 = pl.DataFrame({'r1': [5, 6], 'r2': [7, 8], 'r3': [9, 10]})

df_horizontal_concat = pl.concat([df_h1, df_h2], how='horizontal')

display(df_h1, df_h2, df_horizontal_concat)

l1,l2
i64,i64
1,3
2,4


r1,r2,r3
i64,i64,i64
5,7,9
6,8,10


l1,l2,r1,r2,r3
i64,i64,i64,i64,i64
1,3,5,7,9
2,4,6,8,10


In [124]:
# diagonal concatenation: the frames only partly share
# their columns; the result shown below holds the union
# of the columns, with nulls where a frame had no value
df_d1 = pl.DataFrame({'a': [1], 'b': [3]})
df_d2 = pl.DataFrame({'a': [2], 'd': [4]})

df_diagonal_concat = pl.concat([df_d1, df_d2], how='diagonal')

display(df_d1, df_d2, df_diagonal_concat)

a,b
i64,i64
1,3


a,d
i64,i64
2,4


a,b,d
i64,i64,i64
1,3.0,
2,,4.0


In [125]:
# frame with a car identifier and the make of the car
df_cars = pl.DataFrame({'id': ['a', 'b', 'c'],
                        'make': ['ford', 'toyota', 'bmw']})
df_cars

id,make
str,str
"""a""","""ford"""
"""b""","""toyota"""
"""c""","""bmw"""


In [126]:
# frame with a car identifier and the cost of a repair;
# car 'c' has two repairs on record
df_repairs = pl.DataFrame({'id': ['c', 'c'],
                           'cost': [100, 200]})
df_repairs

id,cost
str,i64
"""c""",100
"""c""",200


In [127]:
# inner join on id: only the ids present in both frames
# are kept (one row per matching repair)
df_inner_join = df_cars.join(df_repairs, on='id', how='inner')
df_inner_join

id,make,cost
str,str,i64
"""c""","""bmw""",100
"""c""","""bmw""",200


In [128]:
# outer join on id: cars without a repair are kept
# as well, with a null cost
df_outer_join = df_cars.join(df_repairs, on='id', how='outer')
df_outer_join

id,make,cost
str,str,i64
"""a""","""ford""",
"""b""","""toyota""",
"""c""","""bmw""",100.0
"""c""","""bmw""",200.0


In [129]:
# semi join on id: keep the cars that have a match in
# df_repairs; no columns of df_repairs are added and the
# matching car appears once
df_semi_join = df_cars.join(df_repairs, on='id', how='semi')
df_semi_join

id,make
str,str
"""c""","""bmw"""


In [130]:
# anti join on id: keep the cars that have
# no match in df_repairs
df_anti_join = df_cars.join(df_repairs, on='id', how='anti')
df_anti_join

id,make
str,str
"""a""","""ford"""
"""b""","""toyota"""


In [131]:
# load a dataset that contains 4 time series (one per segment);
# parse_dates=True lets the reader parse the timestamp column
# as dates
data = pl.read_csv('Data/example_dataset.csv', 
                   parse_dates=True)
data.head()

timestamp,segment,target
date,str,i64
2019-01-01,"""segment_a""",170
2019-01-02,"""segment_a""",243
2019-01-03,"""segment_a""",267
2019-01-04,"""segment_a""",287
2019-01-05,"""segment_a""",279


In [132]:
# spread the values of the segment variable into columns;
# the cells of the new columns hold the values of the
# dependent variable target
pivot_data = data.pivot(index='timestamp', 
                        columns='segment', 
                        values='target')
pivot_data.head()

timestamp,segment_a,segment_b,segment_c,segment_d
date,i64,i64,i64,i64
2019-01-01,170,102,92,238
2019-01-02,243,123,107,358
2019-01-03,267,130,103,366
2019-01-04,287,138,103,385
2019-01-05,279,137,104,384


In [133]:
# "melt" the wide frame back into long form: the segment
# column names go to the column segment and their values
# to the column target
melt_data = pivot_data.melt(
    id_vars='timestamp', 
    variable_name='segment',
    value_name='target')
melt_data.head()

timestamp,segment,target
date,str,i64
2019-01-01,"""segment_a""",170
2019-01-02,"""segment_a""",243
2019-01-03,"""segment_a""",267
2019-01-04,"""segment_a""",287
2019-01-05,"""segment_a""",279


# Больше материалов в рамках подписки вы найдете по адресу https://boosty.to/gewissta