# Polars: The Super Fast Dataframe Library for Python ... bye bye Pandas?

[Polars: The Super Fast Dataframe Library for Python ... bye bye Pandas?](https://www.youtube.com/watch?v=CByx7XjYMhw) <br> To install polars: https://pypi.org/project/polars/

In [1]:
# import polars
import polars as pl

In [2]:
# read csv file
df = pl.read_csv("../data/Polars/StudentsPerformance.csv")

In [3]:
df

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50


In [4]:
# get column names
df.columns

['id',
 'gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

## 1 Select Columns

In [5]:
# Select 1 column
df.select(pl.col('gender'))

gender
str
"""female"""
"""female"""
"""female"""
"""male"""
"""male"""
"""female"""
"""female"""
"""male"""
"""male"""
"""female"""


In [6]:

# Select 2 column
df.select(pl.col('gender', 'math score'))

gender,math score
str,i64
"""female""",72
"""female""",69
"""female""",90
"""male""",47
"""male""",76
"""female""",71
"""female""",88
"""male""",40
"""male""",64
"""female""",38


In [7]:
# Select all column
df.select(pl.col('*'))

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50


## 2 Create columns

In [8]:
df.head()

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75


In [9]:
# create a new column "sum" by summing 'math score' and reading score'
df.with_columns(
    (pl.col('math score') + pl.col('reading score')).alias('sum')   )

# pandas: df['sum'] = df['math score'] + df['reading score']

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,sum
i64,str,str,str,str,str,i64,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74,144
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88,159
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93,185
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44,104
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75,154
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78,154
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92,183
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39,83
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67,128
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50,98


## 3 Filter

In [10]:
df

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50


In [11]:
# Simple filtering
df.filter(pl.col('gender')=='female')

# pandas: df[df['gender'] == 'female']

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50
13,"""female""","""group B""","""high school""","""standard""","""none""",65,81,73
15,"""female""","""group A""","""master's degre...","""standard""","""none""",50,53,58
16,"""female""","""group C""","""some high scho...","""standard""","""none""",69,75,78
18,"""female""","""group B""","""some high scho...","""free/reduced""","""none""",18,32,28


In [12]:
# Multiple filtering
df.filter(
    (pl.col('gender') == 'female') &
    (pl.col('race/ethnicity') == 'group B')
)

# pandas: df[(df['gender'] == 'female') & (df['race/ethnicity'] == 'group B')]

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
i64,str,str,str,str,str,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50
13,"""female""","""group B""","""high school""","""standard""","""none""",65,81,73
18,"""female""","""group B""","""some high scho...","""free/reduced""","""none""",18,32,28
22,"""female""","""group B""","""some college""","""free/reduced""","""completed""",65,75,70
32,"""female""","""group B""","""some college""","""standard""","""none""",63,65,61
43,"""female""","""group B""","""associate's de...","""standard""","""none""",53,58,65


## 4 Group By

In [13]:
# Group by
df.groupby("race/ethnicity").count()


race/ethnicity,count
str,u32
"""group B""",190
"""group E""",140
"""group A""",89
"""group C""",319
"""group D""",262


## 5 Join df

In [14]:
df2 = pl.read_csv("../data/Polars/LanguageScore.csv")


In [15]:
# Join dataframes
df.join(df2, on='id')

# pandas: pd.merge(df, df2, on='id', how='left')

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
i64,str,str,str,str,str,i64,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88,67
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93,34
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44,33
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78,51
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92,95
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39,92
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67,56
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50,60


In [16]:
# Inner, left and outer join
df.join(df2, on='id', how='inner')
df.join(df2, on='id', how='left')
df.join(df2, on='id', how='outer')

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
i64,str,str,str,str,str,i64,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88,67
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93,34
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44,33
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78,51
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92,95
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39,92
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67,56
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50,60


## 6 Concat

In [17]:
df = pl.read_csv("../data/Polars/StudentsPerformance.csv")
df2 = pl.read_csv("../data/Polars/LanguageScore.csv")

In [18]:
# Concatenate df and df2
pl.concat([df, df2], how="horizontal")

DuplicateError: Cannot do hstack operation. Column with name: id already exists

In [19]:
# drop column "id" in df2
df2 = df2.drop("id")

# Concatenate dataframes
pl.concat([df, df2], how="horizontal")

id,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
i64,str,str,str,str,str,i64,i64,i64,i64
1,"""female""","""group B""","""bachelor's deg...","""standard""","""none""",72,72,74,74
2,"""female""","""group C""","""some college""","""standard""","""completed""",69,90,88,67
3,"""female""","""group B""","""master's degre...","""standard""","""none""",90,95,93,34
4,"""male""","""group A""","""associate's de...","""free/reduced""","""none""",47,57,44,33
5,"""male""","""group C""","""some college""","""standard""","""none""",76,78,75,75
6,"""female""","""group B""","""associate's de...","""standard""","""none""",71,83,78,51
7,"""female""","""group B""","""some college""","""standard""","""completed""",88,95,92,95
8,"""male""","""group B""","""some college""","""free/reduced""","""none""",40,43,39,92
9,"""male""","""group D""","""high school""","""free/reduced""","""completed""",64,64,67,56
10,"""female""","""group B""","""high school""","""free/reduced""","""none""",38,60,50,60


## Additional Speed Check

In [20]:
import numpy as np

In [34]:
pl.Series.rand(1000, pl.Int64, 1, 101)


AttributeError: type object 'Series' has no attribute 'rand'