In [2]:
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import csv
import pandas as pd

days = pa.array([1, 12, 17, 23, 28], type=pa.int8())

months = pa.array([1, 3, 5, 7, 1], type=pa.int8())

years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16())

birthdays_table = pa.table([days, months, years],
                           names=["days", "months", "years"])

print(birthdays_table)
print(pc.value_counts(birthdays_table["years"]))

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1990,2000,1995,2000,1995]]
-- is_valid: all not null
-- child 0 type: int16
  [
    1990,
    2000,
    1995
  ]
-- child 1 type: int64
  [
    1,
    2,
    2
  ]


# Read files with regular Pandas
to run code below download dataset from https://www.kaggle.com/datasets/antonkozyriev/game-recommendations-on-steam

In [5]:
%%timeit
pd.read_csv("recommendations.csv")


24.6 s ± 814 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
pd.read_csv("games.csv")

103 ms ± 3.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
pd_rec = pd.read_csv("recommendations.csv")
pd_games = pd.read_csv("games.csv")

In [5]:
pd_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37839599 entries, 0 to 37839598
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7   review_id       int64  
dtypes: bool(1), float64(1), int64(5), object(1)
memory usage: 2.0+ GB


In [6]:
pd_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50564 entries, 0 to 50563
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app_id          50564 non-null  int64  
 1   title           50564 non-null  object 
 2   date_release    50564 non-null  object 
 3   win             50564 non-null  bool   
 4   mac             50564 non-null  bool   
 5   linux           50564 non-null  bool   
 6   rating          50564 non-null  object 
 7   positive_ratio  50564 non-null  int64  
 8   user_reviews    50564 non-null  int64  
 9   price_final     50564 non-null  float64
 10  price_original  50564 non-null  float64
 11  discount        50564 non-null  float64
 12  steam_deck      50564 non-null  bool   
dtypes: bool(4), float64(3), int64(3), object(3)
memory usage: 3.7+ MB


# Run statistics on regular Pandas

In [7]:
%%timeit
(
    pd
    .merge(pd_rec, pd_games, on="app_id", how="inner")
    .groupby(['title'])[['title', 'user_reviews']]
    .max()
    .sort_values(['user_reviews'], ascending=False)
)



25.2 s ± 1.24 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
pd_result = pd.merge(pd_rec, pd_games, on="app_id", how="inner")
pd_result.groupby(['title'])[['user_reviews']].max().sort_values(['user_reviews'], ascending=False)

Unnamed: 0_level_0,user_reviews
title,Unnamed: 1_level_1
Counter-Strike: Global Offensive,7297791
PUBG: BATTLEGROUNDS,2187691
Dota 2,1998934
Grand Theft Auto V,1431104
Tom Clancy's Rainbow Six® Siege,966803
...,...
Dragon Must Die,10
Gatlin',10
The Chronicles of King Arthur - Episode 1: Excalibur,10
Rich Taste of Ecchi,10


# Read filed with Arrow Pandas

In [3]:
%%timeit
pd.read_csv("recommendations.csv", dtype_backend="pyarrow", engine="pyarrow")

4.32 s ± 2.09 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
pd.read_csv("games.csv", dtype_backend="pyarrow", engine="pyarrow")

16.4 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
pa_rec = pd.read_csv("recommendations.csv", dtype_backend="pyarrow", engine="pyarrow")
pa_games = pd.read_csv("games.csv", dtype_backend="pyarrow", engine="pyarrow")

In [12]:
pa_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37839599 entries, 0 to 37839598
Data columns (total 8 columns):
 #   Column          Dtype               
---  ------          -----               
 0   app_id          int64[pyarrow]      
 1   helpful         int64[pyarrow]      
 2   funny           int64[pyarrow]      
 3   date            date32[day][pyarrow]
 4   is_recommended  bool[pyarrow]       
 5   hours           double[pyarrow]     
 6   user_id         int64[pyarrow]      
 7   review_id       int64[pyarrow]      
dtypes: bool[pyarrow](1), date32[day][pyarrow](1), double[pyarrow](1), int64[pyarrow](5)
memory usage: 1.8 GB


In [13]:
pa_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50564 entries, 0 to 50563
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype               
---  ------          --------------  -----               
 0   app_id          50564 non-null  int64[pyarrow]      
 1   title           50564 non-null  string[pyarrow]     
 2   date_release    50564 non-null  date32[day][pyarrow]
 3   win             50564 non-null  bool[pyarrow]       
 4   mac             50564 non-null  bool[pyarrow]       
 5   linux           50564 non-null  bool[pyarrow]       
 6   rating          50564 non-null  string[pyarrow]     
 7   positive_ratio  50564 non-null  int64[pyarrow]      
 8   user_reviews    50564 non-null  int64[pyarrow]      
 9   price_final     50564 non-null  double[pyarrow]     
 10  price_original  50564 non-null  double[pyarrow]     
 11  discount        50564 non-null  double[pyarrow]     
 12  steam_deck      50564 non-null  bool[pyarrow]       
dtypes: bool[pyarrow]

# Run statistics on Arrow Pandas

In [14]:
%%timeit
(
    pd
    .merge(pa_rec, pa_games, on="app_id", how="inner")
    .groupby(['title'])[['title', 'user_reviews']]
    .max()
    .sort_values(['user_reviews'], ascending=False)
)

20.6 s ± 668 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
pa_result = pd.merge(pa_rec, pa_games, on="app_id", how="inner")
pa_result.groupby(['title'])[['user_reviews']].max().sort_values(['user_reviews'], ascending=False)

Unnamed: 0_level_0,user_reviews
title,Unnamed: 1_level_1
Counter-Strike: Global Offensive,7297791
PUBG: BATTLEGROUNDS,2187691
Dota 2,1998934
Grand Theft Auto V,1431104
Tom Clancy's Rainbow Six® Siege,966803
...,...
重装燎原,10
雾尽时分,10
그랑 엠파이어 : 아마네,10
𝄢Gaia's Melody II: ECHOED MEMORIES,10


# Comparison

In [20]:
run_results = {
    "arrow": [2.16 * 1000, 6.85, 20.6 * 1000],
    "pandas": [26.6 * 1000, 95.3, 25.2 * 1000],
    "columns": ["rec_read", "game_read", "stats"]
}


In [21]:
import plotly.express as px
fig = px.bar(
    run_results, x="columns", y=["arrow", "pandas"],
             barmode='group', height=400
             )
fig.show()