In [2]:
import pandas as pd
import numpy as np

bios = pd.read_csv('./data/bios.csv')
bios['born_date'] = pd.to_datetime(bios['born_date'])
results = pd.read_parquet('./data/results.parquet')
coffee = pd.read_csv('https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv')
coffee['price'] = np.where(coffee['Coffee Type']=='Espresso',3.99,5.99)
coffee['revenue'] = coffee['price'] * coffee['Units Sold']

In [12]:
#pivot
pivot = coffee.pivot(columns='Coffee Type', index='Day',values='revenue')
pivot
pivot.loc['Monday','Latte']
pivot.sum()
pivot.sum(axis=1)

Day
Friday       389.2
Monday       189.6
Saturday     389.2
Sunday       389.2
Thursday     339.3
Tuesday      239.5
Wednesday    289.4
dtype: float64

In [18]:
# groupby plus counting
bios.groupby(bios['born_date'].dt.year)['name']
bios.groupby(bios['born_date'].dt.year)['name'].count().reset_index().sort_values('name',ascending=False)
bios['month_born'] = bios['born_date'].dt.month
bios['year_born'] = bios['born_date'].dt.year
bios.groupby([bios['year_born'],bios['month_born']])['name'].count().reset_index().sort_values('name',ascending=False)

Unnamed: 0,year_born,month_born,name
1437,1970.0,1.0,239
1461,1972.0,1.0,229
1629,1986.0,1.0,227
1497,1975.0,1.0,227
1617,1985.0,1.0,225
...,...,...,...
95,1857.0,5.0,1
96,1857.0,7.0,1
97,1857.0,8.0,1
98,1857.0,9.0,1


In [30]:
#shift function
coffee['yestarday_revenue'] = coffee['revenue'].shift(2)
coffee['pct_change'] = coffee['revenue'] / coffee['yestarday_revenue'] * 100
#you can also shift backwards
# coffee['yestarday_revenue'] = coffee['revenue'].shift(-2)

coffee


Unnamed: 0,Day,Coffee Type,Units Sold,price,revenue,yestarday_revenue,pct_change
0,Monday,Espresso,25,3.99,99.75,,
1,Monday,Latte,15,5.99,89.85,,
2,Tuesday,Espresso,30,3.99,119.7,99.75,120.0
3,Tuesday,Latte,20,5.99,119.8,89.85,133.333333
4,Wednesday,Espresso,35,3.99,139.65,119.7,116.666667
5,Wednesday,Latte,25,5.99,149.75,119.8,125.0
6,Thursday,Espresso,40,3.99,159.6,139.65,114.285714
7,Thursday,Latte,30,5.99,179.7,149.75,120.0
8,Friday,Espresso,45,3.99,179.55,159.6,112.5
9,Friday,Latte,35,5.99,209.65,179.7,116.666667


In [32]:
bios['height_rank'] = bios['height_cm'].rank()
bios.sort_values(['height_rank'],ascending=False)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date,height_rank
89070,89782,Yao Ming,1980-09-12,Xuhui District,Shanghai,CHN,People's Republic of China,226.0,141.0,,106651.0
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,,106649.5
6978,7013,Arvydas Sabonis,1964-12-19,Kaunas,Kaunas,LTU,Lithuania Soviet Union,223.0,122.0,,106649.5
89075,89787,Roberto Dueñas,1975-11-01,Madrid,Madrid,ESP,Spain,221.0,137.0,,106647.0
120266,122147,Zhang Zhaoxu,1987-11-18,Binzhou,Shandong,CHN,People's Republic of China,221.0,110.0,,106647.0
...,...,...,...,...,...,...,...,...,...,...,...
145490,149217,Sin Ye-Chan,1995-06-13,,,,Republic of Korea,,,,
145491,149218,Matthew Wepke,1989-12-05,,,,Jamaica,,,,
145492,149219,Carlos García-Ordóñez,1927-04-24,La Habana (Havana),Ciudad de La Habana,CUB,Cuba,,,2019-11-24,
145493,149220,Landysh Falyakhova,1998-08-31,Dva Polya Artash,Respublika Tatarstan,RUS,ROC,,,,


In [35]:
bios['height_rank'] = bios['height_cm'].rank(method='min', ascending=False)

In [44]:
#cumulative sum
coffee.select_dtypes(['int','float']).cumsum()
#cumulative sum for one column
coffee['cumulative_revenue'] = coffee['revenue'].cumsum()
#cumulative sum over last 3 days
coffee['cumulative_revenue_3_days'] = coffee['revenue'].rolling(3).sum()


In [50]:
#to check package version
pd.__version__

'2.0.0'

In [53]:
# difference between numpy and pyarrow
results_numpy = pd.read_csv('./data/results.csv')
results_arrow = pd.read_csv('./data/results.csv', engine='pyarrow', dtype_backend='pyarrow')

In [56]:
#arrow is using strings instead of objects which is more memory efficient
#article to learn more https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i
results_arrow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308408 entries, 0 to 308407
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype          
---  ------      --------------   -----          
 0   year        305807 non-null  double[pyarrow]
 1   type        308408 non-null  string[pyarrow]
 2   discipline  308408 non-null  string[pyarrow]
 3   event       308408 non-null  string[pyarrow]
 4   as          308408 non-null  string[pyarrow]
 5   athlete_id  308408 non-null  int64[pyarrow] 
 6   noc         308408 non-null  string[pyarrow]
 7   team        308408 non-null  string[pyarrow]
 8   place       283193 non-null  double[pyarrow]
 9   tied        308408 non-null  bool[pyarrow]  
 10  medal       308408 non-null  string[pyarrow]
dtypes: bool[pyarrow](1), double[pyarrow](2), int64[pyarrow](1), string[pyarrow](7)
memory usage: 37.4 MB


In [57]:
results_numpy['as'].str.contains('Kamil')

0         False
1         False
2         False
3         False
4         False
          ...  
308403    False
308404    False
308405    False
308406    False
308407    False
Name: as, Length: 308408, dtype: bool

In [58]:
results_arrow['as'].str.contains('Kamil')

0         False
1         False
2         False
3         False
4         False
          ...  
308403    False
308404    False
308405    False
308406    False
308407    False
Name: as, Length: 308408, dtype: bool[pyarrow]

In [61]:
filtered_bios = bios[(bios['born_region']== 'Slaskie') & (bios['height_cm'] > 180)]
filtered_bios

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date


In [None]:
bios[(bios['born_country']=='POL')|]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
217,218,Kálmán Kirchmayer,1897-11-15,Rzeszów,Podkarpackie,POL,Hungary,,,1990-11-06
503,505,Ursula Büschking,1941-09-23,Braniewo,Warmińsko-Mazurskie,POL,West Germany,169.0,55.0,
515,517,Willi Gabriel,1939-09-24,Owiesno,Dolnośląskie,POL,West Germany,165.0,63.0,
1061,1068,Bożena Bąk,1966-01-28,Głubczyce,Opolskie,POL,Poland,180.0,70.0,
1062,1069,Magdalena Grzybowska,1978-11-22,Poznań,Wielkopolskie,POL,Poland,184.0,66.0,
...,...,...,...,...,...,...,...,...,...,...
145032,148744,Marek Kania,1999-04-02,Warszawa (Warsaw),Mazowieckie,POL,Poland,,,
145033,148745,Damian Żurek,1999-09-17,Tomaszów Mazowiecki,Łódzkie,POL,Poland,,,
145035,148747,Kamila Stormowska,2000-04-12,Elbląg,Warmińsko-Mazurskie,POL,Poland,,,
145036,148748,Łukasz Kuczyński,1999-06-23,Sokolka,Podlaskie,POL,Poland,,,


In [4]:
#to randomly sample rows from a DataFrame
bios.sample(5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
54643,55036,Zoran Petrović,1960-08-22,Beograd (Belgrade),Beograd,SRB,Yugoslavia,203.0,98.0,
134504,137574,Randi Griffin,1988-09-02,Apex,North Carolina,USA,Korea Team,165.0,58.0,
24445,24634,Henning Enoksen,1935-09-26,,,,Denmark,178.0,70.0,2016-09-25
124550,126855,Kyu Maung,1970-08-11,,,,Myanmar,165.0,60.0,
8767,8814,Miguel Amarista,1939-09-30,,,,Venezuela,173.0,63.0,


In [6]:
#you can install packages in Jupyter Notebook using pip
!pip install matplotlib

