# Intro to dataframes

In [2]:
import pandas as pd

df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=["A", "B", "C"], index=['x','y','z'])

In [3]:
df

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [4]:
df.head(2) # head shows the top 5 lines or the number of lines you passed

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6


In [5]:
df.tail(1) # tail is the same as head, but from the bottom

Unnamed: 0,A,B,C
z,7,8,9


In [6]:
df.columns # returns the name of the columns

Index(['A', 'B', 'C'], dtype='object')

In [7]:
df.index # returns the name of the rows (index)

Index(['x', 'y', 'z'], dtype='object')

In [8]:
df.columns.tolist()
df.index.tolist() # tolist() gets the values into a list

['x', 'y', 'z']

In [9]:
df.info() # gives you info about your dataframe

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


In [10]:
df.describe() # returns a description of statistics of the data

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [11]:
df.nunique() # returns how many unique values are in each column

A    3
B    3
C    3
dtype: int64

In [12]:
df.shape # returns n * m or row * columns lenght

(3, 3)

In [13]:
df.size

9

---

# Loading in Dataframes in Files

In [71]:
coffee = pd.read_csv('https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv')

In [15]:
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [16]:
olympics_data = pd.read_excel('../data/test/olympics-data.xlsx') # using openpyxl

In [17]:
results = pd.read_parquet('../data/test/results.parquet') # using pyarrow

In [18]:
bios = pd.read_csv('https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/bios.csv')

In [19]:
bios.to_excel('../data/test/bios.xlsx') # converting csv data into an excel file

In [20]:
bios.to_parquet('../data/test/bios.parquet') # converting csv data into a parquet file

---

# Accessing Data with Pandas

In [21]:
coffee.sample(10, random_state=1) # returns random selected data
# we can pass random_state so it do not change the data in every run

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45
4,Wednesday,Espresso,35
1,Monday,Latte,15
12,Sunday,Espresso,45
0,Monday,Espresso,25
13,Sunday,Latte,35


In [22]:
coffee.loc[0:3, ["Day", "Units Sold"]] # coffee.loc[rows, cols]: coffee.loc[[1,2,3]], coffee.loc[0:3], coffee.loc[[1,2,3], ["Day", "Units Sold]]
# returns the specified rows and columns

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20


In [23]:
coffee.iloc[0:3, [0, 2]] # returns the specified rows and columns using indexes
# notice that the upper bound is not included in iloc, but it is in loc

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30


In [24]:
coffee.index = coffee.Day # sets the rows' names from numbers to days
# We could have written 'coffee["Day"] also

In [25]:
coffee.head() # we could see that coffee.index is different now

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35


In [26]:
coffee.loc["Monday": "Wednesday", "Units Sold"] # now we can use the days strings to choose the rows

Day
Monday       25
Monday       15
Tuesday      30
Tuesday      20
Wednesday    35
Wednesday    25
Name: Units Sold, dtype: int64

In [27]:
coffee.iloc[0:2] # iloc keeps working the same way

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15


In [28]:
coffee.iloc[1, 2] = 10 # we can change one or multiple values ([1:3, 2]) like this

In [29]:
coffee.head() # row index 1 (Monday) and col index 2 (Units Sold) is now 10

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,10
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35


In [30]:
coffee.at["Monday", "Units Sold"] # gets a specific value

Day
Monday    25
Monday    10
Name: Units Sold, dtype: int64

In [31]:
coffee.iat[0,0] # gets a specific value by index

'Monday'

In [32]:
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[1, 1]) # sort values by units sold and then by coffee type if some values are the same
# ascending is optional to describe the behavior of the sorting by index for the first list

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Latte,10
Tuesday,Tuesday,Latte,20
Monday,Monday,Espresso,25
Wednesday,Wednesday,Latte,25
Tuesday,Tuesday,Espresso,30
Thursday,Thursday,Latte,30
Wednesday,Wednesday,Espresso,35
Friday,Friday,Latte,35
Saturday,Saturday,Latte,35
Sunday,Sunday,Latte,35


In [33]:
for index, row in coffee.iterrows(): # we can iterate through coffee, but it loses some performance 
    print(index) # only use when it is necessary, otherwise use pandas built-in functions instead
    print(row)
    print("\n")

Monday
Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: Monday, dtype: object


Monday
Day            Monday
Coffee Type     Latte
Units Sold         10
Name: Monday, dtype: object


Tuesday
Day             Tuesday
Coffee Type    Espresso
Units Sold           30
Name: Tuesday, dtype: object


Tuesday
Day            Tuesday
Coffee Type      Latte
Units Sold          20
Name: Tuesday, dtype: object


Wednesday
Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: Wednesday, dtype: object


Wednesday
Day            Wednesday
Coffee Type        Latte
Units Sold            25
Name: Wednesday, dtype: object


Thursday
Day            Thursday
Coffee Type    Espresso
Units Sold           40
Name: Thursday, dtype: object


Thursday
Day            Thursday
Coffee Type       Latte
Units Sold           30
Name: Thursday, dtype: object


Friday
Day              Friday
Coffee Type    Espresso
Units Sold           45
Name: Friday, dtype: object


---

# Filtering Data

In [34]:
bios.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [35]:
bios.tail()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
145495,149222,Polina Luchnikova,2002-01-30,Serov,Sverdlovsk,RUS,ROC,167.0,61.0,
145496,149223,Valeriya Merkusheva,1999-09-20,Moskva (Moscow),Moskva,RUS,ROC,168.0,65.0,
145497,149224,Yuliya Smirnova,1998-05-08,Kotlas,Arkhangelsk,RUS,ROC,163.0,55.0,
145498,149225,André Foussard,1899-05-19,Niort,Deux-Sèvres,FRA,France,166.0,,1986-03-18
145499,149814,Bill Phillips,1913-07-15,Dulwich Hill,New South Wales,AUS,Australia,,,2003-10-20


In [36]:
bios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   name          145500 non-null  object 
 2   born_date     143693 non-null  object 
 3   born_city     110908 non-null  object 
 4   born_region   110908 non-null  object 
 5   born_country  110908 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB


In [37]:
bios.loc[bios['height_cm'] > 215, ['name', 'height_cm']] # we can use loc to filter data like this

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0
5796,Andy Campbell,218.0
6223,Lars Hansen,216.0
6270,Hu Zhangbao,216.0
6409,Sergey Kovalenko,216.0
6420,Jānis Krūmiņš,218.0


In [38]:
bios[bios['height_cm'] > 215][['name', 'height_cm']] # the same as the line before, but a short syntax version

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0
5796,Andy Campbell,218.0
6223,Lars Hansen,216.0
6270,Hu Zhangbao,216.0
6409,Sergey Kovalenko,216.0
6420,Jānis Krūmiņš,218.0


In [39]:
bios[(bios['height_cm'] > 215) & (bios['born_country'] == 'USA')] # the same as the line before, but a short syntax version

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,
6722,6755,Shaquille O'Neal,1972-03-06,Newark,New Jersey,USA,United States,216.0,137.0,
6937,6972,David Robinson,1965-08-06,Key West,Florida,USA,United States,216.0,107.0,
123850,126093,Tyson Chandler,1982-10-02,Hanford,California,USA,United States,216.0,107.0,


In [40]:
bios[bios['name'].str.contains('keith', case=False)] # search name col in bios with name containing the name keith (case=False is to ignore capitalization)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
1897,1907,Keith Hanlon,1966-09-01,,,,Ireland,,,
3505,3517,Keith Wallace,1961-03-29,Preston,England,GBR,Great Britain,165.0,51.0,1999-12-31
6228,6255,Keith Hartley,1940-10-15,Vancouver,British Columbia,CAN,Canada,200.0,85.0,
8898,8946,Keith Mwila,1966-01-01,,,,Zambia,,,1993-01-09
12053,12118,Keith Hervey,1898-11-03,Fulham,England,GBR,Great Britain,,,1973-02-22
...,...,...,...,...,...,...,...,...,...,...
109900,111105,Keith Cumberpatch,1927-08-25,Christchurch,Canterbury,NZL,New Zealand,,,2013-11-15
115973,117348,Keith Sanderson,1975-02-02,Plymouth,Massachusetts,USA,United States,183.0,95.0,
117676,119195,Duncan Keith,1983-07-16,Winnipeg,Manitoba,CAN,Canada,185.0,88.0,
122121,124176,Keith Ferguson,1979-09-07,Sale,Victoria,AUS,Australia,176.0,78.0,


In [41]:
bios[bios['name'].str.contains('keith|patrick', case=False)] # we can use regex syntax too like 'keith|patrick'
# if we want to make regex not possible to use we can add 'regex=False' property

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
6,7,Patrick Chila,1969-11-27,Ris-Orangis,Essonne,FRA,France,180.0,73.0,
119,120,Patrick Wheatley,1899-01-20,Vryheid,KwaZulu-Natal,RSA,Great Britain,,,1967-11-05
319,320,Patrick De Koning,1961-04-23,Dendermonde,Oost-Vlaanderen,BEL,Belgium,178.0,92.0,
1897,1907,Keith Hanlon,1966-09-01,,,,Ireland,,,
2115,2125,Patrick Jopp,1962-01-08,,,,Switzerland,176.0,67.0,
...,...,...,...,...,...,...,...,...,...,...
143975,147633,Patrick Chinyemba,2001-01-03,,,,Zambia,,,
144172,147850,Patrick Jakob,1996-10-17,Sankt Johann in Tirol,Tirol,AUT,Austria,,,
144547,148239,Patrick Galbraith,1986-03-11,Haderslev,Syddanmark,DEN,Denmark,,,
144565,148257,Patrick Russell,1993-01-04,Gentofte,Hovedstaden,DEN,Denmark,186.0,93.0,


---

### Regular expressions to filter data

In [42]:
bios[bios['name'].str.contains(r'^a.*a$', case=False, regex=True, na=False)] # Find names starting with 'a' and ending with 'a'

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
254,255,Alan Budikusuma,1968-03-29,Surabaya,Jawa Timur,INA,Indonesia,178.0,71.0,
267,268,Ardy Wiranata,1970-02-10,Jakarta,Daerah Khusus Ibukota Jakarta,INA,Indonesia,171.0,67.0,
271,272,Ambika Radhika,1973-05-17,,,,India,,,
419,420,Ascensión Guerra,1955-02-28,Madrid,Madrid,ESP,Spain,171.0,63.0,
705,709,Andrea Vieira,1971-02-05,São Paulo,São Paulo,BRA,Brazil,,,
...,...,...,...,...,...,...,...,...,...,...
145152,148869,Anna Vostrikova,2002-08-24,Rybinsk,Yaroslavl,RUS,ROC,,,
145197,148915,Amy Baserga,2000-09-29,Zürich,Zürich,SUI,Switzerland,,,
145334,149056,Anastasiya Shabotova,2006-01-17,Moskva (Moscow),Moskva,RUS,Ukraine,,,
145448,149172,Aleksandra Glazkova,2006-03-15,Miass,Chelyabinsk,RUS,ROC,,,


---

In [43]:
bios[bios['born_country'].isin(['USA', 'FRA', 'GBR']) & bios['name'].str.startswith('Jessie')] 

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
557,559,Jessie Wadworth,1863-11-12,Devizes,England,GBR,Great Britain,,,1936-07-08
28242,28454,Jessie Kite,1892-05-17,Hackney,England,GBR,Great Britain,,,1958-05-01
77203,77794,Jessie Cross,1909-04-14,New York,New York,USA,United States,,,1986-03-29
117711,119236,Jessie Vetter,1985-12-19,Madison,Wisconsin,USA,United States,173.0,70.0,
126338,128689,Jessie Diggins,1991-08-26,Minneapolis,Minnesota,USA,United States,163.0,58.0,


In [44]:
bios.query('born_country == "USA" and born_city == "Seattle"')

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
11030,11088,David Halpern,1955-08-18,Seattle,Washington,USA,United States,178.0,79.0,
12800,12870,Todd Trewin,1958-04-20,Seattle,Washington,USA,United States,180.0,75.0,
15476,15583,Scott McKinley,1968-10-15,Seattle,Washington,USA,United States,183.0,75.0,
29079,29293,Joyce Tanac,1950-09-27,Seattle,Washington,USA,United States,156.0,49.0,
31135,31371,Bill Kuhlemeier,1908-01-14,Seattle,Washington,USA,United States,,,2001-07-08
...,...,...,...,...,...,...,...,...,...,...
133392,136331,Hans Struzyna,1989-03-31,Seattle,Washington,USA,United States,188.0,91.0,
135448,138662,Maude Davis Crossland,2003-03-19,Seattle,Washington,USA,Colombia,,,
136993,140229,Jenell Berhorst,2003-12-13,Seattle,Washington,USA,United States,,,
143507,147159,Nevin Harrison,2002-06-02,Seattle,Washington,USA,United States,175.0,73.0,


---

# Adding / Removing Columns

In [45]:
import numpy as np
coffee['new_price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99) # creates a new column and gives the value of 3.99 to the Espresso coffe type and 5.99 for the other ones

In [46]:
coffee.drop("Monday") # it deletes the 'Monday' index rows temporarily

Unnamed: 0_level_0,Day,Coffee Type,Units Sold,new_price
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tuesday,Tuesday,Espresso,30,3.99
Tuesday,Tuesday,Latte,20,5.99
Wednesday,Wednesday,Espresso,35,3.99
Wednesday,Wednesday,Latte,25,5.99
Thursday,Thursday,Espresso,40,3.99
Thursday,Thursday,Latte,30,5.99
Friday,Friday,Espresso,45,3.99
Friday,Friday,Latte,35,5.99
Saturday,Saturday,Espresso,45,3.99
Saturday,Saturday,Latte,35,5.99


In [47]:
coffee['price'] = 4.99

In [48]:
coffee.drop(columns=['price']) # eliminates price column temporarily

Unnamed: 0_level_0,Day,Coffee Type,Units Sold,new_price
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,Monday,Espresso,25,3.99
Monday,Monday,Latte,10,5.99
Tuesday,Tuesday,Espresso,30,3.99
Tuesday,Tuesday,Latte,20,5.99
Wednesday,Wednesday,Espresso,35,3.99
Wednesday,Wednesday,Latte,25,5.99
Thursday,Thursday,Espresso,40,3.99
Thursday,Thursday,Latte,30,5.99
Friday,Friday,Espresso,45,3.99
Friday,Friday,Latte,35,5.99


In [49]:
coffee.drop(columns=['price'], inplace=True) # eliminates price column permanently

In [50]:
coffee = coffee.drop(columns=['price']) # this should work too

KeyError: "['price'] not found in axis"

In [None]:
coffee = coffee[['Day', 'Coffee Type', 'Units Sold', 'new_price']] # this should work too

In [None]:
coffee['revenue'] = coffee['Units Sold'] * coffee['new_price'] # we can create a new column with the values of another columns

In [None]:
coffee.rename(columns={'new_price': 'price'}) # this is temporary, if we want to make it permanently, we need to add 'inplace=True' or 'coffee = ...' just like we did with 'drop'

In [None]:
bios_new = bios.copy()

In [None]:
bios_new['first_name'] = bios_new['name'].str.split(' ').str[0]

In [None]:
bios_new.query('first_name == "Jessy"')

In [None]:
bios_new.info()

In [None]:
bios_new['born_datetime'] = pd.to_datetime(bios_new['born_date'])

In [None]:
bios_new.info()

In [None]:
bios_new['born_year'] = bios_new['born_datetime'].dt.year

bios_new[['name', 'born_year']]

In [None]:
bios_new.to_csv('../data/test/bios_new.csv', index=False)

In [None]:
bios_new['height_category'] = bios_new['height_cm'].apply(lambda x: "Short" if x < 165 else ("Average" if x < 185 else "Tall"))

bios_new[['name', 'height_cm', 'height_category']]

In [None]:
def categorize_athlete(row):
    if row['height_cm'] < 175 or row['weight_kg'] <= 70:
        return 'Lightweight'
    elif row['height_cm'] < 185 or row['weight_kg'] <= 80:
        return 'Middleweight'
    else:
        return 'Heavyweight'

bios['Category'] = bios.apply(categorize_athlete, axis=1) # axis = 1 is rows and axios = 0 is cols

In [None]:
bios

---

# Merging & Concatenating Data

In [None]:
nocs = pd.read_csv('../data/test/noc_regions.csv')

In [None]:
nocs.head()

In [None]:
bios_new = pd.merge(bios, nocs, left_on='born_country', right_on='NOC', how='left', suffixes=['_bios', '_nocsdf'])

In [52]:
bios_new

NameError: name 'bios_new' is not defined

In [53]:
bios_new.rename(columns={'region': 'born_country_full'}, inplace=True)

NameError: name 'bios_new' is not defined

In [54]:
bios_new

NameError: name 'bios_new' is not defined

In [55]:
bios_new[bios_new['NOC_bios'] != bios_new['region']][['name', 'NOC_bios', 'region']]

NameError: name 'bios_new' is not defined

In [56]:
usa = bios[bios['born_country'] == 'USA'].copy()
gbr = bios[bios['born_country'] == 'GBR'].copy()

In [57]:
new_df = pd.concat([usa, gbr])

In [58]:
new_df

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
54,55,Monique Javer,1967-07-22,Burlingame,California,USA,Great Britain,177.0,64.0,
960,964,Xóchitl Escobedo,1968-09-17,West Covina,California,USA,Mexico,170.0,60.0,
961,965,Angélica Gavaldón,1973-10-03,El Centro,California,USA,Mexico,160.0,54.0,
1231,1238,Bert Schneider,1897-07-01,Cleveland,Ohio,USA,Canada,,,1986-02-20
1345,1352,Laura Berg,1975-01-06,Santa Fe Springs,California,USA,United States,168.0,61.0,
...,...,...,...,...,...,...,...,...,...,...
144811,148512,Benjamin Alexander,1983-05-08,London,England,GBR,Jamaica,,,
144815,148517,Ashley Watson,1993-10-28,Peterborough,England,GBR,Jamaica,,,
145005,148716,Peder Kongshaug,2001-08-13,Wimbledon,England,GBR,Norway,184.0,86.0,
145319,149041,Axel Brown,1992-04-02,Harrogate,England,GBR,Trinidad and Tobago,,,


In [59]:
results.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [60]:
combined_df = pd.merge(results, bios, on='athlete_id', how='left')

In [None]:
combined_df.head()

---

# Handling Null Values

In [72]:
coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [74]:
coffee.loc[[0,1], 'Units Sold'] = np.nan # we can set values to NaN like this

In [77]:
coffee.head() # values 0, 1 are now NaN
coffee.isna().sum() # we can check how many NaN values are in each column

Day            0
Coffee Type    0
Units Sold     2
dtype: int64

In [84]:
coffee.fillna(coffee['Units Sold'].mean(), inplace=True) # we can fill NaN values with the mean of the column
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,35.0
1,Monday,Latte,35.0
2,Tuesday,Espresso,30.0
3,Tuesday,Latte,20.0
4,Wednesday,Espresso,35.0


In [87]:
coffee.loc[[2,3], 'Units Sold'] = np.nan # we can set values to NaN like this
coffee['Units Sold'].interpolate() # we can use interpolate to fill NaN values

0     35.0
1     35.0
2     35.0
3     35.0
4     35.0
5     25.0
6     40.0
7     30.0
8     45.0
9     35.0
10    45.0
11    35.0
12    45.0
13    35.0
Name: Units Sold, dtype: float64

In [None]:
coffee.loc[[2,3], 'Units Sold'] = np.nan # we can set values to NaN like this
coffee.dropna(subset=['Units Sold'], inplace=True) # we can drop NaN values like this
coffee.head()

InvalidIndexError: ([1, 2], ['Units Sold'])

In [None]:
coffee[coffee['Units Sold'].isna()] # we can check if there are any NaN values in the column