**DATA FRAME**

*Category: Overview / Title*

In [147]:
# Category: Setup - import libraries
# Purpose: Import pandas for DataFrame creation and IO
import pandas as pd

*Category: DataFrame creation / examples*

In [148]:
# Category: DataFrame creation
# Purpose: build a 3x3 demo frame (rows X/Y/Z, columns A/B/C) used by the
# inspection cells that follow, then preview it.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    index=["X", "Y", "Z"],
    columns=["A", "B", "C"],
)
df.head()

Unnamed: 0,A,B,C
X,1,2,3
Y,4,5,6
Z,7,8,9


*Category: DataFrame inspection*

In [149]:
# Category: DataFrame inspection - tail
# Purpose: show the last 2 rows of df
df.tail(2)

Unnamed: 0,A,B,C
Y,4,5,6
Z,7,8,9


In [150]:
# Category: DataFrame inspection - columns
# Purpose: show the column labels (returned as a pandas Index)
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [151]:
# Category: DataFrame inspection - index list
# Purpose: convert the row labels to a plain Python list
df.index.tolist()

['X', 'Y', 'Z']

In [152]:
# Category: DataFrame inspection - info
# Purpose: print a summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


In [153]:
# Category: DataFrame inspection - nunique
# Purpose: count the distinct values in each column (one count per column)
df.nunique()

A    3
B    3
C    3
dtype: int64

In [154]:
# Category: DataFrame inspection - unique values in column 'A'
# Purpose: list the distinct values of a single column as a NumPy array
df['A'].unique()

array([1, 4, 7])

*Category: Shape and size*

In [155]:
# Category: Shape - rows x columns
# Purpose: get the (number of rows, number of columns) tuple
df.shape

(3, 3)

In [156]:
# Category: Size - number of elements
# Purpose: get the total cell count (rows x columns)
df.size

9

*Category: Input / Output - reading files*

In [157]:
# Category: I/O - read coffee CSV into a DataFrame
# Purpose: demonstrate read_csv and preview the first rows.
# The path is relative, so coffee.csv must be reachable from the kernel's
# working directory (normally the notebook folder).
coffee = pd.read_csv("coffee.csv")
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [158]:
# Category: I/O - read parquet results
# Purpose: load results.parquet (relative path) into a DataFrame and preview
# the first rows
results = pd.read_parquet('results.parquet')
results.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [159]:
# Category: I/O - read bios CSV
# Purpose: load bios.csv (relative path) into a DataFrame and preview the
# first rows
bios = pd.read_csv('bios.csv')
bios.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [160]:
# Category: I/O - read Excel sheet
# Purpose: load only the sheet named "results" from olympics-data.xlsx
# (relative path) and preview the first rows
olympics_data = pd.read_excel('olympics-data.xlsx', sheet_name="results")
olympics_data.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


*Category: Safety check - ensure DataFrame loaded*

In [161]:
# Category: Safety - load 'coffee' only when it is not already defined
# Purpose: skip the CSV read if a global named 'coffee' already exists.
# Note: the guard checks only the name; an existing 'coffee' is reused as-is,
# including any changes made to it since it was loaded.
if 'coffee' not in globals():
    coffee = pd.read_csv("coffee.csv")
coffee.head()  # preview the first rows

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


*Category: Displaying DataFrames*

In [162]:
# Category: Display - print vs display in notebooks
# Purpose: print() writes the frame's plain-text representation to stdout
# (compare with the display() cell that follows)
print(coffee)

          Day Coffee Type  Units Sold
0      Monday    Espresso          25
1      Monday       Latte          15
2     Tuesday    Espresso          30
3     Tuesday       Latte          20
4   Wednesday    Espresso          35
5   Wednesday       Latte          25
6    Thursday    Espresso          40
7    Thursday       Latte          30
8      Friday    Espresso          45
9      Friday       Latte          35
10   Saturday    Espresso          45
11   Saturday       Latte          35
12     Sunday    Espresso          45
13     Sunday       Latte          35


In [163]:
# Category: Display - use display() for rich HTML formatting in notebooks
# Note: display is not imported in this notebook; IPython/Jupyter provides it
# as a built-in, so this cell is expected to run inside a notebook kernel.
display(coffee)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


*Category: Selection examples (loc/iloc)*

In [164]:
# Category: Selection - .loc with single label returns a Series (row)
# The Series is named after the row label and indexed by the column names.
coffee.loc[0]  # Row only

Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

In [165]:
# Category: Selection - .loc with a list of row labels returns a DataFrame
# .loc matches index labels, not positions; with the default integer index the
# labels 0, 1 and 5 simply happen to equal the row positions.
coffee.loc[[0,1,5]]  # Rows only

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
5,Wednesday,Latte,25


In [166]:
# Category: Selection - slice rows and pick specific columns
# .loc slices are label-based and include both endpoints, so rows labelled
# 5 through 9 are returned (unlike Python's half-open positional slices).
coffee.loc[5:9, ["Day", "Units Sold"]]

Unnamed: 0,Day,Units Sold
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45
9,Friday,35


In [167]:
# Category: Selection - iloc uses integer positional indexing
# Selects every row and the columns at positions 0 and 2
# ('Day' and 'Units Sold' in this frame).
coffee.iloc[:, [0,2]]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20
4,Wednesday,35
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45
9,Friday,35


*Category: Indexing - set index correctly*

In [168]:
# Category: Indexing - use the 'Day' column as the row index
# Purpose: label rows by day so that .loc can select and slice by day name.
# set_index(..., drop=False) returns a new frame and keeps 'Day' as a regular
# column too, so re-running this cell on its own result is harmless.
# Note: 'Day' now names both the index and a column; use
# coffee.set_index("Day") instead when the duplicate column is not wanted.
coffee = coffee.set_index("Day", drop=False)
coffee.head()

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35


In [169]:
# Category: Selection - label slice on the 'Day' index
# Returns every row from the first "Monday" through the last "Wednesday";
# label slices include both endpoints.
coffee.loc["Monday":"Wednesday"]


Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35
Wednesday,Wednesday,Latte,25


In [170]:
# Category: I/O - reload coffee.csv
# Purpose: start again from the file contents with the default integer index,
# discarding the 'Day' index set earlier in the notebook.
coffee = pd.read_csv('coffee.csv')

In [171]:
# Category: Assignment - set values through .loc
# Overwrites 'Units Sold' with 10 for row labels 1 through 3 (inclusive).
# This modifies coffee in place, so later cells see the changed values.
coffee.loc[1:3, "Units Sold"] = 10
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,10
3,Tuesday,Latte,10
4,Wednesday,Espresso,35


In [172]:
# Category: Selection - single value by row label and column label
coffee.at[0,"Units Sold"] # label-based scalar accessor

np.int64(25)

In [173]:
# Category: Selection - single value by row position and column position
coffee.iat[3,1] # integer position-based scalar accessor (row 3, column 1)

'Latte'

**COLUMN**

In [174]:
# Category: Column access - attribute style
# Only usable when the column name is a valid Python identifier; a name with a
# space such as "Coffee Type" cannot be reached this way.
coffee.Day

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [175]:
# Category: Column access - bracket style
# Works for any column name, including ones with spaces such as "Coffee Type".
coffee["Day"] # Safer and preferred in production code.

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

In [176]:
# Category: Sorting - order rows by one column, largest 'Units Sold' first
# sort_values is available on both DataFrame and Series.
coffee.sort_values(by="Units Sold", ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
10,Saturday,Espresso,45
8,Friday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
11,Saturday,Latte,35
13,Sunday,Latte,35
9,Friday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25


In [177]:
# Category: Sorting - multiple keys with a different direction per key
# `ascending` takes one boolean per sort column, in the same order as `by`:
# 'Units Sold' descending (False), then 'Coffee Type' ascending (True) to
# break ties between rows with equal units.
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[False, True])

Unnamed: 0,Day,Coffee Type,Units Sold
8,Friday,Espresso,45
10,Saturday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25
