# Intro to dataframes

In [None]:
import pandas as pd

# Build a small 3x3 demo dataframe column by column; rows are labelled x, y, z.
df = pd.DataFrame(
    {
        "A": [1, 4, 7],
        "B": [2, 5, 8],
        "C": [3, 6, 9],
    },
    index=["x", "y", "z"],
)

In [None]:
df # a bare name as the last line of a cell displays the whole dataframe

In [None]:
# head() returns the first n rows; n defaults to 5 when omitted.
df.head(n=2)

In [None]:
# tail() mirrors head(), but counts the n rows from the bottom.
df.tail(n=1)

In [None]:
df.columns # returns the column labels as an Index object

In [None]:
df.index # returns the row labels (the index)

In [None]:
# tolist() converts the column labels and the row labels into plain Python lists.
# A notebook cell only displays its last expression, so both lists are returned
# together as a tuple instead of silently discarding the first one.
df.columns.tolist(), df.index.tolist()

In [None]:
df.info() # prints a concise summary: index, columns, non-null counts, dtypes and memory usage

In [None]:
df.describe() # returns summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
df.nunique() # returns the number of distinct values in each column

In [None]:
df.shape # returns the dimensions as a tuple: (number of rows, number of columns)

In [None]:
df.size # returns the total number of elements (rows * columns)

---

# Loading Dataframes from Files

In [None]:
# read_csv accepts a URL as well as a local path; this downloads the warm-up coffee dataset.
coffee = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/warmup-data/coffee.csv'
)

In [None]:
coffee.head() # quick look at the first 5 rows of the loaded data

In [None]:
# Reading .xlsx files relies on the openpyxl package being installed.
olympics_data = pd.read_excel(
    io='../data/test/olympics-data.xlsx'
)

In [None]:
# Reading parquet files relies on a parquet engine such as pyarrow being installed.
results = pd.read_parquet(
    path='../data/test/results.parquet'
)

In [None]:
# Download the athlete biographies dataset used in the filtering section.
bios = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/refs/heads/master/data/bios.csv'
)

In [None]:
# Write the data that was loaded from CSV back out as an Excel workbook.
bios.to_excel(excel_writer='../data/test/bios.xlsx')

In [None]:
# Write the data that was loaded from CSV back out as a parquet file.
bios.to_parquet(path='../data/test/bios.parquet')

---

# Accessing Data with Pandas

In [None]:
# sample() returns n randomly selected rows.
# Fixing random_state makes the selection reproducible, so it does not change on every run.
coffee.sample(n=10, random_state=1)

In [None]:
# coffee.loc[rows, cols] selects by label and returns the specified rows and columns.
# Other valid forms: coffee.loc[[1, 2, 3]], coffee.loc[0:3], coffee.loc[[1, 2, 3], ["Day", "Units Sold"]]
coffee.loc[0:3, ["Day", "Units Sold"]]

In [None]:
# iloc selects rows and columns by integer position instead of by label.
# Notice that the upper bound of a slice is excluded in iloc, but included in loc.
coffee.iloc[0:3, [0, 2]]

In [None]:
# Replace the default numeric row labels with the values of the "Day" column.
# Attribute access (coffee.Day) selects the same column.
coffee.index = coffee["Day"]

In [None]:
coffee.head() # the row labels (coffee.index) are now the day names instead of numbers

In [None]:
# With days as the index, rows can be sliced by day name; label slices include both endpoints.
coffee.loc["Monday": "Wednesday", "Units Sold"]

In [None]:
coffee.iloc[0:2] # iloc is position-based, so it keeps working the same way after the index change

In [None]:
# Assigning through iloc overwrites the cell at row position 1, column position 2.
# A slice such as [1:3, 2] would change several values at once.
coffee.iloc[1, 2] = 10

In [None]:
coffee.head() # the cell at row position 1 (a Monday row) and column position 2 (Units Sold) is now 10

In [None]:
# .at looks up a value by row label and column label.
# NOTE(review): the index was set from the Day column, which presumably repeats
# each day; with duplicate labels .at returns every match, not a single scalar — confirm.
coffee.at["Monday", "Units Sold"]

In [None]:
coffee.iat[0,0] # gets a single value by integer position (row 0, column 0)

In [None]:
# Sort by "Units Sold" first; rows with equal units are then ordered by "Coffee Type".
# `ascending` is optional and takes one boolean per sort key, in the same order
# as the keys (True = ascending, False = descending).
coffee.sort_values(by=["Units Sold", "Coffee Type"], ascending=[True, True])

In [None]:
# iterrows() yields (index label, row) pairs. Row-by-row iteration costs
# performance, so use it only when necessary and prefer pandas built-in
# functions otherwise.
for index, row in coffee.iterrows():
    # One print call: each argument lands on its own line, and the trailing
    # "\n" argument leaves blank lines between consecutive rows.
    print(index, row, "\n", sep="\n")

---

# Filtering Data

In [None]:
bios.head() # first 5 rows of the bios data

In [None]:
bios.tail() # last 5 rows of the bios data

In [None]:
bios.info() # column names, non-null counts and dtypes — useful to spot columns with missing values

In [None]:
# loc accepts a boolean mask for the rows: keep athletes taller than 215 cm,
# showing only their name and height.
bios.loc[bios['height_cm'].gt(215), ['name', 'height_cm']]

In [None]:
# Same filter written with plain brackets: first filter the rows with a boolean mask,
# then pick the columns.
bios[bios['height_cm'] > 215][['name', 'height_cm']]

In [None]:
# Several conditions can be combined with & (and); each condition must be wrapped
# in parentheses. Here: taller than 215 cm AND born in the USA.
bios[(bios['height_cm'] > 215) & (bios['born_country'] == 'USA')]

In [None]:
# Keep rows whose name contains "keith"; case=False ignores capitalization.
# na=False counts missing names as non-matches, so the mask is strictly boolean
# and can be used for indexing even if the column contains NaN.
bios[bios['name'].str.contains('keith', case=False, na=False)]

In [None]:
# The pattern is treated as a regular expression by default, so 'keith|patrick'
# matches names containing either word. Pass regex=False to match the text literally.
# na=False counts missing names as non-matches, so the mask is strictly boolean.
bios[bios['name'].str.contains('keith|patrick', case=False, na=False)]

---

### Regular expressions to filter data

In [None]:
# Find names starting with 'a' and ending with 'a': ^ anchors the start, $ anchors the end,
# and .* matches anything in between. case=False ignores capitalization and
# na=False treats missing names as non-matches.
bios[bios['name'].str.contains(r'^a.*a$', case=False, regex=True, na=False)]

---

In [None]:
# Rows whose born_country is one of USA, FRA or GBR and whose name starts with 'Jessie'.
# na=False makes startswith report False for a missing name, so the combined mask
# stays strictly boolean.
bios[bios['born_country'].isin(['USA', 'FRA', 'GBR']) & bios['name'].str.startswith('Jessie', na=False)]

In [None]:
# query() filters rows with a string expression that refers to columns by name;
# string values inside the expression need their own quotes.
bios.query('born_country == "USA" and born_city == "Seattle"')