In [4]:
import pandas as pd

In [7]:
df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], columns=['A', 'B', 'C', 'D', 'E'], index=['row1', 'row2'])

In [None]:
df

In [None]:
df.describe()

#### loading dataframes from files

In [19]:
coffee = pd.read_csv('./warmup-data/coffee.csv')

In [None]:
coffee

In [21]:
results = pd.read_parquet('./data/results.parquet')

In [None]:
results

In [26]:
olympics_data = pd.read_excel('./data/olympics-data.xlsx',sheet_name='results')

In [None]:
olympics_data

In [68]:
coffee = pd.read_csv('./warmup-data/coffee.csv')
results = pd.read_parquet('./data/results.parquet')
bios = pd.read_csv('./data/bios.csv')

#### Importing data with pandas

In [None]:
coffee.head() # to access the first 5 rows
coffee.tail() # to access the last 5 rows
coffee.sample(5) # to access 5 random rows

In [None]:
# coffee.loc[rows, columns] # to access specific rows and columns
coffee.loc[5:8, ['Day', 'Coffee Type']] # to access specific rows and

In [None]:
# coffee.iloc[5:8, [0, 1]] # to access specific rows and columns by index
coffee.iloc[5:8, [0, 1]] # to access specific rows and columns by index
# iloc and loc is almost the same,
# the only difference is that loc uses labels and iloc uses index positions

In [60]:
coffee.index = coffee['Day'] # to set the index of the DataFrame

In [None]:
coffee.loc['Monday':'Wednesday', 'Units Sold'] # to access specific rows and columns by index labels

### Accessing data with pandas

In [67]:
coffee.loc[1:3, 'Units Sold'] = 10 # to set specific values in the DataFrame

In [None]:
coffee.head()

In [None]:
coffee.iat[0,0] # to access a specific cell by index position
coffee.at[0, 'Coffee Type'] # to access a specific cell by index label
# difference between iat and at is that iat uses index positions and at uses index labels

In [None]:
coffee.Day

In [None]:
coffee.sort_values(['Units Sold', 'Coffee Type'], ascending=[False, True]) # to sort the DataFrame by multiple columns

### Filtering data

In [None]:
bios.tail()

In [None]:
bios[bios['height_cm']>215][['name', 'height_cm']] # to filter rows based on a condition

In [None]:
# we can also set multiple conditions using & and | operators
bios[(bios['height_cm'] > 215) & (bios['weight_kg'] > 100)][['name', 'height_cm', 'weight_kg']]

In [None]:
bios[bios['name'].str.contains('Hamza|patrick')][['name', 'height_cm', 'weight_kg']]
# we can also use str.contains() to filter rows based on a string condition

In [None]:
# it is case-sensitive by default, but we can set the case parameter to False to make it case-insensitive
bios[bios['name'].str.contains('Hamza|patrick', case=False)][['name', 'height_cm', 'weight_kg']]

In [None]:
bios[(bios['born_country'].isin(['USA', 'FRA', 'GBR'])) & (bios['name'].str.startswith('Keith'))][['name', 'born_country']]

In [None]:
bios.query('born_country == "PAK"')[['name', 'born_country']] # to filter rows using query method

### Adding or removing coloums

In [41]:
coffee['Price'] = 4.99 # to add a new column with a constant value

In [None]:
coffee.head()

In [38]:
import numpy as np

In [42]:
coffee['new_price'] = np.where(coffee['Coffee Type']== 'Espresso', 5.99, 4.99) # to add a new column with a condition

In [44]:
coffee.drop(columns=['Price'], inplace=True) # to remove a column from the DataFrame

In [46]:
coffee['revenue'] = coffee['Units Sold'] * coffee['new_price'] # to add a new column with a calculation

In [None]:
coffee.head()

In [50]:
coffee = coffee.rename(columns={'new_price': 'Price', 'revenue': 'Revenue'}) # to rename columns in the DataFrame

In [53]:
bios_new = bios.copy() # to create a copy of the DataFrame

In [None]:
bios_new['first_name'] = bios_new['name'].str.split(' ').str[0] # to add a new column with the first name

In [None]:
bios_new.query('first_name == "Hamza"')[['name', 'first_name']] # to filter rows based on the new column

In [59]:
bios_new['born_datetime'] = pd.to_datetime(bios_new['born_date'])
bios_new['born_year'] = bios_new['born_datetime'].dt.year # to extract the year from the datetime column

In [None]:
bios_new[['name','born_year']]

In [61]:
bios_new.to_csv('./data/bios_new.csv', index=False) # to save the DataFrame to a CSV file

In [71]:
bios['height_category'] = bios['height_cm'].apply(lambda x: 'Tall' if x > 170 else 'Short') # to add a new column with a condition using apply
bios[['name', 'height_cm', 'height_category']].head() # to access the new column

Unnamed: 0,name,height_cm,height_category
0,Jean-François Blanchy,,Short
1,Arnaud Boetsch,183.0,Tall
2,Jean Borotra,183.0,Tall
3,Jacques Brugnon,168.0,Short
4,Albert Canet,,Short


In [69]:
def categorize_athelete(row):
    if row['weight_kg'] > 100:
        return 'Heavyweight'
    elif row['weight_kg'] < 50:
        return 'Lightweight'
    else:
        return 'Middleweight'
    
bios['Category'] = bios.apply(categorize_athelete, axis=1) # to add a new column with a function using apply

### Merging and concatinating data