In [69]:
import pandas as pd
import numpy as np

In [130]:
#  1
data = {
    "Name" : ["Alice", "Bob", "Charlie", "Diana", "James"],
    "Age": [24, 27, 22, 29, 17],
    "Gender" : ["Female", "Male", "Male", "Female", "Male"],
    "City": ["New York", "Los Angeles", "Chicago", "Houston", "Texas"]
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Gender,City
0,Alice,24,Female,New York
1,Bob,27,Male,Los Angeles
2,Charlie,22,Male,Chicago
3,Diana,29,Female,Houston
4,James,17,Male,Texas


In [112]:
# 2 - Filter Rows
age_greater_than_25 = df[df["Age"].gt(25)]
age_greater_than_25

Unnamed: 0,Name,Age,Gender,City
1,Bob,27,Male,Los Angeles
3,Diana,29,Female,Houston


In [113]:
gender_is_female = df[df["Gender"].eq("Female")]
gender_is_female

Unnamed: 0,Name,Age,Gender,City
0,Alice,24,Female,New York
3,Diana,29,Female,Houston


In [114]:
# 3 - Add a New Column
df["Is_Adult"] = df["Age"].apply(lambda age: age >= 18)
df

Unnamed: 0,Name,Age,Gender,City,Is_Adult
0,Alice,24,Female,New York,True
1,Bob,27,Male,Los Angeles,True
2,Charlie,22,Male,Chicago,True
3,Diana,29,Female,Houston,True
4,James,17,Male,Texas,False


In [115]:
# 4 - Sort df by Age
df = df.sort_values(by=["Age"], ascending=False)
df

Unnamed: 0,Name,Age,Gender,City,Is_Adult
3,Diana,29,Female,Houston,True
1,Bob,27,Male,Los Angeles,True
0,Alice,24,Female,New York,True
2,Charlie,22,Male,Chicago,True
4,James,17,Male,Texas,False


In [131]:
# 5 - Group By and Aggregate
grouped_df = df.groupby(by="Gender").agg(average_age = ("Age", "mean"), count = ("Age", "count"))
grouped_df

Unnamed: 0_level_0,average_age,count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,26.5,2
Male,22.0,3


In [125]:
#  6
data = {
    "Name" : ["Alice", "Bob", "Charlie", "Diana", "James"],
    "Age": [24, 27, 22, None, 17],
    "Gender" : ["Female", "Male", "Male", "Female", "Male"],
    "City": ["New York", "Los Angeles", "Chicago", "Houston", "Texas"]
}

new_df = pd.DataFrame(data=data)
new_df

Unnamed: 0,Name,Age,Gender,City
0,Alice,24.0,Female,New York
1,Bob,27.0,Male,Los Angeles
2,Charlie,22.0,Male,Chicago
3,Diana,,Female,Houston
4,James,17.0,Male,Texas


In [133]:
new_df["Age"].fillna({"Age": new_df["Age"].mean()}, inplace=True)
new_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_df["Age"].fillna({"Age": new_df["Age"].mean()}, inplace=True)


Unnamed: 0,Name,Age,Gender,City
0,Alice,24.0,Female,New York
1,Bob,27.0,Male,Los Angeles
2,Charlie,22.0,Male,Chicago
3,Diana,22.5,Female,Houston
4,James,17.0,Male,Texas


In [134]:
new_df.to_csv("output.csv", index=False)

In [139]:
# 8 - Load and Analyze
output_loaded = pd.read_csv("output.csv")
output_loaded

# total_rows
total_rows = output_loaded.shape[0]
print("Total Rows",total_rows)

# Averge Age
average_age = output_loaded["Age"].mean()
print("Average Age",average_age)


Total Rows 5
Average Age 22.5


In [140]:
# 9 - Create a Pivot Table
pivote_table = output_loaded.pivot_table(index="City", columns="Gender", aggfunc="size", fill_value=0)
pivote_table

Gender,Female,Male
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicago,0,1
Houston,1,0
Los Angeles,0,1
New York,1,0
Texas,0,1
