# Part 2: Full Data Workflow A-Z

## Pandas GroupBy Operations

### Understanding GroupBy objects

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic.head()

In [None]:
titanic.tail()

In [None]:
titanic.info()

In [None]:
titanic_slice = titanic.iloc[:10, [2,3]]

In [None]:
titanic_slice

In [None]:
titanic_slice.groupby("sex")

In [None]:
gbo = titanic_slice.groupby("sex")

In [None]:
type(gbo)

In [None]:
gbo.groups

In [None]:
l = list(gbo)

In [None]:
l

In [None]:
len(l)

In [None]:
l[0]

In [None]:
type(l[0])

In [None]:
l[0][0]

In [None]:
l[0][1]

In [None]:
type(l[0][1])

In [None]:
l[1]

In [None]:
titanic_slice.loc[titanic_slice.sex == "female"]

In [None]:
titanic_slice_f = titanic_slice.loc[titanic_slice.sex == "female"]
titanic_slice_f

In [None]:
titanic_slice_m = titanic_slice.loc[titanic_slice.sex == "male"]
titanic_slice_m

In [None]:
titanic_slice_f.equals(l[0][1])

In [None]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs; show only the frames.
for _key, frame in gbo:
    print(frame)

### Splitting with many Keys

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head()

In [None]:
summer.info()

In [None]:
summer.Country.nunique()

In [None]:
split1 = summer.groupby("Country")

In [None]:
l = list(split1)
l

In [None]:
len(l)

In [None]:
l[100][1]

In [None]:
split2 = summer.groupby(by = ["Country", "Gender"])

In [None]:
l2 = list(split2)
l2

In [None]:
len(l2)

In [None]:
l2[104]

In [None]:
l2[104][0]

In [None]:
l2[104][1]

### split-apply-combine explained

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic_slice = titanic.iloc[:10, [2,3]]

In [None]:
titanic_slice

In [None]:
list(titanic_slice.groupby("sex"))[0][1]

In [None]:
list(titanic_slice.groupby("sex"))[1][1]

In [None]:
titanic_slice.groupby("sex").mean()

In [None]:
titanic.groupby("sex").survived.sum()

In [None]:
titanic.groupby("sex")[["fare", "age"]].max()

In [None]:
# Average only the numeric columns per sex. Since pandas 2.0 the groupby
# reductions no longer silently drop non-numeric columns and raise TypeError on
# them instead, so the restriction has to be spelled out.
new_df = titanic.groupby("sex").mean(numeric_only=True)

In [None]:
new_df

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("seaborn")

In [None]:
new_df.plot(kind = "bar", subplots = True, figsize = (8,15), fontsize = 13)
plt.show()

### split-apply-combine applied

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head()

In [None]:
summer.info()

In [None]:
medals_per_country = summer.groupby("Country").Medal.count().nlargest(n = 20)
medals_per_country

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("seaborn")

In [None]:
medals_per_country.plot(kind = "bar", figsize = (14, 8), fontsize = 14)
plt.xlabel("Country", fontsize = 13)
plt.ylabel("No. of Medals", fontsize = 13)
plt.title("Summer Olympic Games (Total Medals per Country)", fontsize = 16)
plt.show()

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
titanic.describe()

In [None]:
titanic.fare.mean()

In [None]:
titanic.groupby("pclass").fare.mean()

In [None]:
titanic.survived.sum()

In [None]:
titanic.survived.mean()

In [None]:
titanic.groupby("sex").survived.mean()

In [None]:
titanic.groupby("pclass").survived.mean()

In [None]:
titanic["ad_chi"] = "adult"

In [None]:
titanic.loc[titanic.age < 18, "ad_chi"] = "child"

In [None]:
titanic.head(20)

In [None]:
titanic.ad_chi.value_counts()

In [None]:
titanic.groupby("ad_chi").survived.mean()

In [None]:
titanic.groupby(["sex", "ad_chi"]).survived.count()

In [None]:
titanic.groupby(["sex", "ad_chi"]).survived.mean().sort_values(ascending = False)

In [None]:
w_and_c_first = titanic.groupby(["sex", "ad_chi"]).survived.mean().sort_values(ascending = False)

In [None]:
w_and_c_first.plot(kind = "bar", figsize = (14,8), fontsize = 14)
plt.xlabel("Groups", fontsize = 13)
plt.ylabel("Survival Rate", fontsize = 13)
plt.title("Titanic Survival Rate by Sex/Age-Groups", fontsize = 16)
plt.show()

### Advanced Aggregation with agg()

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv", usecols = ["survived", "pclass", "sex", "age", "fare"])

In [None]:
titanic.head()

In [None]:
titanic.groupby("sex").mean()

In [None]:
titanic.groupby("sex").sum()

In [None]:
titanic.groupby("sex").agg(["mean", "sum", "min", "max"])

In [None]:
# Per-column aggregation spec: each column maps to one function name or a list of them.
titanic.groupby("sex").agg(
    {
        "survived": ["sum", "mean"],
        "pclass": "mean",
        "age": ["mean", "median"],
        "fare": "max",
    }
)

### GroupBy Aggregation with Relabeling (new in Version 0.25)

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv", usecols = ["survived", "pclass", "sex", "age", "fare"])

In [None]:
titanic.head()

In [None]:
titanic.groupby("sex").survived.mean()

In [None]:
titanic.groupby("sex").agg(survival_rate = ("survived", "mean"))

In [None]:
titanic.groupby("sex").agg({"survived": ["sum", "mean"], "age": ["mean"]})

In [None]:
# Named aggregation: each keyword is an output column built from (source column, function).
titanic.groupby("sex").agg(
    survived_total=("survived", "sum"),
    survival_rate=("survived", "mean"),
    mean_age=("age", "mean"),
)

### Transformation with transform()

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic.head()

In [None]:
titanic.groupby(["sex", "pclass"]).survived.transform("mean")

In [None]:
titanic["group_surv_rate"] = titanic.groupby(["sex", "pclass"]).survived.transform("mean")

In [None]:
titanic.head()

In [None]:
titanic["outliers"] = abs(titanic.survived-titanic.group_surv_rate)

In [None]:
titanic[titanic.outliers > 0.85]

### Replacing NA Values by group-specific Values

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic.head(20)

In [None]:
titanic.info()

In [None]:
mean_age = titanic.age.mean()
mean_age

In [None]:
titanic.age.fillna(mean_age)

In [None]:
titanic.groupby(["sex", "pclass"]).age.mean()

In [None]:
titanic["group_mean_age"] = titanic.groupby(["sex", "pclass"]).age.transform("mean")

In [None]:
titanic.head(20)

In [None]:
# Fill missing ages with the mean age of the passenger's (sex, pclass) group.
# Assign the result back rather than calling fillna(..., inplace=True) on the
# attribute-accessed Series: that chained in-place update is deprecated and is
# not guaranteed to write through to `titanic` (it does not under Copy-on-Write).
titanic["age"] = titanic.age.fillna(titanic.group_mean_age)

In [None]:
titanic.head(20)

In [None]:
titanic.info()

### Generalizing split-apply-combine with apply()

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv", usecols = ["survived", "pclass", "sex", "age", "fare"])

In [None]:
titanic.head()

In [None]:
titanic.groupby("sex").mean()

In [None]:
female_group = list(titanic.groupby("sex"))[0][1]
female_group

In [None]:
# Restrict the reduction to numeric columns: the group still carries the string
# "sex" column, which DataFrame.mean() refuses to average on pandas >= 2.0.
female_group.mean(numeric_only=True).astype("float")

In [None]:
def group_mean(group):
    """Return the column-wise mean of the numeric columns of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        One sub-frame, e.g. as handed over by ``DataFrame.groupby(...).apply``.

    Returns
    -------
    pandas.Series
        Mean per numeric column, indexed by column name.
    """
    # numeric_only=True skips string columns (such as the grouping key) instead
    # of raising TypeError, which is what a bare mean() does on pandas >= 2.0.
    return group.mean(numeric_only=True)

In [None]:
group_mean(female_group)

In [None]:
titanic.groupby("sex").apply(group_mean)

In [None]:
titanic.nlargest(5, "age")

In [None]:
def five_oldest_surv(group):
    """Return the (up to) five rows of ``group`` with the largest ``age`` among
    the rows whose ``survived`` value equals 1."""
    survivors = group.loc[group["survived"] == 1]
    return survivors.nlargest(n=5, columns="age")

In [None]:
titanic.groupby("sex").apply(five_oldest_surv)

### Hierarchical Indexing (MultiIndex) with Groupby

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv("titanic.csv", usecols = ["survived", "pclass", "sex", "age", "fare"])

In [None]:
titanic

In [None]:
summary = titanic.groupby(["sex", "pclass"]).mean()

In [None]:
summary

In [None]:
summary.index

In [None]:
summary.loc[("female", 2), :]

In [None]:
summary.loc[("female", 2), "age"]

In [None]:
summary.swaplevel().sort_index()

In [None]:
summary.reset_index()

### stack() and unstack()

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head()

In [None]:
medals_by_country = summer.groupby(["Country", "Medal"]).Medal.count()

In [None]:
medals_by_country

In [None]:
medals_by_country.loc[("USA", "Gold")]

In [None]:
medals_by_country.shape

In [None]:
medals_by_country.unstack(level = -1)

In [None]:
medals_by_country = medals_by_country.unstack(level = -1, fill_value= 0)

In [None]:
medals_by_country.head()

In [None]:
medals_by_country.shape

In [None]:
medals_by_country = medals_by_country[["Gold", "Silver", "Bronze"]]

In [None]:
# Rank countries by gold, then silver, then bronze (all descending).
# Rebind the sorted result instead of sorting in place: the frame was produced
# by a column selection, and mutating such a derived frame in place triggers
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
medals_by_country = medals_by_country.sort_values(by = ["Gold", "Silver", "Bronze"], ascending = [False, False, False])

In [None]:
medals_by_country.head(10)

In [None]:
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8" and the
# bare "seaborn" alias was later removed; prefer the new name when it exists
# and fall back to the old one on older installations.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
medals_by_country.head(10).plot(kind = "bar", figsize = (12,8), fontsize = 13)
plt.xlabel("Country", fontsize = 13)
plt.ylabel("Medals", fontsize = 13)
plt.title("Medals per Country", fontsize = 16)
plt.legend(fontsize = 15)
plt.show()

In [None]:
medals_by_country.stack().unstack()