# Part 2: Full Data Workflow A-Z

## Cleaning Data

### First Inspection / Handling inconsistent Data 

In [None]:
import pandas as pd

#### Titanic Dataset

In [None]:
# Load the raw Titanic data; the path is relative to the working directory.
titanic = pd.read_csv("titanic_imp.csv")

In [None]:
# First five rows.
titanic.head()

In [None]:
# Last five rows.
titanic.tail()

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Summary statistics (numeric columns by default).
titanic.describe()

In [None]:
# Summary (count / unique / top / freq) restricted to object-dtype columns.
titanic.describe(include ="O")

In [None]:
# Distinct labels present in Survived.
titanic.Survived.unique()

In [None]:
# Frequency of each Survived label.
titanic.Survived.value_counts()

In [None]:
titanic.Survived.replace(to_replace= ["yes", "no"], value = [1, 0], inplace = True)

In [None]:
titanic.Survived.value_counts()

#### Olympic Dataset

In [None]:
# Load the raw Olympic data; the path is relative to the working directory.
summer = pd.read_csv("summer_imp.csv")

In [None]:
# First five rows.
summer.head()

In [None]:
# Last five rows.
summer.tail()

In [None]:
# Column dtypes and non-null counts.
summer.info()

In [None]:
# Left commented out: the column is only renamed to "Athlete_Name" in the
# next cell, so this attribute access is not available yet.
#summer.Athlete_Name

In [None]:
# Rename "Athlete Name" to a valid identifier so it supports attribute access.
summer.rename(columns = {"Athlete Name": "Athlete_Name"}, inplace = True)

In [None]:
# First twenty rows, now with the renamed column.
summer.head(20)

In [None]:
# Frequency of each Medal label.
summer.Medal.value_counts()

In [None]:
summer.Medal.replace(to_replace= "Gold Medal", value = "Gold", inplace = True)

In [None]:
summer.describe(include = "O")

### String Operations

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Left commented out; presumably this conversion fails while Fare still
# carries the "$" that the next cell strips — confirm against the data.
#pd.to_numeric(titanic.Fare)

In [None]:
titanic.Fare = titanic.Fare.str.replace("$", "")

In [None]:
titanic.Fare.head()

#### Olympic Dataset

In [None]:
# First twenty rows.
summer.head(20)

In [None]:
# Column dtypes and non-null counts.
summer.info()

In [None]:
# Title-case every athlete name.
summer.Athlete_Name = summer.Athlete_Name.str.title()

In [None]:
# First ten rows after the case change.
summer.head(10)

In [None]:
# Rows whose athlete name contains the substring "Hajos".
summer.loc[summer.Athlete_Name.str.contains("Hajos")]

In [None]:
# Raw value at row position 0, column position 4 — presumably the
# Athlete_Name cell, shown to expose surrounding whitespace; confirm column order.
summer.iloc[0, 4]

In [None]:
# Remove leading and trailing whitespace from every athlete name.
summer.Athlete_Name = summer.Athlete_Name.str.strip()

In [None]:
# Exact-match lookup on the stripped names.
summer.loc[summer.Athlete_Name == "Hajos, Alfred"]

In [None]:
# Exact-match lookup for a second athlete.
summer.loc[summer.Athlete_Name == "Phelps, Michael"]

### Changing DataType with astype() / pd.to_numeric

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Column dtypes and non-null counts before conversion.
titanic.info()

In [None]:
# Preview only: returns a numeric Series without modifying titanic.
pd.to_numeric(titanic.Fare)

In [None]:
# Preview only: the same conversion via astype.
titanic.Fare.astype("float")

In [None]:
# Persist the float conversion of Fare.
titanic["Fare"] = titanic.Fare.astype("float")

In [None]:
# Persist the integer conversion of Survived.
titanic["Survived"] = titanic.Survived.astype("int")

In [None]:
# Left commented out; Age is cast to float later in this file, after its
# "Missing Data" placeholder has been replaced with NaN.
#titanic["Age"] = titanic.Age.astype("float")

In [None]:
# Column dtypes after conversion.
titanic.info()

In [None]:
# First five rows after conversion.
titanic.head()

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Column dtypes and non-null counts.
summer.info()

### Intro to NA Values

In [None]:
import numpy as np

In [None]:
# Load the sales table, using the first CSV column as the row index.
sales = pd.read_csv("sales.csv", index_col = 0)

In [None]:
# Display the full table.
sales

In [None]:
# Column dtypes and non-null counts.
sales.info()

In [None]:
# Label-based lookup of a single cell (row "Steven", column "Thu").
sales.loc["Steven", "Thu"]

In [None]:
# Blank out the cell at row position 1, column position 1 using None.
sales.iloc[1,1] = None

In [None]:
# Display the table to see how the None assignment is represented.
sales

In [None]:
# Blank out the cell at row position 2, column position 2 using np.nan.
sales.iloc[2,2] = np.nan

In [None]:
# Display the table again to compare both missing cells.
sales

In [None]:
# Non-null counts now reflect the two blanked cells.
sales.info()

#### Titanic Dataset

In [None]:
# First ten rows.
titanic.head(10)

In [None]:
# Last ten rows.
titanic.tail(10)

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Boolean mask: True where a value is missing.
titanic.isna()

In [None]:
# Number of missing values per column.
titanic.isna().sum(axis = 0)

In [None]:
# Per row: True if at least one value is missing.
titanic.isna().any(axis = 1)

In [None]:
# Only the rows that contain at least one missing value.
titanic[titanic.isna().any(axis = 1)]

In [None]:
# Boolean mask: True where a value is present.
titanic.notna()

In [None]:
# Number of present values per row.
titanic.notna().sum(axis = 1)

In [None]:
# Per column: True if no value is missing.
titanic.notna().all(axis = 0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the present/missing mask to see where gaps sit.
plt.figure(figsize = (12,8))
sns.heatmap(titanic.notna())
plt.show()

In [None]:
# Frequency of each Age value, with NaN counted as its own entry.
titanic.Age.value_counts(dropna = False)

In [None]:
titanic.Age.replace(to_replace= "Missing Data", value = np.nan, inplace= True)

In [None]:
titanic.info()

In [None]:
titanic.Age = titanic.Age.astype("float")

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Column dtypes and non-null counts.
summer.info()

In [None]:
# Only the rows that contain at least one missing value.
summer[summer.isna().any(axis = 1)]

### Removing Missing Values with dropna()

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Rows where Emb is missing.
titanic[titanic.Emb.isna()]

In [None]:
# Frequency of each Age value, with NaN counted as its own entry.
titanic.Age.value_counts(dropna = False)

In [None]:
# Mean age, ignoring missing values.
titanic.Age.mean(skipna = True)

In [None]:
# (rows, columns) before dropping anything.
titanic.shape

In [None]:
# Shape after dropping every row with at least one missing value (defaults).
titanic.dropna().shape

In [None]:
# Same as the defaults, spelled out: drop rows with any missing value.
titanic.dropna(axis = 0, how = "any").shape

In [None]:
# Drop columns with any missing value.
titanic.dropna(axis = 1, how = "any").shape

In [None]:
# Drop only rows in which every value is missing.
titanic.dropna(axis = 0, how = "all").shape

In [None]:
# Drop only columns in which every value is missing.
titanic.dropna(axis = 1, how = "all").shape

In [None]:
# Keep only rows with at least 8 non-missing values.
titanic.dropna(axis = 0, thresh = 8).shape

In [None]:
# Keep only columns with at least 500 non-missing values (preview).
titanic.dropna(axis = 1, thresh = 500).shape

In [None]:
# Apply the column threshold to titanic itself.
titanic.dropna(axis = 1, thresh = 500, inplace = True)

In [None]:
# First five rows after the column drop.
titanic.head()

In [None]:
# (rows, columns) after the column drop.
titanic.shape

In [None]:
# Preview: drop rows missing any of the four listed columns.
titanic.dropna(axis = 0, subset = ["Survived", "Class", "Gender", "Age"], how = "any").shape

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Column dtypes and non-null counts.
summer.info()

In [None]:
# Only the rows that contain at least one missing value.
summer[summer.isna().any(axis = 1)]

In [None]:
# Remove every row with a missing value from summer itself.
summer.dropna(inplace = True)

In [None]:
# Non-null counts after the rows were removed.
summer.info()

### Replacing Missing Values with fillna()

#### Titanic Dataset

In [None]:
# First ten rows.
titanic.head(10)

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Mean age (missing values are skipped by default).
titanic.Age.mean()

In [None]:
# Mean age rounded to one decimal; used as the fill value in the next cell.
mean = round(titanic.Age.mean(),1)
mean

In [None]:
titanic.Age.fillna(mean, inplace = True)

In [None]:
titanic.head(6)

In [None]:
titanic.info()

### Detection of Duplicates

In [None]:
# Small one-column demo frame containing repeated letters.
alphabet = pd.Series(
    ["a", "b", "c", "c", "d", "e", "f", "g", "g", "g"],
    name="Alphabet",
).to_frame()

In [None]:
# Display the demo frame.
alphabet

In [None]:
# Flag each row that repeats an earlier row; the first occurrence stays False.
alphabet.duplicated(keep = "first")

In [None]:
# Only the rows flagged as repeats.
alphabet[alphabet.duplicated(keep = "first")]

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Column dtypes and non-null counts.
titanic.info()

In [None]:
# Count rows whose (Survived, Class) pair already appeared in an earlier row.
titanic.duplicated(keep = "first", subset = ["Survived", "Class"]).sum()

In [None]:
# Every row that has an identical twin across all columns (all copies shown).
titanic[titanic.duplicated(keep = False)]

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Column dtypes and non-null counts.
summer.info()

In [None]:
# Number of rows that fully repeat an earlier row.
summer.duplicated(keep = "first").sum()

In [None]:
# Every row that has an identical twin (all copies shown).
summer[summer.duplicated(keep = False)]

In [None]:
# All Basketball rows from 2012.
summer.loc[(summer.Sport == "Basketball") & (summer.Year == 2012)]

### Handling / Removing Duplicates

#### Titanic Dataset

In [None]:
# Last five rows.
titanic.tail()

In [None]:
# Number of rows that fully repeat an earlier row.
titanic.duplicated().sum()

In [None]:
# The repeated rows themselves (first occurrences excluded).
titanic[titanic.duplicated()]

In [None]:
# Drop the rows labelled 891, 892 and 893 from titanic itself. The labels are
# hard-coded — presumably chosen from the listing above; re-verify if the
# input file changes.
titanic.drop(index = [891, 892, 893], inplace = True)

In [None]:
# First five rows after the drop.
titanic.head()

In [None]:
# Last five rows after the drop.
titanic.tail()

In [None]:
# Column dtypes and non-null counts after the drop.
titanic.info()

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Every row that has an identical twin (all copies shown).
summer[summer.duplicated(keep = False)]

In [None]:
# Drop five rows by hard-coded label from summer itself — presumably picked
# from the listing above; re-verify if the input file changes.
summer.drop(index = [2069, 12253, 15596, 21833, 28678], inplace = True)

In [None]:
# Duplicates that remain after the targeted drop.
summer[summer.duplicated(keep = False)]

In [None]:
# Label-based slice (both endpoints inclusive) to inspect a range of rows.
summer.loc[16085:16110]

In [None]:
# A second label-based slice for inspection.
summer.loc[29780:29795]

In [None]:
# All copies of repeated rows in the demo frame.
alphabet[alphabet.duplicated(keep = False)]

In [None]:
# Remove repeated rows from alphabet itself, keeping the first occurrence.
alphabet.drop_duplicates(inplace = True)

In [None]:
# Display the de-duplicated demo frame.
alphabet

### The ignore_index parameter (NEW in Pandas 1.0)

In [None]:
import pandas as pd

In [None]:
# Rebuild the one-column demo frame containing repeated letters.
alphabet = pd.Series(
    ["a", "b", "c", "c", "d", "e", "f", "g", "g", "g"],
    name="Alphabet",
).to_frame()

In [None]:
# Display the demo frame.
alphabet

In [None]:
# Drop repeated rows and renumber the result 0..n-1 (returns a new frame).
alphabet.drop_duplicates(ignore_index= True)

### Detection of Outliers

In [None]:
# First five rows.
titanic.head()

In [None]:
# Summary statistics; min/max hint at implausible values.
titanic.describe()

In [None]:
# Box plot of Age.
plt.figure(figsize = (12,6))
titanic.boxplot("Age")
plt.show()

In [None]:
# Line plot of Age against the row index.
plt.figure(figsize = (12,6))
titanic.Age.plot()
plt.show()

In [None]:
# Ages sorted from largest to smallest.
titanic.Age.sort_values(ascending = False)

In [None]:
# Rows with Age above 90.
titanic.loc[titanic.Age > 90]

In [None]:
# Fares sorted from largest to smallest.
titanic.Fare.sort_values(ascending = False)

In [None]:
# Line plot of Fare against the row index.
plt.figure(figsize = (12,6))
titanic.Fare.plot()
plt.show()

### Handling / Removing Outliers

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Rows with Age above 90.
titanic.loc[titanic.Age > 90]

In [None]:
# Remember the labels of those rows so they can be re-inspected after the fix.
index_outl  = titanic.loc[titanic.Age > 90].index

In [None]:
# Display the stored labels.
index_outl

In [None]:
# Divide Age by 10 for every row with Age above 90 — presumably these are
# treated as misplaced decimal points; confirm against the data source.
titanic.loc[titanic.Age > 90, "Age"] = titanic.loc[titanic.Age > 90, "Age"]/10

In [None]:
# The previously flagged rows, now with corrected ages.
titanic.loc[index_outl]

In [None]:
# Manual correction for the row labelled 217 (hard-coded value; its source is
# not shown in this file).
titanic.loc[217, "Age"] = 42.0

In [None]:
# Line plot of Age after the corrections.
plt.figure(figsize = (12,6))
titanic.Age.plot()
plt.show()

In [None]:
# Column dtypes and non-null counts.
titanic.info()

### Categorical Data

#### Titanic Dataset

In [None]:
# First five rows.
titanic.head()

In [None]:
# Column dtypes, non-null counts and memory usage before conversion.
titanic.info()

In [None]:
# Optional export of the cleaned data; left disabled.
#titanic.to_csv("titanic_clean.csv", index = False)

In [None]:
# Number of distinct values per column; low counts suggest categoricals.
titanic.nunique()

In [None]:
# Summary of the two candidate columns.
titanic[["Gender", "Emb"]].describe()

In [None]:
# Store Gender as a categorical.
titanic.Gender = titanic.Gender.astype("category")

In [None]:
# Store Emb as a categorical.
titanic.Emb = titanic.Emb.astype("category")

In [None]:
# Column dtypes and memory usage after conversion.
titanic.info()

In [None]:
# Dtype of the converted Gender column.
titanic.Gender.dtype

#### Olympic Dataset

In [None]:
# First five rows.
summer.head()

In [None]:
# Column dtypes, non-null counts and memory usage before conversion.
summer.info()

In [None]:
# Optional export of the cleaned data; left disabled.
#summer.to_csv("summer_clean.csv", index = False)

In [None]:
# Summary of the object-dtype columns.
summer.describe(include = ["O"])

In [None]:
# Number of distinct values per column; low counts suggest categoricals.
summer.nunique()

In [None]:
# Convert each of the following columns to a categorical, one per cell.
summer.City = summer.City.astype("category")

In [None]:
summer.Sport = summer.Sport.astype("category")

In [None]:
summer.Discipline = summer.Discipline.astype("category")

In [None]:
summer.Country = summer.Country.astype("category")

In [None]:
summer.Gender = summer.Gender.astype("category")

In [None]:
summer.Medal = summer.Medal.astype("category")

In [None]:
# Column dtypes and memory usage after conversion.
summer.info()

### Pandas Version 1.0: NEW Dtypes and pd.NA 

In [None]:
import pandas as pd

In [None]:
# Reload from "titanic.csv"; this rebinds titanic and discards the cleaned
# frame built in the earlier cells.
titanic = pd.read_csv("titanic.csv")

In [None]:
# Display the freshly loaded frame.
titanic

In [None]:
# Column dtypes as inferred by read_csv.
titanic.info()

In [None]:
# Convert every column to the best available dtype that supports pd.NA.
titanic = titanic.convert_dtypes()

In [None]:
# Display the converted frame.
titanic

In [None]:
# Column dtypes after convert_dtypes.
titanic.info()

In [None]:
# Value in the first row, last column — presumably a missing entry; confirm.
titanic.iloc[0, -1]

In [None]:
# Python type of that value.
type(titanic.iloc[0, -1])

In [None]:
# The pandas missing-value singleton, for comparison.
pd.NA