## Create Series of categorical type

In [None]:
import os

import numpy as np
import pandas as pd

# --- Series of category type: plain object Series vs. categorical Series ---
list_of_occupations = [
    'Data Scientist', 'Data Analyst', 'Data Scientist', 'Statistician',
    'Data Scientist', 'Data Scientist', 'Statistician', 'Data Scientist',
    'Data Analyst', 'Data Analyst', 'Data Scientist', 'Data Scientist',
    'Data Scientist',
]

# Baseline: a Series built straight from the list keeps every value as a string.
series1 = pd.Series(list_of_occupations)
print("series1, data type:", series1.dtype)
print("series1, number of bytes:", series1.nbytes)

# "Nominal" category: same values, stored as small integer codes plus a lookup
# table of the distinct labels, so nbytes is lower than for series1.
series2 = pd.Series(list_of_occupations, dtype="category")
print("series2 data type:", series2.dtype)
print("series2 number of bytes:", series2.nbytes)

# "Ordinal" category: the categories carry an explicit ranking
# (Bronze < Silver < Gold) instead of the default unordered set.
medals_won = [
    'Silver', 'Silver', 'Bronze', 'Silver', 'Gold', 'Silver', 'Silver',
    'Gold', 'Gold', 'Bronze', 'Bronze', 'Gold', 'Silver',
]
medals = pd.Categorical(
    medals_won,
    dtype=pd.CategoricalDtype(categories=["Bronze", "Silver", "Gold"], ordered=True),
)
print("categories: ", medals.categories)
print("medals: ", medals)

## Change data type while importing dataset

In [None]:
# Columns to load directly as "category" instead of converting them afterwards.
updated_dtypes = {"who": "category", "class": "category", "sex": "category"}

# Location of the Titanic CSV. Set the TITANIC_CSV environment variable to point
# at your own copy of the dataset; when it is unset, the original author's local
# path is used so the notebook behaves exactly as before on that machine.
titanic_csv_path = os.environ.get(
    "TITANIC_CSV",
    '/home/tk-lpt-648/my-work/Personal/Projects/python/00_datasets/titanic.csv',
)

titanic = pd.read_csv(titanic_csv_path, dtype=updated_dtypes)  # dtypes applied while parsing
print(titanic.dtypes)

## Create Ordinal category using Binning

Binning creates an ordinal category, i.e. `kid < young < elder`.

In [None]:
# Bin edges; np.inf leaves the last bin open-ended so every age >= 40 lands in it.
bins = [0, 16, 40, np.inf]
# One label per interval between consecutive edges.
labels = ['kid', 'young', 'elder']

# right=False makes each interval left-closed: [0, 16), [16, 40), [40, inf).
titanic['age_group'] = pd.cut(
    x=titanic['age'],
    bins=bins,
    labels=labels,
    right=False,
)

# dropna=False also counts the rows that received no bin (NaN), not just the labelled ones.
print(titanic['age_group'].value_counts(dropna=False))

## Add new categories

In [None]:
# Make sure the column is categorical so the .cat accessor is available below.
titanic['age_group'] = titanic['age_group'].astype("category")
print("dtype of age_group: ", titanic['age_group'].dtype)

# Register two extra labels. A value can only be assigned to a categorical
# column once it is one of the column's declared categories.
new_categories = ["unknown", "other"]
titanic["age_group"] = titanic["age_group"].cat.add_categories(new_categories)
print(titanic["age_group"].value_counts(dropna=False))
print('---------------------------------------------')

# Rows with a missing age_group are relabelled as 'unknown'.
missing_age_group = titanic['age_group'].isna()
titanic.loc[missing_age_group, 'age_group'] = 'unknown'
print(titanic["age_group"].value_counts(dropna=False))


## Drop a category

In [None]:
# .unique() lists only the labels that actually occur in the data, whereas
# .cat.categories lists every declared label, including ones with no rows.
print("Unique Categories: ", titanic["age_group"].unique())
print("All available Categories: ", titanic["age_group"].cat.categories)

# "other" was added by mistake, so drop it from the declared categories.
titanic["age_group"] = titanic["age_group"].cat.remove_categories(["other"])
print(titanic["age_group"].value_counts(dropna=False))

## Rename a category

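`.cat.rename_categories` keeps the `category` dtype. Renaming through the `.str` accessor instead (e.g. `.str.upper()`) returns a plain string (`object`) Series, which then has to be cast back to `category`.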

In [None]:
# Old label -> new label; here every label is simply capitalised.
my_changes = {
    label: label.capitalize()
    for label in ("unknown", "young", "elder", "kid")
}

# Relabel the categories; the column stays categorical.
titanic["age_group"] = titanic["age_group"].cat.rename_categories(my_changes)

# Notes on alternatives:
#   * rename_categories also accepts a callable, e.g. `lambda c: c.capitalize()`
#     produces the same labels as the dictionary above (`c.upper()` would give
#     "UNKNOWN", "YOUNG", ... instead).
#   * `titanic["age_group"].str.upper()` also changes the text, but it returns a
#     non-categorical Series, so the dtype would have to be cast back afterwards.

print(titanic["age_group"].cat.categories)  # the renamed labels

### Rename a category using map

In [None]:
# Alternative way to rename: Series.map with an old -> new dictionary.
# The mapping is one-to-one, so the result is still categorical and the
# .cat accessor keeps working on the next line.
mapping = dict(
    Unknown="unknown",
    Young="young",
    Elder="elder",
    Kid="kid",
)
titanic["age_group"] = titanic["age_group"].map(mapping)
print(titanic["age_group"].cat.categories)

## Reorder Categories

Make `Ordinal` categories

In [None]:
# Put the categories in their logical order and mark the column as ordinal
# (unknown < kid < young < elder), overwriting the original Series.
titanic["age_group"] = titanic["age_group"].cat.reorder_categories(
  new_categories=["unknown", "kid", "young", "elder"],
  ordered=True,
)

print(titanic["age_group"].cat.categories)  # declared order of the categories
print(titanic["age_group"].unique())        # labels that actually occur in the data

# Group rows follow the category order set above rather than alphabetical order.
# observed=False is spelled out because grouping by a categorical key without it
# raises a FutureWarning on recent pandas (the implicit default is changing to
# True); passing it keeps today's output stable across versions.
print(titanic.groupby("age_group", observed=False)["sex"].value_counts())

## Filtering categorical variables

Using `.str` converts the column's type to `object`, so after a `.str` transformation always change the type back to `category`.

In [None]:
# Single value: row label 501, column age_group.
print("Select One: ", titanic.loc[501, "age_group"])

# Boolean mask reused by the next two selections.
is_young = titanic['age_group'] == 'young'

# Breakdown by sex of the passengers in the 'young' group.
print("Frequency: ", titanic.loc[is_young, 'sex'].value_counts(), sep='\n',)

# Average age within the 'young' group only.
print("Mean: ", titanic.loc[is_young, "age"].mean())

print(titanic.shape)

# Substring match: regex=False treats "yo" as literal text, much like
# String.prototype.includes in JavaScript.
has_yo = titanic["age_group"].str.contains("yo", regex=False)
print("Like Matching: ", titanic[has_yo].shape)