# Categorical Data in Pandas

## How is a variable translated into categorical structure?

In [1]:
import pandas as pd 
import numpy as np

In [2]:
data=pd.Series(["Tim","Tom","Sam","Sam"]*3)
data

0     Tim
1     Tom
2     Sam
3     Sam
4     Tim
5     Tom
6     Sam
7     Sam
8     Tim
9     Tom
10    Sam
11    Sam
dtype: object

In [3]:
pd.unique(data)

array(['Tim', 'Tom', 'Sam'], dtype=object)

In [4]:
pd.value_counts(data)

Sam    6
Tom    3
Tim    3
dtype: int64

In [5]:
values=pd.Series([0,1,0,0]*3)

In [6]:
names=pd.Series(["Tim","Sam"])
names.take(values)

0    Tim
1    Sam
0    Tim
0    Tim
0    Tim
1    Sam
0    Tim
0    Tim
0    Tim
1    Sam
0    Tim
0    Tim
dtype: object

## Categorical Type in Pandas

In [7]:
data

0     Tim
1     Tom
2     Sam
3     Sam
4     Tim
5     Tom
6     Sam
7     Sam
8     Tim
9     Tom
10    Sam
11    Sam
dtype: object

In [8]:
N=len(data)

In [9]:
df=pd.DataFrame(
    {"name":data,
     "num":np.arange(N),
     "score":np.random.randint(40,100,
                               size=N),
     "weight":np.random.uniform(50,70,
                                size=N)},
    columns=["num","name","score","weight"])

In [10]:
df

Unnamed: 0,num,name,score,weight
0,0,Tim,90,58.608318
1,1,Tom,99,67.616725
2,2,Sam,70,58.181046
3,3,Sam,96,56.833079
4,4,Tim,82,55.952711
5,5,Tom,89,52.296487
6,6,Sam,97,53.203579
7,7,Sam,96,63.967189
8,8,Tim,45,57.324508
9,9,Tom,57,58.393265


In [11]:
df["name"]

0     Tim
1     Tom
2     Sam
3     Sam
4     Tim
5     Tom
6     Sam
7     Sam
8     Tim
9     Tom
10    Sam
11    Sam
Name: name, dtype: object

In [12]:
type(df["name"])

pandas.core.series.Series

In [13]:
name_cat=df["name"].astype("category")
name_cat

0     Tim
1     Tom
2     Sam
3     Sam
4     Tim
5     Tom
6     Sam
7     Sam
8     Tim
9     Tom
10    Sam
11    Sam
Name: name, dtype: category
Categories (3, object): ['Sam', 'Tim', 'Tom']

In [14]:
x=name_cat.values

In [15]:
x.categories

Index(['Sam', 'Tim', 'Tom'], dtype='object')

In [16]:
x.codes

array([1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0], dtype=int8)

In [17]:
df["name"]=df["name"].astype("category")
df.name

0     Tim
1     Tom
2     Sam
3     Sam
4     Tim
5     Tom
6     Sam
7     Sam
8     Tim
9     Tom
10    Sam
11    Sam
Name: name, dtype: category
Categories (3, object): ['Sam', 'Tim', 'Tom']

In [18]:
data_cat=pd.Categorical(list("abcde"))
data_cat

['a', 'b', 'c', 'd', 'e']
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [19]:
pd.Categorical(["banana", "apple", 
                "kiwi", "banana", "apple"])

['banana', 'apple', 'kiwi', 'banana', 'apple']
Categories (3, object): ['apple', 'banana', 'kiwi']

In [20]:
people=["baby", "child", "young", "old"]
codes=[0,1,2,3,1,0,0]
people_cat=pd.Categorical.from_codes(
    codes,people)
people_cat

['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']
Categories (4, object): ['baby', 'child', 'young', 'old']

In [21]:
people_cat=pd.Categorical.from_codes(
    codes,people,ordered=True)
people_cat

['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']
Categories (4, object): ['baby' < 'child' < 'young' < 'old']

In [22]:
people_cat.as_ordered()

['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']
Categories (4, object): ['baby' < 'child' < 'young' < 'old']

## Working with Categorical

In [23]:
data=np.random.randn(1000)

In [24]:
interval=pd.qcut(data,4)
interval

[(-2.9739999999999998, -0.668], (0.735, 3.402], (0.735, 3.402], (0.00973, 0.735], (-2.9739999999999998, -0.668], ..., (-0.668, 0.00973], (0.735, 3.402], (-0.668, 0.00973], (0.735, 3.402], (-2.9739999999999998, -0.668]]
Length: 1000
Categories (4, interval[float64]): [(-2.9739999999999998, -0.668] < (-0.668, 0.00973] < (0.00973, 0.735] < (0.735, 3.402]]

In [25]:
type(interval)

pandas.core.arrays.categorical.Categorical

In [26]:
interval=pd.qcut(data,4,labels=["Q1","Q2",
                                "Q3","Q4"])
interval

['Q1', 'Q4', 'Q4', 'Q3', 'Q1', ..., 'Q2', 'Q4', 'Q2', 'Q4', 'Q1']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [27]:
interval=pd.Series(interval,name="quarter")

In [28]:
pd.Series(
    data).groupby(
    interval).agg(["count",
                   "min",
                   "max"]).reset_index()

Unnamed: 0,quarter,count,min,max
0,Q1,250,-2.973073,-0.669205
1,Q2,250,-0.667556,0.009061
2,Q3,250,0.010393,0.733632
3,Q4,250,0.737847,3.402463


## 3- How is the performance of categorical types?

In [29]:
N=10000000
num=pd.Series(np.random.randn(N))

In [30]:
label=pd.Series(["a","b","c","d"]*(N//4))

In [31]:
cat=label.astype("category")

In [32]:
label.memory_usage()

80000128

In [33]:
cat.memory_usage()

10000320

## 4- What are categorical methods?

In [34]:
s=pd.Series(["a","b","c","d"]*2)

In [35]:
s_ct=s.astype("category")
s_ct

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [36]:
s_ct.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [37]:
s_ct.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [38]:
new_ct=["a","b","c","d","e"]
s_ct.cat.set_categories(new_ct)

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [39]:
s2_ct=s_ct[s_ct.isin(["a","b"])]
s2_ct             

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [40]:
s2_ct.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

## 5- How to create a dummy variable?

In [41]:
s_ct

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [42]:
pd.get_dummies(s_ct)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1
