# Configuring pandas

In [1]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# SEt some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_column', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 65)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# Creating Categoricals

In [2]:
# create a categorical directly from a list
lmh_values = ["low", "high", "medium", "medium", "high"]
lmh_cat = pd.Categorical(lmh_values)
lmh_cat

[low, high, medium, medium, high]
Categories (3, object): [high, low, medium]

In [3]:
# examing the categories
lmh_cat.categories

Index(['high', 'low', 'medium'], dtype='object')

In [8]:
# 여기는 실행 에러가 난다 ㅠㅠㅠㅠ 버전문제인가..?
lmh_cat.get_values

AttributeError: 'Categorical' object has no attribute 'get_values'

In [6]:
# .codes shows the integer mapping for each value of the categorical
lmh_cat.codes

array([1, 0, 2, 2, 0], dtype=int8)

In [7]:
# create from list but explicitly state the categories
# 카테고리의 순서를 지정해준다.
lmh_cat1 = pd.Categorical(lmh_values,
                         categories=["low","medium","high"])
lmh_cat1

[low, high, medium, medium, high]
Categories (3, object): [low, medium, high]

In [9]:
# 순서가 쨔쟌 바뀜!
lmh_cat1.codes

array([0, 2, 1, 1, 2], dtype=int8)

In [10]:
lmh_cat.sort_values()

[high, high, low, medium, medium]
Categories (3, object): [high, low, medium]

In [11]:
lmh_cat1.sort_values()

[low, medium, medium, high, high]
Categories (3, object): [low, medium, high]

In [12]:
# create a categorical using a Series and dtype
cat_series = pd.Series(lmh_values, dtype='category')
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [13]:
# create a categorical using .astype()
s = pd.Series(lmh_values)
as_cat = s.astype('category')
as_cat

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [14]:
# a categorical has a .cat property that lets you access info
cat_series.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x000002930024FAC8>

In [15]:
# 시리즈 객체의 데이터 접근을 위해서는 accessor를 설정해주어야 한다.
cat_series.cat.categories

Index(['high', 'low', 'medium'], dtype='object')

In [28]:
cat_series

'high'

In [29]:
# create a DataFrame of 100 values
np.random.seed(123456)
values = np.random.randint(0, 100, 5)
bins = pd.DataFrame({'Values':values})
bins

   Values
0      65
1      49
2      56
3      43
4      43

In [30]:
# cut the values into
bins['Group'] = pd.cut(values, range(0, 101, 10)) # cut() 특정 범위에 포함된 객체 구간을 만든다. 순서를 지정한다.
bins

   Values     Group
0      65  (60, 70]
1      49  (40, 50]
2      56  (50, 60]
3      43  (40, 50]
4      43  (40, 50]

In [31]:
bins.Group

0    (60, 70]
1    (40, 50]
2    (50, 60]
3    (40, 50]
4    (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [35]:
# 아하 컷 레인지는 시퀀스 자료형이면 되는구나!
bins['Group'] = pd.cut(values, [0, 20, 40, 70])
bins

   Values     Group
0      65  (40, 70]
1      49  (40, 70]
2      56  (40, 70]
3      43  (40, 70]
4      43  (40, 70]

In [36]:
# create an ordered categorical of precious metals
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values,
                       categories=metal_categories)
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze, silver, gold]

In [37]:
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values,
                       categories=metal_categories,
                       ordered=True) # categorical 순서
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [38]:
# 그럼 카테고리에 속하지 않는 밸류값이 들어오면 어떻게할까?
# > 그건 나중에 해보자...!

In [39]:
# 역순 정렬
metal_values = ['bronze', 'gold', 'silver', 'bronze'][::-1]
metal_categories = ['bronze', 'silver', 'gold']
reversed_categories = reversed(metal_categories)
metals_r = pd.Categorical(metal_values,
                       categories=metal_categories,
                       ordered=True) # categorical 순서
metals

[bronze, gold, silver, bronze]
Categories (3, object): [gold < silver < bronze]

# Renaming Categories

In [42]:
cat1 = pd.Categorical(['a', 'b', 'c', 'a'],
                    categories=['a', 'b', 'c'])
cat1

[a, b, c, a]
Categories (3, object): [a, b, c]

In [43]:
cat1.categories = ['bronze', 'silver', 'gold']
cat1

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [44]:
cat1

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [48]:
# 원본은 그대로 유지한 채, 카테고리의 속성만 바꾸는 방법. - Rename
# 우오오오 신기하다
cat.rename_categories(['x', 'y', 'z'])

[x, y, z, x]
Categories (3, object): [x, y, z]

In [47]:
cat1

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

# Appending new categories

In [49]:
# ordered=True로 지정했기 때문에 카테고리가 출력된다.
with_platinum = metals.add_categories(['platinum'])
with_platinum

[bronze, gold, silver, bronze]
Categories (4, object): [gold < silver < bronze < platinum]

In [50]:
with_platinum

[bronze, gold, silver, bronze]
Categories (4, object): [gold < silver < bronze < platinum]

# Removing Categories

In [51]:
no_bronze = metals.remove_categories('bronze')
no_bronze

[NaN, gold, silver, NaN]
Categories (2, object): [gold < silver]

In [53]:
metals.remove_categories('bronze')

[NaN, gold, silver, NaN]
Categories (2, object): [gold < silver]

In [54]:
# 역시 원본은 보존한다.
metals.categories

Index(['gold', 'silver', 'bronze'], dtype='object')

# Removing unused categories

In [55]:
with_platinum.remove_unused_categories() # 미사용 범주 삭제

[bronze, gold, silver, bronze]
Categories (3, object): [gold < silver < bronze]

In [56]:
with_platinum

[bronze, gold, silver, bronze]
Categories (4, object): [gold < silver < bronze < platinum]

# Settings categories

In [57]:
# sample Series
s = pd.Series(['one', 'two', 'four', 'five'], dtype='category')
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): [five, four, one, two]

In [58]:
s = s.cat.set_categories(['one', 'four'])
s

0     one
1     NaN
2    four
3     NaN
dtype: category
Categories (2, object): [one, four]

# Describe

In [59]:
# get descriptive info on the metals categorical
metals.describe()

            counts  freqs
categories               
gold             1   0.25
silver           1   0.25
bronze           2   0.50

In [62]:
metals.categories = reversed(metals.categories)
metals.categories

Index(['bronze', 'silver', 'gold'], dtype='object')

In [63]:
# 아 이렇게하면 bronze였던 카테고리들이 전부 바뀌는구나..! rename을 써야겠다.
metals.describe()

            counts  freqs
categories               
bronze           1   0.25
silver           1   0.25
gold             2   0.50

In [65]:
metals.value_counts()

bronze    1
silver    1
gold      2
dtype: int64

In [99]:
metals

[bronze, gold, silver, bronze]
Categories (3, object): [gold < silver < bronze]

In [98]:
metals.categories

Index(['gold', 'silver', 'bronze'], dtype='object')

In [96]:
r_categories = metals.categories[::-1]
r_categories

Index(['bronze', 'silver', 'gold'], dtype='object')

In [97]:
metals.rename_categories(r_categories)
metals.categories

Index(['gold', 'silver', 'bronze'], dtype='object')

In [105]:
metals.categories = metals.categories
metals.values = metal_values
metals

[bronze, gold, silver, bronze]
Categories (3, object): [gold < silver < bronze]

In [107]:
metals.categories = metals.categories[::-1]
metals

[gold, bronze, silver, gold]
Categories (3, object): [bronze < silver < gold]

# Value counts

In [71]:
# value counts 는 결측치를 빼고 제공한다.
s.value_counts()

four    1
one     1
dtype: int64

# minimum, Maximum and mode

In [83]:
(metals.min(), metals.max(), metals.mode())

('gold',
 'bronze',
 [bronze]
 Categories (3, object): [gold < silver < bronze])

In [108]:
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values,
                       categories=metal_categories,
                       ordered=True) # categorical 순서
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [110]:
metals

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [111]:
# 역순 정렬
metal_values = ['bronze', 'gold', 'silver', 'bronze'][::-1]
metal_categories = ['bronze', 'silver', 'gold']
reversed_categories = reversed(metal_categories)
metals_r = pd.Categorical(metal_values,
                       categories=metal_categories,
                       ordered=True) # categorical 순서
metals_r

[bronze, silver, gold, bronze]
Categories (3, object): [bronze < silver < gold]

In [112]:
metals <= metals_r

array([ True, False,  True,  True])

In [113]:
metals.codes

array([0, 2, 1, 0], dtype=int8)

In [114]:
metals_r.codes

array([0, 1, 2, 0], dtype=int8)

# Munging school grades

In [118]:
np.random.seed(123456)
names = ['Ivana', 'Norris', 'Ruth', 'Lane', 'Skye', 'Sol',
        'Dylan', 'Katina', 'Alissa', "Marc"]
grades = np.random.randint(50, 101, len(names))
scores = pd.DataFrame({'Name':names, 'Grade':grades})
scores

     Name  Grade
0   Ivana     51
1  Norris     92
2    Ruth    100
3    Lane     99
4    Skye     93
5     Sol     97
6   Dylan     93
7  Katina     77
8  Alissa     82
9    Marc     73

In [119]:
# bins and their mappings to letter grades
score_bins = [0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]

letter_grades = ['F-', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+',
                'A-', 'A', 'A+']

In [120]:
# cut based upon the bins and assign the letter grade
letter_cats = pd.cut(scores.Grade, score_bins, labels=letter_grades)
scores['Letter'] = letter_cats
scores

     Name  Grade Letter
0   Ivana     51     F-
1  Norris     92     A-
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
7  Katina     77     C+
8  Alissa     82     B-
9    Marc     73      C

In [121]:
letter_cats

0    F-
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Grade, dtype: category
Categories (13, object): [F- < D- < D < D+ ... B+ < A- < A < A+]

In [122]:
letter_cats.value_counts()

A     4
A+    1
A-    1
B-    1
C+    1
     ..
B     0
C-    0
D+    0
D     0
D-    0
Name: Grade, Length: 13, dtype: int64

In [124]:
scores.sort_values(by=['Letter'], ascending=False)

     Name  Grade Letter
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
1  Norris     92     A-
8  Alissa     82     B-
7  Katina     77     C+
9    Marc     73      C
0   Ivana     51     F-