In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas.api.types import CategoricalDtype
# Show NumPy floats with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

# Report the installed pandas version.
print(pd.__version__)

1.3.4


# 1. Text

## 1.1. Tạo dữ liệu string trong Pandas

In [2]:
# No dtype given: pandas decides on its own how to store the strings.
pd.Series(data=['Hoàng', 'Nam', 'Thắng', 'Vân'])

0    Hoàng
1      Nam
2    Thắng
3      Vân
dtype: object

In [3]:
# Request the dedicated string extension dtype through its 'string' alias.
pd.Series(data=['Hoàng', 'Nam', 'Thắng', 'Vân'], dtype='string')

0    Hoàng
1      Nam
2    Thắng
3      Vân
dtype: string

In [4]:
# Same result as the 'string' alias, this time passing a StringDtype instance.
pd.Series(data=['Hoàng', 'Nam', 'Thắng', 'Vân'], dtype=pd.StringDtype())

0    Hoàng
1      Nam
2    Thắng
3      Vân
dtype: string

In [6]:
# The np.nan entry is displayed as <NA> once the Series uses the string dtype.
ds_hv = pd.Series(['Hoàng', 'Nam', 'Thắng', np.nan, 'Vân'], dtype='string')
ds_hv

0    Hoàng
1      Nam
2    Thắng
3     <NA>
4      Vân
dtype: string

## 1.2. Một số thao tác trên string

In [15]:
# Count how often the pattern 'Nam' occurs in each name; the missing entry stays <NA>.
ds_hv.str.count('Nam')

0       0
1       1
2       0
3    <NA>
4       0
dtype: Int64

In [16]:
# Upper-case every name (accented letters included); the missing entry stays <NA>.
ds_hv.str.upper()

0    HOÀNG
1      NAM
2    THẮNG
3     <NA>
4      VÂN
dtype: string

In [17]:
# Lower-case every name; the missing entry stays <NA>.
ds_hv.str.lower()

0    hoàng
1      nam
2    thắng
3     <NA>
4      vân
dtype: string

**Bỏ các item khuyết**

In [8]:
# Names with messy separators: a doubled space, a hyphen, an underscore,
# leading blanks, plus one missing value.
ds_hv = pd.Series(
    ['Hoàng  Minh', 'Văn-Nam', 'Quốc_Thắng', np.nan, '  Hồng Vân'],
    dtype='string',
)
ds_hv

0    Hoàng  Minh
1        Văn-Nam
2     Quốc_Thắng
3           <NA>
4       Hồng Vân
dtype: string

In [9]:
# Drop the missing entry; the remaining rows keep their original labels (0, 1, 2, 4).
ds_hv.dropna()

0    Hoàng  Minh
1        Văn-Nam
2     Quốc_Thắng
4       Hồng Vân
dtype: string

**Thay thế ký tự/chuỗi con**

In [32]:
# Normalise the names: drop missing values, trim both ends, lower-case, then
# turn every run of hyphens, underscores or whitespace into a single space.
# One explicit regex replaces the three chained literal replacements, so runs
# longer than two blanks are collapsed as well and the result no longer
# depends on the pandas-version-specific default of `regex`.
(
    ds_hv
    .dropna()
    .str.strip()
    .str.lower()
    .str.replace(r'[-_\s]+', ' ', regex=True)
)

0    hoàng minh
1       văn nam
2    quốc thắng
4      hồng vân
dtype: string

# 2. Datetime

In [10]:
# Six consecutive calendar days starting on 2021-10-01.
dates = pd.date_range(start="20211001", periods=6)
dates

DatetimeIndex(['2021-10-01', '2021-10-02', '2021-10-03', '2021-10-04',
               '2021-10-05', '2021-10-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Draw the demo values from a seeded generator so that "Restart & Run All"
# reproduces the same 6x4 table every time; the global np.random state is
# left untouched.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2021-10-01,1.46951,0.111165,-0.777965,0.448555
2021-10-02,0.754355,-2.109391,0.358638,0.912181
2021-10-03,-1.772081,1.502868,-0.543024,-1.886569
2021-10-04,1.654469,0.42228,0.270141,-0.788358
2021-10-05,-1.471165,1.602458,-0.100699,-0.010625
2021-10-06,1.207255,0.055466,-0.564779,-0.132038


**Một số cách chuyển đổi từ text và dữ liệu số sang datetime**

In [12]:
# The dates are plain strings at this point.
df2 = pd.DataFrame({
    'date': ['3/10/2000', '3/11/2000', '3/12/2000'],
    'value': [2, 3, 4],
})
df2

Unnamed: 0,date,value
0,3/10/2000,2
1,3/11/2000,3
2,3/12/2000,4


In [13]:
# Interpret the text as day/month/year: '3/10/2000' becomes 2000-10-03.
df2['date'] = pd.to_datetime(
    df2['date'],
    dayfirst=True,
)
df2

Unnamed: 0,date,value
0,2000-10-03,2
1,2000-11-03,3
2,2000-12-03,4


In [14]:
# The custom format reads the fields as year-day-month, so '2016-6-10 ...'
# is parsed as 6 October 2016.
df3 = pd.DataFrame({
    'date': ['2016-6-10 20:30:0', '2016-7-1 19:45:30', '2013-10-12 4:5:1'],
    'value': [2, 3, 4],
}).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%d-%m %H:%M:%S")
)
df3

Unnamed: 0,date,value
0,2016-10-06 20:30:00,2
1,2016-01-07 19:45:30,3
2,2013-12-10 04:05:01,4


In [15]:
# to_datetime assembles one timestamp per row from columns named after the
# date components (year, month, day, hour, minute).
df4 = pd.DataFrame({
    'year': [2015, 2015, 2016],
    'month': [2, 3, 3],
    'day': [4, 5, 8],
    'hour': [12, 15, 11],
    'minute': [30, 45, 50],
}).assign(date=lambda parts: pd.to_datetime(parts))
df4

Unnamed: 0,year,month,day,hour,minute,date
0,2015,2,4,12,30,2015-02-04 12:30:00
1,2015,3,5,15,45,2015-03-05 15:45:00
2,2016,3,8,11,50,2016-03-08 11:50:00


**Select các dòng dữ liệu theo thời gian**

In [17]:
# Move the 'date' column into the index so rows can be selected by time.
# NOTE(review): df4 is rebound to the re-indexed frame, so re-running this cell
# without first re-creating df4 fails because 'date' is no longer a column.
df4 = df4.set_index('date')
df4

Unnamed: 0_level_0,year,month,day,hour,minute
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-04 12:30:00,2015,2,4,12,30
2015-03-05 15:45:00,2015,3,5,15,45
2016-03-08 11:50:00,2016,3,8,11,50


In [20]:
# Partial-string indexing on the datetime index: '2015' selects every row dated in 2015.
df4.loc['2015']

Unnamed: 0_level_0,year,month,day,hour,minute
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-04 12:30:00,2015,2,4,12,30
2015-03-05 15:45:00,2015,3,5,15,45


# 3. Dữ liệu dạng categorical

In [21]:
# dtype='category' lets pandas derive the (unordered) categories from the values.
sc1 = pd.Series(
    ['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad'],
    dtype='category',
)
sc1

0     very bad
1         good
2          bad
3         good
4    very good
5         good
6          bad
dtype: category
Categories (4, object): ['bad', 'good', 'very bad', 'very good']

In [22]:
# Build the categorical array directly; the categories are inferred from the values.
pd.Categorical(
    ['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad']
)

['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad']
Categories (4, object): ['bad', 'good', 'very bad', 'very good']

In [23]:
# Spell out the categories and mark them as ordered:
# 'very bad' < 'bad' < 'good' < 'very good'.
sc2 = pd.Categorical(
    ['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad'],
    categories=['very bad', 'bad', 'good', 'very good'],
    ordered=True,
)
sc2

['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad']
Categories (4, object): ['very bad' < 'bad' < 'good' < 'very good']

**Tự định nghĩa một loại dữ liệu Categorical**

In [63]:
# Define a reusable ordered categorical dtype and apply it with astype.
# CategoricalDtype is imported in the notebook's top import cell, so this
# cell no longer hides a dependency in the middle of the notebook.
sc3 = pd.Series(['very bad', 'good', 'bad', 'good', 'very good', 'good', 'bad'])
cat_type = CategoricalDtype(
    categories=['very bad', 'bad', 'good', 'very good'],
    ordered=True,
)
# Display-only conversion: the result is not assigned, so sc3 is unchanged here.
sc3.astype(cat_type)

0     very bad
1         good
2          bad
3         good
4    very good
5         good
6          bad
dtype: category
Categories (4, object): ['very bad' < 'bad' < 'good' < 'very good']

**Sort dữ liệu với chỉ định thứ tự các category**

In [47]:
# sc1 is an unordered categorical, so sorting follows the category order shown
# in its dtype ('bad', 'good', 'very bad', 'very good'), not a quality ranking.
sc1.sort_values()

2          bad
6          bad
1         good
3         good
5         good
0     very bad
4    very good
dtype: category
Categories (4, object): ['bad', 'good', 'very bad', 'very good']

In [50]:
# sc2 is an ordered categorical; wrapped in a Series it sorts by its declared
# order 'very bad' < 'bad' < 'good' < 'very good'.
pd.Series(sc2).sort_values()

0     very bad
2          bad
6          bad
1         good
3         good
5         good
4    very good
dtype: category
Categories (4, object): ['very bad' < 'bad' < 'good' < 'very good']

In [65]:
# Store the ordered categorical dtype on sc3, then sort by the order declared in cat_type.
sc3 = sc3.astype(cat_type)
sc3.sort_values()

0     very bad
2          bad
6          bad
1         good
3         good
5         good
4    very good
dtype: category
Categories (4, object): ['very bad' < 'bad' < 'good' < 'very good']