In [2]:
# Load libraries
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# IPython magic: set the floating-point display precision to 3 digits
%precision 3

# For visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Apply seaborn's default plot styling
sns.set()
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline

欠損データの取り扱い

In [3]:
# Build a sample DataFrame that contains missing values

from numpy import nan as NA

# Fixed seed so the generated values are the same on every run
random.seed(0)

# 10 rows x 4 columns of uniform random numbers in [0, 1)
df = DataFrame(random.rand(10, 4))

# Overwrite selected cells with NaN
df.iat[1, 0] = NA      # row 1, column 0
df.iloc[2, 2] = NA     # row 2, column 2
df.iloc[5:, 3] = NA    # rows 5 through the last row, column 3

df

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,,0.645894,0.437587,0.891773
2,0.963663,0.383442,,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,
6,0.118274,0.639921,0.143353,
7,0.521848,0.414662,0.264556,
8,0.45615,0.568434,0.01879,
9,0.612096,0.616934,0.943748,


In [4]:
# Listwise deletion: drop every row that contains at least one NaN
# (axis=0 -> operate on rows, how='any' -> a single NaN is enough to drop the row)
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012


In [5]:
# Pairwise deletion: ignore the columns with missing data and use only the available data
# Here only columns 0 and 1 are selected before the rows containing NaN are dropped
df.loc[:, [0, 1]].dropna()

Unnamed: 0,0,1
0,0.548814,0.715189
2,0.963663,0.383442
3,0.568045,0.925597
4,0.020218,0.83262
5,0.978618,0.799159
6,0.118274,0.639921
7,0.521848,0.414662
8,0.45615,0.568434
9,0.612096,0.616934


In [6]:
# Fill with fillna: replace NaN with another value
# In this example every NaN is replaced with 0
df.fillna(value=0)

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.0,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.0,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.0
6,0.118274,0.639921,0.143353,0.0
7,0.521848,0.414662,0.264556,0.0
8,0.45615,0.568434,0.01879,0.0
9,0.612096,0.616934,0.943748,0.0


In [7]:
# Forward fill: replace each NaN with the preceding value in the same column
# (used, for example, with financial time-series data)
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.548814,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.437587,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.870012
6,0.118274,0.639921,0.143353,0.870012
7,0.521848,0.414662,0.264556,0.870012
8,0.45615,0.568434,0.01879,0.870012
9,0.612096,0.616934,0.943748,0.870012


In [8]:
# Fill with the mean (mean imputation)
# Caution: with time-series data the mean may include future information

# Per-column means (printed for verification)
column_means = df.mean()
print(column_means)

# Replace each NaN with the mean of its own column
df.fillna(column_means)

0    0.531970
1    0.654185
2    0.413497
3    0.584539
dtype: float64


Unnamed: 0,0,1,2,3
0,0.548814,0.715189,0.602763,0.544883
1,0.53197,0.645894,0.437587,0.891773
2,0.963663,0.383442,0.413497,0.528895
3,0.568045,0.925597,0.071036,0.087129
4,0.020218,0.83262,0.778157,0.870012
5,0.978618,0.799159,0.461479,0.584539
6,0.118274,0.639921,0.143353,0.584539
7,0.521848,0.414662,0.264556,0.584539
8,0.45615,0.568434,0.01879,0.584539
9,0.612096,0.616934,0.943748,0.584539


異常データの取り扱い

時系列データの取り扱い

In [13]:
import pandas_datareader.data as pdr

In [14]:
# Date range requested from the data source
start_date = '2001/1/2'
end_date = '2016/12/30'

# Download the 'DEXJPUS' series from the 'fred' data source for that range
# (requires network access)
fx_jpusdata = pdr.DataReader(
    'DEXJPUS',
    data_source='fred',
    start=start_date,
    end=end_date,
)

In [15]:
# Preview the first five rows of the downloaded series
fx_jpusdata.head(n=5)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,114.73
2001-01-03,114.26
2001-01-04,115.47
2001-01-05,116.19
2001-01-08,115.97


In [16]:
# Look up the data for a specific year and month
# Row selection by partial date string must go through .loc: using plain []
# with a date string to select rows of a DataFrame was deprecated in pandas 1.2
# and removed in pandas 2.0 (where it is treated as a column lookup).
fx_jpusdata.loc['2016-04']

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2016-04-01,112.06
2016-04-04,111.18
2016-04-05,110.26
2016-04-06,109.63
2016-04-07,107.98
2016-04-08,108.36
2016-04-11,107.96
2016-04-12,108.54
2016-04-13,109.21
2016-04-14,109.2


In [17]:
# Extract only the month-end rate
# Resample into monthly bins and keep the last observation of each bin
(
    fx_jpusdata
    .resample('M')
    .last()
    .head()
)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-31,116.39
2001-02-28,117.28
2001-03-31,125.54
2001-04-30,123.57
2001-05-31,118.88


In [18]:
# Handling data that has gaps
# Data with missing values: resampling to a daily frequency leaves NaN on
# days that have no observation
(
    fx_jpusdata
    .resample('D')
    .last()
    .head()
)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,114.73
2001-01-03,114.26
2001-01-04,115.47
2001-01-05,116.19
2001-01-06,


In [19]:
# Fill the gaps with the previous value (forward fill on the daily grid)
(
    fx_jpusdata
    .resample('D')
    .ffill()
    .head()
)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,114.73
2001-01-03,114.26
2001-01-04,115.47
2001-01-05,116.19
2001-01-06,116.19


In [20]:
# Shift the data to compute a ratio
# Each row receives the value of the row before it; the first row becomes NaN
fx_jpusdata.shift(periods=1).head()

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,
2001-01-03,114.73
2001-01-04,114.26
2001-01-05,115.47
2001-01-08,116.19


In [21]:
# Ratio of each observation to the preceding observation
# (the first row has no predecessor, so its ratio is NaN)
fx_jpusdata_ratio = fx_jpusdata.div(fx_jpusdata.shift(periods=1))
fx_jpusdata_ratio.head()

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,
2001-01-03,0.995903
2001-01-04,1.01059
2001-01-05,1.006235
2001-01-08,0.998107


In [22]:
# Moving average
# Build a 3-day moving average line (window of 3 consecutive rows;
# the first two rows are NaN because the window is not yet full)
(
    fx_jpusdata
    .rolling(window=3)
    .mean()
    .head()
)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,
2001-01-03,
2001-01-04,114.82
2001-01-05,115.306667
2001-01-08,115.876667


In [23]:
# Use std to track how the standard deviation changes over time
# (same 3-row window as the moving average)
(
    fx_jpusdata
    .rolling(window=3)
    .std()
    .head()
)

Unnamed: 0_level_0,DEXJPUS
DATE,Unnamed: 1_level_1
2001-01-02,
2001-01-03,
2001-01-04,0.61
2001-01-05,0.975312
2001-01-08,0.368963
