In [1]:
import numpy as np
import pandas as pd

## pandas

In [22]:
df = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo',
                    'G' : ['1000', '', '123', '1234',]})

In [23]:
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,1000.0
1,1.0,2013-01-02,1.0,3,train,foo,
2,1.0,2013-01-02,1.0,3,test,foo,123.0
3,1.0,2013-01-02,1.0,3,train,foo,1234.0


## NaN処理

In [24]:
# NaNであるか
df.isna()

Unnamed: 0,A,B,C,D,E,F,G
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False


In [31]:
# 全項目NaN化
df[df.isna()]

Unnamed: 0,A,B,C,D,E,F,G
0,,NaT,,,,,
1,,NaT,,,,,
2,,NaT,,,,,
3,,NaT,,,,,


In [32]:
# blankがNaN
df[~df.isna()]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,1000.0
1,1.0,2013-01-02,1.0,3,train,foo,
2,1.0,2013-01-02,1.0,3,test,foo,123.0
3,1.0,2013-01-02,1.0,3,train,foo,1234.0


## replace NaN

In [26]:
# EX) df.replace([-1, 999, 1000], np.nan)
df = df.replace('', np.nan)
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,1000.0
1,1.0,2013-01-02,1.0,3,train,foo,
2,1.0,2013-01-02,1.0,3,test,foo,123.0
3,1.0,2013-01-02,1.0,3,train,foo,1234.0


## NaNを0,000に変換

In [7]:
df[df['G'].isnull()].replace(np.nan, '0,000')

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,0


# 関数処理まとめ

In [37]:
df = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo',
                    'G' : ['1000', '', '123', '1234',]})

## 数字xx桁にして返す関数

In [53]:
def rep_digit_num(df, before_col='code', after_col='L_code', digit=5):
    """
	digit桁にした値で返す
	先頭を0埋めにし、(np.str)でキャストしておく.
    csv出力した時に先頭の0が消滅するのを防ぐ
    
    ExSample:
    	123  ==>> 00123
        1234 ==>> 01234
    
    params:
    	df(DataFrame)	: 処理してるデータ
        before_col(str) : 処理対象col
        after_col(str)  : 処理したcol
        digit(int)		: repしたい桁数(default: 5)
    
    return:
    	after_col(np.str)digit桁にした値を返す
    """
    
    df[after_col] = df.apply(lambda row: f"{row[before_col]}".zfill(digit), axis=1).astype(np.str)    
    return df

In [54]:
rep_digit_num(df, before_col='G', after_col='G_rep', digit=5)

Unnamed: 0,A,B,C,D,E,F,G,G_rep
0,1.0,2013-01-02,1.0,3,test,foo,1000.0,1000
1,1.0,2013-01-02,1.0,3,train,foo,,0
2,1.0,2013-01-02,1.0,3,test,foo,123.0,123
3,1.0,2013-01-02,1.0,3,train,foo,1234.0,1234


## 数値表記に変換

In [98]:
def rep_digit_num(df, before_col='code', after_col='L_code', digit=5):
    """
	値段表記に変換する
    
    ExSample:
    	123.0  ==>> 123
        1234.0 ==>> 1,234
        Nan	   ==>> 0,000
    
    params:
    	df(DataFrame)	: 処理してるデータ
        before_col(str) : 処理対象col
        after_col(str)  : 処理したcol
        
    
    return:
    	after_col(np.str)digit桁にした値を返す
    """
    filter_ = df[~df[before_col].isnull()]
    filter_ = filter_[before_col].astype(np.int)
    
    df[after_col] = filter_.apply(lambda row: f"{row:,}".replace('nan', '0,000'))
#     df[after_col] = df.apply(lambda row: f"{row[before_col]:,}", axis=1).replace('nan', '0,000', regex=True)
    
    return df

In [99]:
rep_digit_num(df, before_col='G', after_col='G_L_num')

Unnamed: 0,A,B,C,D,E,F,G,G_rep,G_L_num
0,1.0,2013-01-02,1.0,3,test,foo,1000.0,1000,1000.0
1,1.0,2013-01-02,1.0,3,train,foo,,0,
2,1.0,2013-01-02,1.0,3,test,foo,123.0,123,123.0
3,1.0,2013-01-02,1.0,3,train,foo,1234.0,1234,1234.0


In [14]:
filter_ = df[df['G'].isnull()]

In [15]:
filter_

Unnamed: 0,A,B,C,D,E,F,G,L_code,G_L
1,1.0,2013-01-02,1.0,3,train,foo,,00nan,


In [18]:
filter_ = df[~df['G'].isnull()]
filter_ = filter_['G'].astype(np.int)

In [19]:
filter_

0    1000
2     123
3    1234
Name: G, dtype: int64

### pandasで各列の値をすべてハッシュ化したいときのプログラム

In [25]:
import hashlib

df['hash_G_L'] = df['G_L'].astype('str').apply(lambda n: hashlib.sha256(n.encode()).hexdigest())

In [27]:
df['hash_G_L']

0    66dbae68e4f498ba668b52b481891465277f250cd3cb3c...
1    9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...
2    a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa0...
3    d403b4da041f732a36c3b078cfd3dc607544f2cb4355e7...
Name: hash_G_L, dtype: object