In [11]:
# ライブラリのインポート
import numpy as np
import pandas as pd

In [12]:
# 「計算速度を計測する関数」を定義
import time
def measure_time(start_time=''):
    """Return a timestamp, or the time elapsed since a previous timestamp.

    Args:
        start_time: Value previously returned by ``measure_time()``. The
            default empty string acts as the "no start time given" sentinel.

    Returns:
        ``time.time()`` when no start time is given; otherwise the number of
        seconds between ``start_time`` and now.
    """
    now = time.time()
    return now if start_time == '' else now - start_time

# Smoke test for the timing helper: time a 3-second sleep and print the result.
# The printed duration should be roughly 3 seconds.
is_debug = True  # set to False to skip the 3-second check
if is_debug:
    start_time = measure_time()
    time.sleep(3)
    duration_time = measure_time(start_time)
    print(duration_time)

3.0031070709228516


In [13]:
# Build the sample data
# NOTE(review): no random seed is set in the visible cells, so the data differs on every run.
data_row = 1000000
raw_np = np.random.randint(-100, 100, (data_row, 3)) # data_row x 3 array of random integers; randint's upper bound is exclusive, so values fall in -100..99.
raw_df = pd.DataFrame(data=raw_np, columns=['column_1', 'column_2', 'column_3'])  # Wrap the array in a DataFrame.
print('raw_df')
display(raw_df) # Show raw_df.

raw_df


Unnamed: 0,column_1,column_2,column_3
0,15,31,-9
1,-6,-84,-99
2,4,88,-97
3,-26,56,-40
4,81,-64,75
...,...,...,...
999995,-66,88,-85
999996,24,-92,-18
999997,-11,-47,-50
999998,-28,-1,-44


# Min-Max Normalization
$$
  x' = \frac{x - \min(x)}{\max(x) - \min(x)}
$$

In [14]:
# Compute the minimum and maximum of column_3 once, up front, so the
# benchmark loops below do not recompute them.
# Series.min()/Series.max() are vectorised, whereas Python's builtin
# min()/max() iterate over every element of the Series one at a time.
# int() keeps the results as plain Python ints for the arithmetic in the loops.
min_column_3 = int(raw_df.column_3.min())
max_column_3 = int(raw_df.column_3.max())

In [15]:
# Prepare the DataFrame that collects every benchmark result for comparison
results_columns = ['手法', '全体の計算時間（s）', '1ループあたりの計算時間（ms）', '条件（loop）', '条件（append）', '条件（other）', '説明']
results_df = pd.DataFrame(data=[], columns=results_columns)

def add_results(_results_df, _method_name, _duration_time, _desc, _loop_num, _condition_loop='-', _condition_append='-', _condition_other='-'):
    """Return a new results table with one benchmark row appended.

    The input frame is not modified; callers must rebind the returned frame.

    Args:
        _results_df: Results table to extend (columns as in ``results_columns``).
        _method_name: Identifier of the benchmarked method.
        _duration_time: Total elapsed time in seconds.
        _desc: Human-readable description of the method.
        _loop_num: Number of loop iterations; used to derive the per-loop time.
        _condition_loop: Label for the looping strategy.
        _condition_append: Label for the container results were appended to.
        _condition_other: Label for any other condition.

    Returns:
        A DataFrame holding the rows of ``_results_df`` followed by the new
        row, with a fresh 0..n-1 index.

    Raises:
        ZeroDivisionError: If ``_loop_num`` is 0.
    """
    row = {
        results_columns[0]: _method_name,
        results_columns[1]: _duration_time,
        results_columns[2]: (_duration_time / _loop_num) * 1000,  # seconds per loop -> milliseconds
        results_columns[3]: _condition_loop,
        results_columns[4]: _condition_append,
        results_columns[5]: _condition_other,
        results_columns[6]: _desc,
    }
    new_row = pd.DataFrame([row], columns=results_columns)

    # Use the frame that was passed in, not the module-level ``results_df``.
    # An empty table is simply replaced by the new row, so that the empty
    # frame's object-dtype columns do not take part in dtype inference.
    if _results_df.empty:
        return new_row

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([_results_df, new_row], ignore_index=True)

In [16]:
# 1. Benchmark: iterate with iterrows and append each result to a numpy array
method_name = '01_iterrows_numpy_array'
desc = 'iterrowsを使った書き方（numpy配列にappend）'
condition_loop = 'iterrows'
condition_append = 'numpy_array'
condition_other = '-'

# Work on a copy of raw_df; the copy is made before the timer starts.
temp_df = raw_df.copy()
start_time = measure_time()

# ===== start of timed section =====
norm_column_3_array = np.array([])
for index, row in temp_df.iterrows():
  norm_column_3 = (row.column_3 - min_column_3) / (max_column_3 - min_column_3)
  # np.append returns a new array rather than growing in place, hence the reassignment.
  norm_column_3_array = np.append(norm_column_3_array, norm_column_3)

temp_df['columns_3_norm'] = norm_column_3_array
# ===== end of timed section =====

# Report the elapsed time and record it in the results table
duration_time = measure_time(start_time)
print(f'{method_name}: {duration_time} sec')
results_df = add_results(results_df, method_name, duration_time, desc, data_row, _condition_loop=condition_loop, _condition_append=condition_append, _condition_other=condition_other)

# Show the computed result, then release the working copy
display(temp_df)
del temp_df

01_iterrows_numpy_array: 1050.1790993213654 sec


Unnamed: 0,column_1,column_2,column_3,columns_3_norm
0,15,31,-9,0.457286
1,-6,-84,-99,0.005025
2,4,88,-97,0.015075
3,-26,56,-40,0.301508
4,81,-64,75,0.879397
...,...,...,...,...
999995,-66,88,-85,0.075377
999996,24,-92,-18,0.412060
999997,-11,-47,-50,0.251256
999998,-28,-1,-44,0.281407


In [17]:
# 2. Benchmark: iterate with iterrows and append each result to a Python list
method_name = '02_iterrows_list'
desc = 'iterrowsを使った書き方（pythonのlistにappend）'
condition_loop = 'iterrows'
condition_append = 'list'
condition_other = '-'

# Work on a copy of raw_df; the copy is made before the timer starts.
temp_df = raw_df.copy()
start_time = measure_time()

# ===== start of timed section =====
norm_column_3_list = []
for index, row in temp_df.iterrows():
  norm_column_3 = (row.column_3 - min_column_3) / (max_column_3 - min_column_3)
  norm_column_3_list.append(norm_column_3)

temp_df['columns_3_norm'] = norm_column_3_list
# ===== end of timed section =====

# Report the elapsed time and record it in the results table
duration_time = measure_time(start_time)
print(f'{method_name}: {duration_time} sec')
results_df = add_results(results_df, method_name, duration_time, desc, data_row, _condition_loop=condition_loop, _condition_append=condition_append, _condition_other=condition_other)

# Show the computed result, then release the working copy
display(temp_df)
del temp_df

02_iterrows_list: 74.43075776100159 sec


Unnamed: 0,column_1,column_2,column_3,columns_3_norm
0,15,31,-9,0.457286
1,-6,-84,-99,0.005025
2,4,88,-97,0.015075
3,-26,56,-40,0.301508
4,81,-64,75,0.879397
...,...,...,...,...
999995,-66,88,-85,0.075377
999996,24,-92,-18,0.412060
999997,-11,-47,-50,0.251256
999998,-28,-1,-44,0.281407


In [18]:
# 3. Benchmark: pull the column out of the DataFrame once, then loop over range()
method_name = '03_pre_store'
desc = '事前にnumpy配列に格納する書き方（pythonのlistにappend）'
condition_loop = 'range'
condition_append = 'list'
condition_other = '-'

# Work on a copy of raw_df; the copy is made before the timer starts.
temp_df = raw_df.copy()
start_time = measure_time()

# ===== start of timed section =====
norm_column_3_list = []
# NOTE: despite the variable name, this is a pandas Series (no .values here),
# so each columns_3_array[index] below is a Series lookup, not a numpy array access.
columns_3_array = temp_df.column_3
for index in range(temp_df.shape[0]):
  temp = columns_3_array[index]
  norm_column_3 = (temp - min_column_3) / (max_column_3 - min_column_3)
  norm_column_3_list.append(norm_column_3)

temp_df['columns_3_norm'] = norm_column_3_list
# ===== end of timed section =====

# Report the elapsed time and record it in the results table
duration_time = measure_time(start_time)
print(f'{method_name}: {duration_time} sec')
results_df = add_results(results_df, method_name, duration_time, desc, data_row, _condition_loop=condition_loop, _condition_append=condition_append, _condition_other=condition_other)

# Show the computed result, then release the working copy
display(temp_df)
del temp_df

03_pre_store: 4.97247314453125 sec


Unnamed: 0,column_1,column_2,column_3,columns_3_norm
0,15,31,-9,0.457286
1,-6,-84,-99,0.005025
2,4,88,-97,0.015075
3,-26,56,-40,0.301508
4,81,-64,75,0.879397
...,...,...,...,...
999995,-66,88,-85,0.075377
999996,24,-92,-18,0.412060
999997,-11,-47,-50,0.251256
999998,-28,-1,-44,0.281407


In [19]:
# 4. Benchmark: same as benchmark 3, but the column is taken via .values before the loop
method_name = '04_pre_store_getattr'
desc = '事前にnumpy配列に格納する書き方（pythonのlistにappend、事前にgetattr）'
condition_loop = 'range'
condition_append = 'list'
condition_other = 'pre_getattr'

# Work on a copy of raw_df; the copy is made before the timer starts.
temp_df = raw_df.copy()
start_time = measure_time()

# ===== start of timed section =====
norm_column_3_list = []
columns_3_array = temp_df.column_3.values # .values added here: the loop indexes the column's underlying array instead of the Series
for index in range(temp_df.shape[0]):
  temp = columns_3_array[index]
  norm_column_3 = (temp - min_column_3) / (max_column_3 - min_column_3)
  norm_column_3_list.append(norm_column_3)

temp_df['columns_3_norm'] = norm_column_3_list
# ===== end of timed section =====

# Report the elapsed time and record it in the results table
duration_time = measure_time(start_time)
print(f'{method_name}: {duration_time} sec')
results_df = add_results(results_df, method_name, duration_time, desc, data_row, _condition_loop=condition_loop, _condition_append=condition_append, _condition_other=condition_other)

# Show the computed result, then release the working copy
display(temp_df)
del temp_df

04_pre_store_getattr: 1.1823489665985107 sec


Unnamed: 0,column_1,column_2,column_3,columns_3_norm
0,15,31,-9,0.457286
1,-6,-84,-99,0.005025
2,4,88,-97,0.015075
3,-26,56,-40,0.301508
4,81,-64,75,0.879397
...,...,...,...,...
999995,-66,88,-85,0.075377
999996,24,-92,-18,0.412060
999997,-11,-47,-50,0.251256
999998,-28,-1,-44,0.281407


In [20]:
# Compare all benchmark results side by side
display(results_df)

Unnamed: 0,手法,全体の計算時間（s）,1ループあたりの計算時間（ms）,条件（loop）,条件（append）,条件（other）,説明
0,01_iterrows_numpy_array,1050.179099,1.050179,iterrows,numpy_array,-,iterrowsを使った書き方（numpy配列にappend）
1,02_iterrows_list,74.430758,0.074431,iterrows,list,-,iterrowsを使った書き方（pythonのlistにappend）
2,03_pre_store,4.972473,0.004972,range,list,-,事前にnumpy配列に格納する書き方（pythonのlistにappend）
3,04_pre_store_getattr,1.182349,0.001182,range,list,pre_getattr,事前にnumpy配列に格納する書き方（pythonのlistにappend、事前にgetattr）
