# Filter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrix

In [2]:
# Read the raw workbook into the working frame `df`.
filepath = '../../data/raw/df.xlsx'
df = pd.read_excel(io=filepath)

In [3]:
# Filter the raw frame `df` down to the analysis sample and add derived columns.
# Columns added, in order: log_pop, is_already_treated, time_since_treatment,
# log_pop_diff, log_pop_diff_cumsum, pop_rate_of_change.

# Thresholds used by the row filters below.
MAX_DISTANCE_M = 1991  # upper bound on distance_m; islands farther away are dropped
YEAR_STEP = 5  # keep only years that are multiples of this value

# Drop rows that have no population value.
df = df.dropna(subset=['population'])

# Exclude islands that are too far away.
df = df[df['distance_m'] <= MAX_DISTANCE_M]

# Keep one observation every five years. The explicit copy makes `df` an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df[df['year'] % YEAR_STEP == 0].copy()

# Natural log of population.
df['log_pop'] = np.log(df['population'])

# Islands that are already treated in every retained row, i.e. islands with
# no row where after_treatment == 0. They are flagged and then removed.
pre_treatment_islands = df.loc[df['after_treatment'] == 0, 'island_ids'].unique()
no_pre_period = ~df['island_ids'].isin(pre_treatment_islands)
already_treated = df.loc[no_pre_period, 'island_ids'].unique()
df['is_already_treated'] = np.where(no_pre_period, 1, 0)
df = df[df['is_already_treated'] == 0].copy()

# Disabled draft, kept for reference only: log population adjusted for
# island and year fixed effects (two-way demeaning). Not part of the output.
# mean_log_pop_per_island = df.groupby('island_ids')['log_pop'].transform('mean')
# mean_log_pop_per_year = df.groupby('year')['log_pop'].transform('mean')
# mean_log_pop = df['log_pop'].mean()
# df['log_pop_star'] = df['log_pop'] - mean_log_pop_per_island - mean_log_pop_per_year + mean_log_pop

# Years elapsed since the bridge year for the treatment group (negative before
# the bridge year); 0 for every other row.
df['time_since_treatment'] = np.where(df['treatment_group'] == 1, df['year'] - df['bridge_year'], 0)

# Rate of change. Differences are taken in chronological order within each
# island regardless of the current row order. Results are written back by
# index label, so the row order of `df` is unchanged; this relies on the index
# labels being unique, which holds for the default index from read_excel.
chronological = df.sort_values(['island_ids', 'year'])
log_pop_diff = chronological.groupby('island_ids')['log_pop'].diff().fillna(0)
df['log_pop_diff'] = log_pop_diff
df['log_pop_diff_cumsum'] = log_pop_diff.groupby(chronological['island_ids']).cumsum()
# Percent change relative to each island's first retained observation.
df['pop_rate_of_change'] = (np.exp(df['log_pop_diff_cumsum']) - 1) * 100

In [4]:
# Print the column names, non-null counts and dtypes of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 794 entries, 124 to 3419
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   island_names              794 non-null    object 
 1   year                      794 non-null    int64  
 2   island_ids                794 non-null    int64  
 3   region_codes              794 non-null    object 
 4   region_names              794 non-null    object 
 5   prefecture_codes          794 non-null    int64  
 6   population                794 non-null    float64
 7   treatment_group           794 non-null    int64  
 8   treatment_group_mainland  794 non-null    int64  
 9   bridge_year               794 non-null    int64  
 10  after_treatment           794 non-null    int64  
 11  connect_mainland_year     794 non-null    int64  
 12  after_treatment_mainland  794 non-null    int64  
 13  income                    484 non-null    float64
 14  prefecture_n

## 保存

In [5]:
# Write the filtered frame to the processed-data workbook, without the index column.
export_filepath = '../../data/processed/df_filtered.xlsx'
df_filtered = df
df_filtered.to_excel(export_filepath, index=False)