# Filter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrix

In [2]:
# Read the raw workbook into `df`; the path is relative to this notebook.
filepath = '../../data/raw/df.xlsx'
df = pd.read_excel(io=filepath)

In [3]:
# --- Sample restriction: distance -------------------------------------------
# Exclude islands that are too far away.
# NOTE(review): 1991 is the cutoff used by the original cell; confirm that it
# is really meant as a distance in metres.
MAX_DISTANCE_M = 1991
# .copy() detaches the subset from the raw frame, so the column assignments
# below write to an independent DataFrame instead of a slice
# (avoids SettingWithCopyWarning / ambiguous chained assignment).
df = df.loc[df['distance_m'] <= MAX_DISTANCE_M].copy()

# Optional (disabled): keep only every fifth year.
# df = df.query('year % 5 == 0')

# --- Log population ---------------------------------------------------------
# Non-positive populations are masked to NaN before taking the log:
# log(0) = -inf would otherwise turn every island / year / overall mean it
# enters into -inf and poison log_pop_star. Missing populations stay NaN.
df['log_pop'] = np.log(df['population'].where(df['population'] > 0))

# --- Islands that are already treated in every observed row ------------------
# An island counts as "already treated" when none of its rows has
# after_treatment == 0. Counted per island without a Python-level lambda.
n_untreated_obs = df['after_treatment'].eq(0).groupby(df['island_ids']).sum()
already_treated = n_untreated_obs.index[n_untreated_obs == 0].to_numpy()
df['is_already_treated'] = df['island_ids'].isin(already_treated).astype(int)
# Keep only islands with at least one untreated observation. The flag column
# is retained (now constant 0) so the output schema is unchanged.
df = df.loc[df['is_already_treated'] == 0].copy()

# --- Log population net of island and year means -----------------------------
# log_pop_star = log_pop - island mean - year mean + overall mean
# NOTE(review): log_pop has missing values, so the panel is unbalanced; this
# one-pass formula may only approximate the two-way within transformation in
# that case. Confirm this is acceptable for the downstream model.
mean_log_pop_per_island = df.groupby('island_ids')['log_pop'].transform('mean')
mean_log_pop_per_year = df.groupby('year')['log_pop'].transform('mean')
mean_log_pop = df['log_pop'].mean()
df['log_pop_star'] = (
    df['log_pop']
    .sub(mean_log_pop_per_island)
    .sub(mean_log_pop_per_year)
    .add(mean_log_pop)
)

# --- Years elapsed relative to the bridge year -------------------------------
# year - bridge_year for rows with treatment_group == 1, otherwise 0.
df['time_since_treatment'] = (df['year'] - df['bridge_year']).where(df['treatment_group'] == 1, 0)

# Optional (disabled): exclude islands that have no observation for 1975.
# islands_with_1975 = df.loc[df['year'] == 1975, 'island_ids'].unique()
# df = df[df['island_ids'].isin(islands_with_1975)]

In [4]:
# 地域名，島名，県名，疑似介入年を削除
# Remove the region / prefecture / island name columns and the pseudo
# treatment-year columns.
# The result is reassigned instead of using inplace=True, and columns that are
# already absent are ignored, so re-running this cell does not raise KeyError.
columns_to_drop = ['region_names', 'prefecture_names', 'island_names', 'pseudo_year', 'pseudo_bridge_year']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Put island_ids first and keep the remaining columns in their current order.
remaining_columns = [col for col in df.columns if col != 'island_ids']
df = df[['island_ids'] + remaining_columns]

In [5]:
# Print a summary of the prepared frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1713 entries, 119 to 3419
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   island_ids                1713 non-null   int64  
 1   year                      1713 non-null   int64  
 2   region_codes              1713 non-null   object 
 3   prefecture_codes          1713 non-null   int64  
 4   population                1615 non-null   float64
 5   treatment_group           1713 non-null   int64  
 6   treatment_group_mainland  1713 non-null   int64  
 7   bridge_year               1713 non-null   int64  
 8   after_treatment           1713 non-null   int64  
 9   connect_mainland_year     1713 non-null   int64  
 10  after_treatment_mainland  1713 non-null   int64  
 11  income                    1317 non-null   float64
 12  area_km2                  1713 non-null   float64
 13  distance_m                1713 non-null   float64
 14  log_pop    

## 保存

In [6]:
# Write the prepared frame to the processed-data workbook, omitting the row index.
# `df_filtered` is another name for the same object as `df`, not a copy.
export_filepath = '../../data/processed/df_filtered.xlsx'
df_filtered = df
df_filtered.to_excel(excel_writer=export_filepath, index=False)