# Filter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
filepath = '../../data/raw/df.xlsx'
df = pd.read_excel(filepath)

# Keep only observations at five-year intervals.
df = df[df['years'] % 5 == 0]
# Drop rows whose distance_m exceeds 1991 (islands that are too far away).
# NOTE(review): the threshold 1991 is a magic number -- confirm its origin/unit.
# .copy() detaches the result from the raw frame so the column assignments
# below write to an independent frame rather than to a filtered slice
# (avoids SettingWithCopyWarning / chained-assignment pitfalls).
df = df[df['distance_m'] <= 1991].copy()

# Population change rate (%) versus the island's previous retained observation.
# The change is computed on a year-sorted view so the result does not depend on
# the row order of the spreadsheet; the assignment aligns on the (unique) index,
# so the row order of df itself is left unchanged.
df['population_change_rate'] = (
    df.sort_values('years', kind='stable')
    .groupby('island_ids')['populations']
    .pct_change(fill_method=None)
    * 100
)
# Years elapsed since the bridge opened (negative before the opening);
# fixed at 0 for rows where treatment_group != 1.
df['passage_of_year'] = np.where(df['treatment_group'] == 1, df['years'] - df['year_bridges_opened'], 0)

# Rows without a change rate (e.g. each island's first retained year) are dropped.
df = df.dropna(subset=['population_change_rate'])

# Exclude islands that have no row with after_treated == 0 in the remaining data.
already_treated = df.groupby('island_ids').filter(lambda x: (x['after_treated'] == 0).sum() == 0)
df = df[~df['island_ids'].isin(already_treated['island_ids'])]

# Exclude islands with a short observation window: those lacking a 1975 row.
islands_with_1975 = set(df.loc[df['years'] == 1975, 'island_ids'])
all_islands = df['island_ids'].unique()
# .tolist() yields plain Python scalars, so the printed list reads the same
# regardless of how the installed NumPy version formats its scalar types.
islands_without_1975 = [island for island in all_islands.tolist() if island not in islands_with_1975]
print(islands_without_1975)
# Final .copy() so later cells can add columns to df without copy warnings.
df = df[~df['island_ids'].isin(islands_without_1975)].copy()

[73, 99, 106, 114, 115, 119, 156, 162, 165, 170, 206]


In [13]:
# Display the panel as it stands after the filtering cell above.
df

Unnamed: 0,island_names,years,island_ids,region_codes,region_names,prefecture_codes,populations,treatment_group,treatment_group_mainland,year_bridges_opened,after_treated,year_connect_mainland,after_treated_mainland,income,prefecture_names,area_km2,distance_m,population_change_rate,passage_of_year
131,鹿久居島,1975,68,33342,日生町,33,46.0,1,1,2015,0,2015,0,,岡山県,10.14,728.12,-22.033898,-40
132,鹿久居島,1980,68,33342,日生町,33,52.0,1,1,2015,0,2015,0,,岡山県,10.14,728.12,13.043478,-35
133,鹿久居島,1985,68,33342,日生町,33,53.0,1,1,2015,0,2015,0,8.200295e+06,岡山県,10.14,728.12,1.923077,-30
134,鹿久居島,1990,68,33342,日生町,33,54.0,1,1,2015,0,2015,0,9.799848e+06,岡山県,10.14,728.12,1.886792,-25
135,鹿久居島,1995,68,33342,日生町,33,45.0,1,1,2015,0,2015,0,1.244189e+03,岡山県,10.14,728.12,-16.666667,-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3360,戸島,2000,218,38203,宇和島市,38,451.0,0,0,0,0,0,0,6.966687e+03,愛媛県,2.76,1090.00,-11.394892,0
3365,戸島,2005,218,38203,宇和島市,38,473.0,0,0,0,0,0,0,6.264367e+03,愛媛県,2.76,1090.00,4.878049,0
3370,戸島,2010,218,38203,宇和島市,38,414.0,0,0,0,0,0,0,7.925102e+03,愛媛県,2.76,1090.00,-12.473573,0
3375,戸島,2015,218,38203,宇和島市,38,344.0,0,0,0,0,0,0,7.234493e+03,愛媛県,2.76,1090.00,-16.908213,0


In [14]:
# Cohorts: the distinct values of year_bridges_opened, in ascending order.
# NOTE(review): the recorded output includes 0 -- presumably the marker for
# islands without a bridge; confirm against the data dictionary.
cohort_list = np.sort(df['year_bridges_opened'].unique())
cohort_list

array([   0, 1973, 1976, 1979, 1983, 1987, 1988, 1989, 1991, 1992, 1995,
       1996, 1999, 2000, 2004, 2011, 2015, 2016])

In [15]:
def create_cohort(cohort_year, data=None):
    """Add indicator and event-time columns for one bridge-opening cohort.

    Two float columns are written to the frame in place:

    * ``cohort_<cohort_year>`` -- 1.0 where ``year_bridges_opened`` equals
      ``cohort_year``, 0.0 elsewhere.
    * ``years_cohort_<cohort_year>`` -- ``passage_of_year`` for rows in the
      cohort, 0.0 elsewhere.

    Parameters
    ----------
    cohort_year : int
        Value of ``year_bridges_opened`` that identifies the cohort.
    data : pandas.DataFrame, optional
        Frame to modify; it must contain ``year_bridges_opened`` and
        ``passage_of_year``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = df if data is None else data
    in_cohort = frame['year_bridges_opened'] == cohort_year

    # NOTE(review): the previous version called df.drop(columns='cohort_0')
    # here without keeping the result, so nothing was ever removed and the call
    # raised KeyError whenever 'cohort_0' did not exist yet. It has been removed;
    # if the cohort_0 columns should be excluded, drop them explicitly after
    # all cohorts have been created.

    # Cast to float to keep the dtype the former masked .loc assignments produced.
    frame[f'cohort_{cohort_year}'] = in_cohort.astype(float)
    frame[f'years_cohort_{cohort_year}'] = frame['passage_of_year'].where(in_cohort, 0).astype(float)

In [16]:
# Add the cohort_<year> / years_cohort_<year> columns for every value in
# cohort_list (create_cohort writes them onto the notebook-level df).
for cohort_year in cohort_list:
    create_cohort(cohort_year)

In [17]:
# Display the panel again, now including the cohort columns added above.
df

Unnamed: 0,island_names,years,island_ids,region_codes,region_names,prefecture_codes,populations,treatment_group,treatment_group_mainland,year_bridges_opened,...,cohort_2000,years_cohort_2000,cohort_2004,years_cohort_2004,cohort_2011,years_cohort_2011,cohort_2015,years_cohort_2015,cohort_2016,years_cohort_2016
131,鹿久居島,1975,68,33342,日生町,33,46.0,1,1,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-40.0,0.0,0.0
132,鹿久居島,1980,68,33342,日生町,33,52.0,1,1,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-35.0,0.0,0.0
133,鹿久居島,1985,68,33342,日生町,33,53.0,1,1,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-30.0,0.0,0.0
134,鹿久居島,1990,68,33342,日生町,33,54.0,1,1,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-25.0,0.0,0.0
135,鹿久居島,1995,68,33342,日生町,33,45.0,1,1,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-20.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3360,戸島,2000,218,38203,宇和島市,38,451.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3365,戸島,2005,218,38203,宇和島市,38,473.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3370,戸島,2010,218,38203,宇和島市,38,414.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3375,戸島,2015,218,38203,宇和島市,38,344.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Write the processed panel to the processed-data folder.
export_filepath = '../../data/processed/df_filtered.xlsx'
# df_filtered is another name for df (no copy is made).
df_filtered = df
# index=False keeps the DataFrame index out of the spreadsheet.
df_filtered.to_excel(excel_writer=export_filepath, index=False)