# Fillter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
filepath = '../../data/raw/df.xlsx'
df = pd.read_excel(filepath)

# 5年ごとにする
df = df[df['years'] % 5 == 0]

# 遠すぎる島を除外
df = df[df['distance_m'] <= 1991]

# 人口変動率を追加
df['population_change_rate'] = df.groupby('island_ids')['populations'].pct_change(fill_method=None) * 100

df['control_group'] = 1 - df['treatment_group']

# 経過年数を追加
df['passage_of_year'] = np.where(df['treatment_group'] == 1, df['years'] - df['year_bridges_opened'], 0)

df.dropna(subset='population_change_rate', inplace=True)

# 同じisland_id内でafter_treatment == 0がないものを除外
# already_treated = df.groupby('island_ids').filter(lambda x: (x['after_treated'] == 0).sum() == 0)
# df = df[~df['island_ids'].isin(already_treated['island_ids'])]

# 観測期間が短い島を除外
missing_1975_islands = df[df['years'] == 1975]['island_ids'].unique()
all_islands = df['island_ids'].unique()
islands_without_1975 = [island for island in all_islands if island not in missing_1975_islands]
print(islands_without_1975)
df = df[~df['island_ids'].isin(islands_without_1975)]

[67, 72, 73, 91, 99, 106, 113, 114, 115, 116, 119, 132, 138, 139, 143, 144, 145, 153, 155, 156, 157, 160, 162, 165, 170, 206, 221, 223, 226]


In [15]:
df.drop(columns=['region_names', 'prefecture_names', 'island_names'], inplace=True)
df = df[['island_ids'] + [col for col in df.columns if col != 'island_ids']]

In [16]:
df

Unnamed: 0,island_ids,years,region_codes,prefecture_codes,populations,treatment_group,treatment_group_mainland,year_bridges_opened,after_treated,year_connect_mainland,after_treated_mainland,income,area_km2,distance_m,population_change_rate,control_group,passage_of_year
131,68,1975,33342,33,46.0,1,1,2015,0,2015,0,,10.14,728.12,-22.033898,0,-40
132,68,1980,33342,33,52.0,1,1,2015,0,2015,0,,10.14,728.12,13.043478,0,-35
133,68,1985,33342,33,53.0,1,1,2015,0,2015,0,8.200295e+06,10.14,728.12,1.923077,0,-30
134,68,1990,33342,33,54.0,1,1,2015,0,2015,0,9.799848e+06,10.14,728.12,1.886792,0,-25
135,68,1995,33342,33,45.0,1,1,2015,0,2015,0,1.244189e+03,10.14,728.12,-16.666667,0,-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3360,218,2000,38203,38,451.0,0,0,0,0,0,0,6.966687e+03,2.76,1090.00,-11.394892,1,0
3365,218,2005,38203,38,473.0,0,0,0,0,0,0,6.264367e+03,2.76,1090.00,4.878049,1,0
3370,218,2010,38203,38,414.0,0,0,0,0,0,0,7.925102e+03,2.76,1090.00,-12.473573,1,0
3375,218,2015,38203,38,344.0,0,0,0,0,0,0,7.234493e+03,2.76,1090.00,-16.908213,1,0


In [17]:
# Cohort
cohort_list = df['year_bridges_opened'].unique()
cohort_list.sort()
cohort_list

array([   0, 1951, 1961, 1967, 1972, 1973, 1975, 1976, 1979, 1983, 1987,
       1988, 1989, 1991, 1992, 1995, 1996, 1999, 2000, 2004, 2011, 2015,
       2016])

In [19]:
def create_cohort(cohort_year):
    df.loc[df['year_bridges_opened'] == cohort_year, f'cohort_{cohort_year}'] = 1
    df.loc[df['year_bridges_opened'] != cohort_year, f'cohort_{cohort_year}'] = 0
    df.drop(columns='cohort_0')
    df.loc[df[f'cohort_{cohort_year}'] == 1, f'years_cohort_{cohort_year}'] = df['passage_of_year'] * df[f'cohort_{cohort_year}']
    df.loc[df[f'cohort_{cohort_year}'] == 0, f'years_cohort_{cohort_year}'] = 0

In [21]:
for cohort_year in cohort_list:
    create_cohort(cohort_year)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 627 entries, 131 to 3380
Data columns (total 63 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   island_ids                627 non-null    int64  
 1   years                     627 non-null    int64  
 2   region_codes              627 non-null    object 
 3   prefecture_codes          627 non-null    int64  
 4   populations               627 non-null    float64
 5   treatment_group           627 non-null    int64  
 6   treatment_group_mainland  627 non-null    int64  
 7   year_bridges_opened       627 non-null    int64  
 8   after_treated             627 non-null    int64  
 9   year_connect_mainland     627 non-null    int64  
 10  after_treated_mainland    627 non-null    int64  
 11  income                    387 non-null    float64
 12  area_km2                  627 non-null    float64
 13  distance_m                627 non-null    float64
 14  population_c

In [23]:
df_filtered = df
export_filepath = '../../data/processed/df_filtered.xlsx'
df_filtered.to_excel(export_filepath, index=False)