In [1]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/캡스톤/weather.csv")

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df['season_tag'].unique()

array(['off-season', 'pre-harvest', 'harvest', 'sub_harvest'],
      dtype=object)

In [4]:
df = df.drop(columns=['days_until_harvest'])

In [5]:
df_melted = df.melt(id_vars=["Date", "locationName"], var_name="Indicator", value_name="Value")
df_transformed = df_melted.pivot(index="Date", columns=["locationName", "Indicator"], values="Value").reset_index()

# MultiIndex 컬럼을 단일 컬럼명("지명_지표")으로 변환
df_transformed.columns = ["Date"] + [f"{country}_{indicator}" for country, indicator in df_transformed.columns[1:]]

기후데이터의 결측치 처리

In [6]:
df_transformed.fillna(method='ffill', inplace=True)  # 이전 값으로 대체
df_transformed.fillna(method='bfill', inplace=True)  # 다음 값으로 대체

  df_transformed.fillna(method='ffill', inplace=True)  # 이전 값으로 대체
  df_transformed.fillna(method='ffill', inplace=True)  # 이전 값으로 대체
  df_transformed.fillna(method='bfill', inplace=True)  # 다음 값으로 대체


In [7]:
df_transformed.isna().sum().sum()

np.int64(0)

가장 최근 수확기의 평균 값이 매핑될 수 있도록 함

In [8]:
weather_vars = [col for col in df_transformed.columns if col not in ['Date'] and not col.endswith('season_tag')]

In [9]:
weather_vars

['brazil_varginha_T2M',
 'brazil_carmo_de_minas_T2M',
 'brazil_patrocinio_T2M',
 'ethiopia_limu_T2M',
 'ethiopia_sidamo_T2M',
 'ethiopia_yirgacheffe_T2M',
 'colombia_manizales_T2M',
 'colombia_armenia_T2M',
 'colombia_pereira_T2M',
 'brazil_varginha_WS2M',
 'brazil_carmo_de_minas_WS2M',
 'brazil_patrocinio_WS2M',
 'ethiopia_limu_WS2M',
 'ethiopia_sidamo_WS2M',
 'ethiopia_yirgacheffe_WS2M',
 'colombia_manizales_WS2M',
 'colombia_armenia_WS2M',
 'colombia_pereira_WS2M',
 'brazil_varginha_ALLSKY_SFC_SW_DWN',
 'brazil_carmo_de_minas_ALLSKY_SFC_SW_DWN',
 'brazil_patrocinio_ALLSKY_SFC_SW_DWN',
 'ethiopia_limu_ALLSKY_SFC_SW_DWN',
 'ethiopia_sidamo_ALLSKY_SFC_SW_DWN',
 'ethiopia_yirgacheffe_ALLSKY_SFC_SW_DWN',
 'colombia_manizales_ALLSKY_SFC_SW_DWN',
 'colombia_armenia_ALLSKY_SFC_SW_DWN',
 'colombia_pereira_ALLSKY_SFC_SW_DWN',
 'brazil_varginha_ALLSKY_SFC_UV_INDEX',
 'brazil_carmo_de_minas_ALLSKY_SFC_UV_INDEX',
 'brazil_patrocinio_ALLSKY_SFC_UV_INDEX',
 'ethiopia_limu_ALLSKY_SFC_UV_INDEX',
 'e

In [None]:
season_tag_cols = [col for col in df_transformed.columns if col.endswith('season_tag')]
season_tag_cols

['brazil_varginha_season_tag',
 'brazil_carmo_de_minas_season_tag',
 'brazil_patrocinio_season_tag',
 'ethiopia_limu_season_tag',
 'ethiopia_sidamo_season_tag',
 'ethiopia_yirgacheffe_season_tag',
 'colombia_manizales_season_tag',
 'colombia_armenia_season_tag',
 'colombia_pereira_season_tag']

In [None]:
def find_harvest_blocks(series: pd.Series):
    """연속된 harvest 구간을 블록 단위로 나눔"""
    blocks = []
    in_block = False
    start = None

    for i in range(len(series)):
        if series.iloc[i] == 'harvest':
            if not in_block:
                start = series.index[i]
                in_block = True
        else:
            if in_block:
                end = series.index[i - 1]
                blocks.append((start, end))
                in_block = False

    # 마지막까지 harvest면 마무리
    if in_block:
        end = series.index[-1]
        blocks.append((start, end))
    return blocks

In [None]:
harvest_info = {}

for season_col in season_tag_cols:
    region = season_col.replace('_season_tag', '')
    region_weather_cols = [col for col in weather_vars if col.startswith(region)]

    tag_series = df_transformed.set_index('Date')[season_col]
    blocks = find_harvest_blocks(tag_series)

    for i, (start_date, end_date) in enumerate(blocks):
        block_mask = (df_transformed['Date'] >= start_date) & (df_transformed['Date'] <= end_date)
        block_df_transformed = df_transformed[block_mask]

        for col in region_weather_cols:
            mean_val = block_df_transformed[col].mean()
            key = f"{col}_prev_harvest_mean_{i+1}"
            harvest_info[key] = {
                'mean': mean_val,
                'region': region,
                'variable': col.split(f"{region}_")[1],
                'start_date': start_date,
                'end_date': end_date,
                'original_col': col
            }

In [None]:
from datetime import datetime

def group_harvest_info_by_col(harvest_info: dict):
    grouped = {}
    for key, info in harvest_info.items():
        col = info['original_col']
        if col not in grouped:
            grouped[col] = []
        grouped[col].append(info)

    for col in grouped:
        grouped[col] = sorted(grouped[col], key=lambda x: pd.to_datetime(x['end_date']))
    return grouped

def apply_recent_harvest_mean(df, grouped_info):
    for original_col, blocks in grouped_info.items():
        feature_col = f"{original_col}_harvest_mean"

        def get_recent_mean(current_date):
            current_date = pd.to_datetime(current_date)
            for block in reversed(blocks):  # 가장 최근부터 검사
                if current_date > pd.to_datetime(block['end_date']):
                    return block['mean']
            return None  # 해당 없음 (모든 end_date 이후가 아님)

        df[feature_col] = df['Date'].apply(get_recent_mean)

    return df

In [None]:
grouped = group_harvest_info_by_col(harvest_info)

df_new = apply_recent_harvest_mean(df_transformed, grouped)

  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date'].apply(get_recent_mean)
  df[feature_col] = df['Date

In [None]:
cols = ['Date'] + [col for col in df_new.columns if col.endswith('_mean')]

In [None]:
df_final = df_new[cols]

In [None]:
df_final = df_final.dropna()

In [None]:
len(df_final)

3462

커피 가격 데이터와 merge

In [None]:
data = pd.read_csv("/content/drive/MyDrive/캡스톤/커피가격데이터통합.csv")

In [None]:
data.fillna(method='ffill', inplace=True)  # 이전 값으로 대체
data.fillna(method='bfill', inplace=True)  # 다음 값으로 대체

  data.fillna(method='ffill', inplace=True)  # 이전 값으로 대체
  data.fillna(method='bfill', inplace=True)  # 다음 값으로 대체


In [None]:
final = pd.merge(data, df_final, on='Date', how='left')

In [None]:
final = final.dropna()

In [None]:
final.to_csv('기후평균커피가격통합데이터.csv', index=False)