In [None]:
import pandas as pd
import sqlite3
import numpy as np

# Path to the source CSV file
csv_file_path = 'C:/Users/82102/Desktop/rain/pyrain/data/rainfall_train1.csv'
# Path to the SQLite database file
db_file_path = 'C:/Users/82102/Desktop/rain/pyrain/data/qwer.db'
# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Treat the sentinel value -999 as a missing value (NaN)
df['rainfall_train.class_interval'] = df['rainfall_train.class_interval'].replace(-999, np.nan)

# Fill the gaps by linear interpolation and round to whole class labels.
# limit_direction='both' also fills a run of missing values at the very start
# of the column, which the default forward-only direction would leave as NaN.
df['rainfall_train.class_interval'] = (
    df['rainfall_train.class_interval']
    .interpolate(method='linear', limit_direction='both')
    .round()
)

# Check the class distribution after imputation
class_interval_counts_after = df['rainfall_train.class_interval'].value_counts().sort_index()
print("\n결측치 처리 후 class_interval의 종류와 각 종류의 개수:")
print(class_interval_counts_after)

# Store the processed data in the database; always close the connection,
# even when the write fails.
conn = sqlite3.connect(db_file_path)
try:
    df.to_sql('qwert', conn, if_exists='replace', index=False)
finally:
    conn.close()

# Print the per-class counts
print("\n결측치 처리 후 종류별 class_interval의 개수:")
print(class_interval_counts_after)

# Convert the per-class counts to a DataFrame and print it
class_interval_counts_df = class_interval_counts_after.reset_index()
class_interval_counts_df.columns = ['class_interval', 'count']
print("\nclass_interval 종류와 개수를 데이터프레임 형태로 출력:")
print(class_interval_counts_df)


print("\n결측치가 성공적으로 처리되고 데이터베이스에 저장되었습니다.")


In [None]:
기상 데이터에서 결측치는 모델의 패턴 학습에 큰 영향을 미친다. 그러나 결측치가 있는 행을 삭제하면 정보 손실과 패턴 손상이 발생하므로, 이를 최소화하기 위해 결측치(-999)를 NaN으로 대체한 후 선형 보간법으로 결측치를 채웠다.

In [None]:
import numpy as np
import pandas as pd
import sqlite3

# Data loading helper
def load_data(db_path, table_name):
    """Read every row of a SQLite table into a DataFrame.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Name of the table to read. It is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.

    Returns:
        A pandas DataFrame holding the result of ``SELECT *`` on the table.

    Raises:
        Whatever ``pd.read_sql`` raises, e.g. when the table does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

# Derived-feature construction
def create_features(data):
    """Strip the ``rainfall_train.`` prefix and add date/time derived columns.

    Args:
        data: DataFrame containing the ``rainfall_train.*`` columns listed
            below (at least the day, hour and month columns are required).

    Returns:
        A renamed copy of ``data`` with seven extra columns: day, hour and
        month differences (ef minus fc), fc/ef day and hour ratios (0 where
        the denominator is 0), and the sine/cosine of ``ef_hour`` on a
        24-hour cycle.
    """
    prefix = 'rainfall_train.'
    short_names = (
        'ef_month', 'fc_hour', 'ef_hour', 'fc_day', 'ef_day', 'fc_month',
        'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09',
        'dh',
    )
    data = data.rename(columns={prefix + name: name for name in short_names})

    ef_day = data['ef_day']
    fc_day = data['fc_day']
    ef_hour = data['ef_hour']
    fc_hour = data['fc_hour']
    ef_month = data['ef_month']
    fc_month = data['fc_month']

    # Angle of the effective hour on a 24-hour circle
    hour_angle = 2 * np.pi * ef_hour / 24

    data['fc_ef_day_diff'] = ef_day - fc_day
    data['ef_hour_sin'] = np.sin(hour_angle)
    data['ef_hour_cos'] = np.cos(hour_angle)
    # Ratios fall back to 0 when the denominator is 0
    data['fc_ef_day_ratio'] = np.where(ef_day != 0, fc_day / ef_day, 0)
    data['fc_ef_hour_diff'] = ef_hour - fc_hour
    data['fc_ef_hour_ratio'] = np.where(ef_hour != 0, fc_hour / ef_hour, 0)
    data['fc_ef_month_diff'] = ef_month - fc_month
    return data

# Data saving helper
def save_data_to_db(data, db_path, table_name):
    """Write a DataFrame to a SQLite table, replacing any existing table.

    Args:
        data: DataFrame to store (the index is not written).
        db_path: Path of the SQLite database file.
        table_name: Name of the destination table.

    Raises:
        Whatever ``DataFrame.to_sql`` raises when the write fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        data.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Release the connection even when the write fails.
        conn.close()

# Entry point
def main():
    """Load the 'qwert' table, add derived features and write it back.

    The table is read from and written to the same hard-coded SQLite file,
    so the stored table is replaced by its feature-enriched version.
    """
    db_path = 'C:/Users/82102/Desktop/rain/pyrain/data/qwer.db'
    table_name = 'qwert'

    # Load
    print("Loading data...")
    raw_frame = load_data(db_path, table_name)

    # Derive features
    print("Creating features...")
    enriched_frame = create_features(raw_frame)

    # Persist
    print("Saving data to database...")
    save_data_to_db(enriched_frame, db_path, table_name)
    print("Data saved successfully.")


if __name__ == "__main__":
    main()


In [None]:
이후에 사용할 test 코드와 변수명을 통일하기 위해 rainfall_train 접두사를 모두 제거하고 파생변수를 생성하였다.
파생변수에 대해서는 추후에 설명하도록 한다.

In [None]:
import numpy as np
import pandas as pd
import sqlite3
import pickle
import os
from sklearn.preprocessing import StandardScaler

# Look up the column names and the number of columns of a database table
def get_column_names_and_count(db_path, table_name):
    """Return the column names of a SQLite table and how many there are.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Name of the table to inspect (interpolated into the SQL
            text, so it must be a trusted identifier).

    Returns:
        ``(column_names, column_count)`` on success, or ``(None, None)`` if
        the database cannot be opened or the query fails (the error is
        printed).
    """
    print("Retrieving column names and count from the database.")
    # Start unbound so the cleanup below is safe when connect() itself fails.
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        query = f"SELECT * FROM {table_name} LIMIT 1"
        data = pd.read_sql(query, conn)
        column_names = data.columns.tolist()
        column_count = len(column_names)
    except Exception as e:
        print(f"Error retrieving column names and count: {e}")
        return None, None
    finally:
        if conn is not None:
            conn.close()
    print("Successfully retrieved column names and count.")
    return column_names, column_count

# Segment probability calculation
def calculate_segment_probabilities(data):
    """Turn the ``v01``..``v09`` columns into per-segment values.

    ``segment_prob_k`` is ``v0k - v0(k+1)`` for k = 1..8 and
    ``segment_prob_9`` is ``v09`` itself. ``segment_prob_zero`` is 100 minus
    the sum of the nine segment columns.
    (Presumably v01..v09 are cumulative probabilities in percent — confirm.)

    Args:
        data: DataFrame with columns ``v01`` through ``v09``.

    Returns:
        A new DataFrame with ``segment_prob_1``..``segment_prob_9`` and
        ``segment_prob_zero``, indexed like ``data``.
    """
    value_cols = [f'v{i:02d}' for i in range(1, 10)]
    segment_cols = [f'segment_prob_{i}' for i in range(1, 10)]

    segments = {}
    # Adjacent differences for segments 1..8
    for seg_name, upper, lower in zip(segment_cols[:-1], value_cols[:-1], value_cols[1:]):
        segments[seg_name] = data[upper] - data[lower]
    # The last segment is the final value column itself
    segments[segment_cols[-1]] = data[value_cols[-1]]

    segment_prob_df = pd.DataFrame(segments)

    # Remainder: 100 minus everything assigned to segments 1..9
    assigned_total = segment_prob_df[segment_cols].sum(axis=1)
    segment_prob_df['segment_prob_zero'] = 100 - assigned_total

    return segment_prob_df

# Monthly average of the summed segment probabilities
def calculate_monthly_avg_rainfall_and_trend(data):
    """Append segment probabilities, their row sum and a per-month mean.

    Despite the name, no trend variable is computed here.

    Args:
        data: DataFrame with ``v01``..``v09`` and ``ef_month`` columns.

    Returns:
        A new DataFrame: ``data`` plus the segment probability columns,
        ``sum_segment_probs`` (row sum of segments 1..9) and
        ``monthly_avg_rainfall_prob`` (mean of that sum within each
        ``ef_month`` group).
    """
    segment_cols = [f'segment_prob_{i}' for i in range(1, 10)]

    # Attach the segment probability columns side by side
    data = pd.concat([data, calculate_segment_probabilities(data)], axis=1)

    # Row-wise total over segments 1..9
    data['sum_segment_probs'] = data[segment_cols].sum(axis=1)

    # Broadcast each month's mean back onto its rows
    per_month = data.groupby('ef_month')['sum_segment_probs']
    data['monthly_avg_rainfall_prob'] = per_month.transform('mean')

    return data

# Binary indicator features for several thresholds
def create_multithreshold_binary_features(data, thresholds):
    """Add one 0/1 column per threshold, in place.

    Column ``is_rain_<t>`` is 1 where ``sum_segment_probs >= t`` and 0
    otherwise.

    Args:
        data: DataFrame with a ``sum_segment_probs`` column; modified in place.
        thresholds: Iterable of threshold values.

    Returns:
        The same ``data`` object.
    """
    for cutoff in thresholds:
        reaches_cutoff = data['sum_segment_probs'].ge(cutoff)
        data[f'is_rain_{cutoff}'] = reaches_cutoff.astype(int)
    return data

# Count-based rain score and its reciprocal
def create_rain_probability_feature(data, thresholds):
    """Add ``rain_probability`` and ``rain_probability_inverse``, in place.

    ``rain_probability`` is the number of ``is_rain_<t>`` indicators that are
    set for the row; ``rain_probability_inverse`` is ``1 / rain_probability``
    (infinite where the count is 0).

    Args:
        data: DataFrame holding an ``is_rain_<t>`` column for every threshold.
        thresholds: The thresholds whose indicator columns are summed.

    Returns:
        The same ``data`` object.
    """
    indicator_cols = [f'is_rain_{threshold}' for threshold in thresholds]
    # Number of thresholds the row reaches
    data['rain_probability'] = data[indicator_cols].sum(axis=1)
    # Reciprocal of that count
    data['rain_probability_inverse'] = 1 / data['rain_probability']
    return data

def create_rain_nonrain_diff(data):
    """Add the ``rain_nonrain_diff`` column, in place.

    With ``diff = segment_prob_zero - sum_segment_probs`` the new value is:

    * ``diff`` where ``diff > 0``
    * ``1 / abs(diff)`` where ``diff < 0``
    * ``1`` otherwise (``diff == 0`` or ``diff`` is NaN)

    The computation is vectorized instead of a row-wise ``apply``, which
    gives the same values and also works on an empty frame.

    Args:
        data: DataFrame with ``segment_prob_zero`` and ``sum_segment_probs``
            columns; modified in place.

    Returns:
        The same ``data`` object.
    """
    diff = (data['segment_prob_zero'] - data['sum_segment_probs']).to_numpy(dtype=float)
    # 1/|diff| is evaluated for every row (including diff == 0) but only
    # selected where diff < 0, so the division warnings are irrelevant.
    with np.errstate(divide='ignore', invalid='ignore'):
        inverse_magnitude = 1.0 / np.abs(diff)
        data['rain_nonrain_diff'] = np.select(
            [diff > 0, diff < 0],
            [diff, inverse_magnitude],
            default=1.0,
        )
    return data

def create_combined_metric(data):
    """Add ``rain_combined_metric``, in place.

    The value is the product of ``rain_probability_inverse`` and
    ``rain_nonrain_diff``.

    Args:
        data: DataFrame with both input columns; modified in place.

    Returns:
        The same ``data`` object.
    """
    inverse_score = data['rain_probability_inverse']
    margin = data['rain_nonrain_diff']
    data['rain_combined_metric'] = inverse_score * margin
    return data

# Feature columns selected as model inputs (also used for the VIF check)
important_columns = [ 
    'segment_prob_1','segment_prob_2','segment_prob_3',
    'segment_prob_4','segment_prob_5','segment_prob_6',
    'segment_prob_7','segment_prob_8','segment_prob_9',
     'ef_hour_sin', 'ef_hour_cos', 'ef_hour',
    'ef_month', 'monthly_avg_rainfall_prob', 
    'rain_combined_metric','fc_ef_day_diff','fc_ef_hour_diff',
    'hourly_rain_prob_change_rate'  # 'is_rainy_season' is currently left out of the list
]

# Data loading and preprocessing
def load_and_preprocess_data(db_path, table_name, column_names, thresholds, feature_names_path=None):
    """Load a table, build the derived features and return scaled inputs.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Table to read (interpolated into the SQL text, so it must
            be a trusted identifier).
        column_names: Column names assigned to the loaded frame; the length
            must match the number of columns in the table.
        thresholds: Thresholds used for the ``is_rain_<t>`` indicators.
        feature_names_path: Optional file path; when given, the list of
            feature names is pickled there.

    Returns:
        ``(X_scaled, y, data)`` where ``X_scaled`` holds the standardized
        ``important_columns``, ``y`` is ``rainfall_train.class_interval`` and
        ``data`` is the full frame with all derived columns. Returns
        ``(None, None, None)`` when the table cannot be read or is empty.
        The fitted scaler itself is neither returned nor saved.
    """
    print("Loading and preprocessing data from the database.")
    # Start unbound so the cleanup below is safe when connect() itself fails.
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        query = f"SELECT * FROM {table_name}"
        data = pd.read_sql(query, conn)
    except Exception as e:
        print(f"Error loading data from database: {e}")
        return None, None, None
    finally:
        if conn is not None:
            conn.close()

    # Make sure something was actually loaded
    if data is None or data.empty:
        print("No data loaded from the database.")
        return None, None, None

    # Apply the supplied column names
    data.columns = column_names
    print(f"Data shape after loading: {data.shape}")

    # Map the year labels A/B/C to 1/2/3 (any other label becomes NaN)
    year_mapping = {'A': 1, 'B': 2, 'C': 3}
    data['rainfall_train.fc_year'] = data['rainfall_train.fc_year'].map(year_mapping)
    data['rainfall_train.ef_year'] = data['rainfall_train.ef_year'].map(year_mapping)
    print(f"Data shape after year mapping: {data.shape}")

    # Segment probabilities, their sum and the monthly mean of that sum
    data = calculate_monthly_avg_rainfall_and_trend(data)

    # One binary indicator per threshold
    data = create_multithreshold_binary_features(data, thresholds)

    # Count of indicators set per row, and its reciprocal
    data = create_rain_probability_feature(data, thresholds)

    # Difference between the "zero" segment and the summed segments
    data = create_rain_nonrain_diff(data)

    # Product of the reciprocal count and the difference measure
    data = create_combined_metric(data)

    # Mean rain score for each effective hour
    data['hourly_rain_prob'] = data.groupby('ef_hour')['rain_probability'].transform('mean')

    # Row-to-row change of that hourly mean (depends on the row order)
    data['hourly_rain_prob_change'] = data['hourly_rain_prob'].diff().fillna(0)

    # Change relative to the previous row's hourly mean
    data['hourly_rain_prob_change_rate'] = data['hourly_rain_prob_change'] / data['hourly_rain_prob'].shift(1).fillna(1)
    # Rainy-season flag for months 6-9 (computed but not part of important_columns)
    data['is_rainy_season'] = np.where(data['ef_month'].isin([6, 7, 8, 9]), 1, 0)

    X = data[important_columns]
    y = data['rainfall_train.class_interval']  # target variable
    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

    # Standardize the selected features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    print("Data loading and preprocessing completed.")

    # Optionally persist the feature names
    if feature_names_path:
        feature_dir = os.path.dirname(feature_names_path)
        # A bare file name has no directory part; os.makedirs('') would fail.
        if feature_dir:
            os.makedirs(feature_dir, exist_ok=True)
        with open(feature_names_path, 'wb') as f:
            pickle.dump(important_columns, f)

    return X_scaled, y, data

# Database path and table name
db_path = 'C:/Users/82102/Desktop/rain/pyrain/data/qwer.db'
table_name = 'qwert'

# Fetch the column names and their count
column_names, column_count = get_column_names_and_count(db_path, table_name)
if column_names:
    # Thresholds for the binary indicator features
    thresholds = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85]

    # Load and preprocess the data
    X, y, data = load_and_preprocess_data(db_path, table_name, column_names, thresholds)

    # The loader returns (None, None, None) on failure; do not touch .shape then.
    if X is None:
        print("Failed to load and preprocess data.")
    else:
        print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
else:
    print("Failed to retrieve column names and count.")


In [None]:
전처리 과정에서 필요하다고 생각한 여러 가지 파생변수를 생성하였다. 이후 다중 임계값을 적용한 이진 변수를 만들고, 선택한 변수들에 정규화를 진행하였다.

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def calculate_vif(data):
    """Compute the variance inflation factor of every column.

    A constant column is added with ``add_constant`` before the calculation,
    so the result contains one row per column of the augmented frame.

    Args:
        data: DataFrame of numeric feature columns.

    Returns:
        A DataFrame with the columns ``VIF Factor`` and ``features``.
    """
    print("Adding constant term to the data for VIF calculation.")
    # Add the constant term to the frame
    design = add_constant(data)

    print("Calculating VIF for each feature.")
    # One VIF per column of the augmented design matrix
    factors = []
    for column_index in range(design.shape[1]):
        factors.append(variance_inflation_factor(design.values, column_index))
    vif = pd.DataFrame({"VIF Factor": factors, "features": design.columns})

    print("VIF calculation completed.")
    return vif

# `data` is the DataFrame created by the preprocessing code earlier in the notebook
print("Extracting necessary data for VIF calculation.")
data_for_vif = data[important_columns]

print("Starting VIF calculation process.")
vif_data = calculate_vif(data_for_vif)

# Heading and table, one per line
print("VIF Results:", vif_data, sep="\n")


In [None]:
VIF 분석을 통해 다중공선성 위험이 있는 변수, 즉 VIF 점수가 5 이상인 변수들을 제거하였다.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np
import pickle

# Evaluation helper
def evaluate_model(y_true, y_pred):
    """Compute the classification metrics used throughout the notebook.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.

    Returns:
        A 10-tuple ``(accuracy, precision, recall, f1, balanced_accuracy,
        conf_matrix, csi, hits, false_alarms, misses)``. Precision, recall
        and F1 are macro-averaged; the last four values come from
        ``calculate_csi``.
    """
    macro_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = [
        scorer(y_true, y_pred, average='macro') for scorer in macro_scorers
    ]
    csi_parts = calculate_csi(y_true, y_pred)

    return (
        accuracy_score(y_true, y_pred),
        precision,
        recall,
        f1,
        balanced_accuracy_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
        *csi_parts,
    )

# CSI (critical success index) calculation
def calculate_csi(y_true, y_pred):
    """Compute the critical success index over class labels, with 0 as "no event".

    Counting rules:

    * hit: the prediction equals the true class and that class is not 0
    * miss: the true class is not 0 but 0 was predicted
    * false alarm: a non-zero class was predicted and it is wrong, either
      because the true class is 0 or because it is a different non-zero class
    * correct 0 predictions are not counted at all

    Earlier, a non-zero prediction for a true class of 0 was not counted
    anywhere, which inflated the score.

    Args:
        y_true: Iterable of true class labels.
        y_pred: Iterable of predicted class labels, paired with ``y_true``.

    Returns:
        ``(csi, hits, false_alarms, misses)`` with
        ``csi = hits / (hits + false_alarms + misses)``, or 0 when that
        denominator is 0.
    """
    hits = 0
    false_alarms = 0
    misses = 0

    for true, pred in zip(y_true, y_pred):
        if true == pred:
            if true != 0:
                hits += 1
        elif pred == 0:
            # true differs from pred and pred is 0, so an event was missed
            misses += 1
        else:
            # A non-zero class was predicted and it is wrong
            false_alarms += 1

    total = hits + false_alarms + misses
    csi = hits / total if total > 0 else 0
    return csi, hits, false_alarms, misses

# Split the data 80/20 into training and test sets with a fixed seed.
# NOTE(review): no `stratify` argument is passed, so the class proportions of
# the two splits are not forced to match — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-sample weight assignment
def apply_class_weights(y_true, weight_0_to_others, weight_others_to_0, weight_others_to_others):
    """Build a sample-weight array from the true class labels.

    Samples whose true class is 0 get ``weight_0_to_others``; every other
    sample gets ``weight_others_to_0``.

    ``weight_others_to_others`` is accepted for interface compatibility but
    has no effect: the weights depend only on the true label, so a
    non-zero sample cannot be split by what it will be predicted as. The
    original inner conditional that referenced it was unreachable.

    Args:
        y_true: Array-like of true class labels.
        weight_0_to_others: Weight for samples of class 0.
        weight_others_to_0: Weight for samples of any other class.
        weight_others_to_others: Unused (see above).

    Returns:
        A float NumPy array with one weight per sample.
    """
    labels = np.asarray(y_true)
    return np.where(labels == 0, float(weight_0_to_others), float(weight_others_to_0))

# Fixed hyperparameters shared by every ExtraTreesClassifier built below
n_estimators = 253
max_depth = 39
min_samples_split = 31
min_samples_leaf = 2  # Assuming a typical value for min_samples_leaf

# F1 score gated by minimum precision and recall
def evaluate_with_criteria(y_true, y_pred, min_precision=0.2, min_recall=0.2):
    """Return macro F1 if precision and recall both reach a minimum, else 0.

    The original gate compared precision and recall against 2. Both scores
    lie in [0, 1], so the gate never opened and the function always
    returned 0, leaving the optimizer with a constant objective.

    NOTE(review): the defaults of 0.2 assume the literal 2 was meant on a
    0-1 scale — confirm the intended thresholds.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        min_precision: Minimum macro precision required, on a 0-1 scale.
        min_recall: Minimum macro recall required, on a 0-1 scale.

    Returns:
        The macro F1 score when both minimums are met, otherwise 0.
    """
    _, precision, recall, f1, *_ = evaluate_model(y_true, y_pred)
    if precision >= min_precision and recall >= min_recall:
        return f1
    return 0

# Objective function for the weight search
def optimize_weights(weight_0_to_others, weight_others_to_0, weight_others_to_others):
    """Train one ExtraTrees model with the given weights and score it.

    The model uses the module-level fixed hyperparameters, is fitted on
    ``X_train``/``y_train`` with per-sample weights from
    ``apply_class_weights``, and is scored on ``X_test``/``y_test``. The same
    test split is used for the final report, so those final metrics are not
    independent of this search.

    Args:
        weight_0_to_others: Candidate weight passed to ``apply_class_weights``.
        weight_others_to_0: Candidate weight passed to ``apply_class_weights``.
        weight_others_to_others: Candidate weight passed to ``apply_class_weights``.

    Returns:
        The value of ``evaluate_with_criteria`` on the test predictions.
    """
    train_weights = apply_class_weights(
        y_train, weight_0_to_others, weight_others_to_0, weight_others_to_others
    )

    forest_settings = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    candidate = ExtraTreesClassifier(**forest_settings)
    candidate.fit(X_train, y_train, sample_weight=train_weights)

    return evaluate_with_criteria(y_test, candidate.predict(X_test))

# Bayesian optimisation over the three weight parameters.
# Search bounds (lower, upper) for each weight:
pbounds_weights = {
    'weight_0_to_others': (1, 6),
    'weight_others_to_0': (9, 41),
    'weight_others_to_others': (9, 31)
}

# The optimizer maximises the value returned by optimize_weights
optimizer_weights = BayesianOptimization(
    f=optimize_weights,
    pbounds=pbounds_weights,
    random_state=42,
    verbose=2
)

# Run the optimisation: 5 initial points followed by 20 iterations
optimizer_weights.maximize(
    init_points=5,
    n_iter=20
)

# Best weights found
best_weights = optimizer_weights.max['params']
print(f"Best Weights: {best_weights}")

# Refit the model on the training split using the best weights
sample_weights = apply_class_weights(
    y_train,
    best_weights['weight_0_to_others'],
    best_weights['weight_others_to_0'],
    best_weights['weight_others_to_others'],
)

et_model_best = ExtraTreesClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=42,
)
et_model_best.fit(X_train, y_train, sample_weight=sample_weights)

# Predictions on both splits
y_pred_train_best = et_model_best.predict(X_train)
y_pred_test_best = et_model_best.predict(X_test)

# Metrics on both splits
train_results_best = evaluate_model(y_train, y_pred_train_best)
test_results_best = evaluate_model(y_test, y_pred_test_best)

# Print the same report for the train and the test results
for _heading, _results in (
    ("Train Results with Best Weights:", train_results_best),
    ("\nTest Results with Best Weights:", test_results_best),
):
    print(_heading)
    print(f"Accuracy: {_results[0]:.4f}")
    print(f"Precision: {_results[1]:.4f}")
    print(f"Recall: {_results[2]:.4f}")
    print(f"F1 Score: {_results[3]:.4f}")
    print(f"Balanced Accuracy: {_results[4]:.4f}")
    print(f"CSI: {_results[6]:.4f}")
    print(f"Confusion Matrix:\n{_results[5]}")

# Feature importances, largest first
importance_best = et_model_best.feature_importances_
importance_df_best = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importance_best}
)
importance_df_best = importance_df_best.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(importance_df_best)

# Save the fitted model
model_save_path_best = 'C:/Users/82102/Desktop/rain/pyrain/models/best_extra_trees_model_weights0.pkl'
with open(model_save_path_best, 'wb') as f:
    pickle.dump(et_model_best, f)
print(f"Best model saved to {model_save_path_best}")


In [None]:
데이터를 8:2로 분리하였다.
베이지안 최적화는 파라미터 공간에서 어떤 지점을 선택해 학습한 뒤, scoring(목표 점수)이 더 높아질 확률이 큰 쪽으로 파라미터를 조정해 나가는 방법이다.
베이지안 최적화를 이용하여 트리 수, 깊이, 샘플 스플릿(min_samples_split)을 미리 결정해 두었다.
가중치란 잘못된 예측에 패널티를 부여하여 모델이 예측을 더 잘할 수 있도록 도와주는 역할을 한다. 데이터 불균형이 심각해 모델이 계급 0으로 편향된 예측을 하는 것을 발견하였고,
이를 보완하기 위해 0_to_other, other_to_0, other_to_other 3개의 값을 베이지안 최적화를 이용해서 다시 한번 최적화해 주었다.
여기서는 f1, precision, recall 3개의 값이 최대이면서 가장 균형 있는 조합을 찾는 것이 목표였다.
그 이유는 불균형 데이터에서 accuracy와 CSI 점수는 극단적인 scoring이라 판단하여 참고만 하고, 균형적인 예측을 목표로 했기 때문이다.

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler


# Column renaming and derived-feature construction for the test data
def preprocess_and_create_features(data):
    """Strip the ``rainfall_test.`` prefix and add date/time derived columns.

    Args:
        data: DataFrame containing the ``rainfall_test.*`` columns listed
            below (at least the day, hour and month columns are required).

    Returns:
        A renamed copy of ``data`` with seven extra columns (day, hour and
        month differences, fc/ef day and hour ratios, sine/cosine of
        ``ef_hour``), or ``None`` if any step raises (the error is printed).
    """
    try:
        print("Renaming columns...")
        prefix = 'rainfall_test.'
        short_names = (
            'ef_month', 'fc_hour', 'ef_hour', 'fc_day', 'ef_day', 'fc_month',
            'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09',
            'dh',
        )
        data = data.rename(columns={prefix + name: name for name in short_names})
        print("Columns renamed.")

        print("Creating derived features...")
        ef_day = data['ef_day']
        fc_day = data['fc_day']
        ef_hour = data['ef_hour']
        fc_hour = data['fc_hour']
        ef_month = data['ef_month']
        fc_month = data['fc_month']

        # Angle of the effective hour on a 24-hour circle
        hour_angle = 2 * np.pi * ef_hour / 24

        data['fc_ef_day_diff'] = ef_day - fc_day
        data['ef_hour_sin'] = np.sin(hour_angle)
        data['ef_hour_cos'] = np.cos(hour_angle)
        # Ratios fall back to 0 when the denominator is 0
        data['fc_ef_day_ratio'] = np.where(ef_day != 0, fc_day / ef_day, 0)
        data['fc_ef_hour_diff'] = ef_hour - fc_hour
        data['fc_ef_hour_ratio'] = np.where(ef_hour != 0, fc_hour / ef_hour, 0)
        data['fc_ef_month_diff'] = ef_month - fc_month

        print("Derived features created.")
        return data
    except Exception as e:
        print(f"Error in preprocessing and creating features: {e}")
        return None

# Segment probability calculation
def calculate_segment_probabilities(data):
    """Turn the ``v01``..``v09`` columns into per-segment values.

    ``segment_prob_k`` is ``v0k - v0(k+1)`` for k = 1..8 and
    ``segment_prob_9`` is ``v09`` itself. ``segment_prob_zero`` is 100 minus
    the sum of the nine segment columns.

    Args:
        data: DataFrame with columns ``v01`` through ``v09``.

    Returns:
        A new DataFrame with ``segment_prob_1``..``segment_prob_9`` and
        ``segment_prob_zero``, or ``None`` if the calculation raises (the
        error is printed).
    """
    try:
        print("Calculating segment probabilities...")
        value_cols = [f'v{i:02d}' for i in range(1, 10)]
        segment_cols = [f'segment_prob_{i}' for i in range(1, 10)]

        segments = {}
        # Adjacent differences for segments 1..8
        for position, seg_name in enumerate(segment_cols[:-1]):
            segments[seg_name] = data[value_cols[position]] - data[value_cols[position + 1]]
        # The last segment is the final value column itself
        segments[segment_cols[-1]] = data[value_cols[-1]]

        segment_prob_df = pd.DataFrame(segments)

        # Remainder: 100 minus everything assigned to segments 1..9
        assigned_total = segment_prob_df[segment_cols].sum(axis=1)
        segment_prob_df['segment_prob_zero'] = 100 - assigned_total

        print("Segment probabilities calculated.")
        return segment_prob_df
    except Exception as e:
        print(f"Error in calculating segment probabilities: {e}")
        return None

# Monthly average of the summed segment probabilities
def calculate_monthly_avg_rainfall_and_trend(data):
    """Append segment probabilities, their row sum and a per-month mean.

    Despite the name, no trend variable is computed here.

    Args:
        data: DataFrame with ``v01``..``v09`` and ``ef_month`` columns.

    Returns:
        A new DataFrame with the segment probability columns (duplicated
        column names dropped, first occurrence kept), ``sum_segment_probs``
        and ``monthly_avg_rainfall_prob``; ``None`` if any step fails (the
        error is printed).
    """
    try:
        print("Calculating monthly average rainfall and trend...")
        segment_cols = [f'segment_prob_{i}' for i in range(1, 10)]

        # Compute the segment probabilities and attach them
        segment_prob_df = calculate_segment_probabilities(data)
        if segment_prob_df is None:
            return None
        combined = pd.concat([data, segment_prob_df], axis=1)

        # Keep only the first occurrence of each column name
        is_first_occurrence = ~combined.columns.duplicated()
        data = combined.loc[:, is_first_occurrence]

        # Row-wise total over segments 1..9
        data['sum_segment_probs'] = data[segment_cols].sum(axis=1)

        # Broadcast each month's mean back onto its rows
        per_month = data.groupby('ef_month')['sum_segment_probs']
        data['monthly_avg_rainfall_prob'] = per_month.transform('mean')

        print("Monthly average rainfall and trend calculated.")
        return data
    except Exception as e:
        print(f"Error in calculating monthly average rainfall and trend: {e}")
        return None

# Binary indicator features for several thresholds
def create_multithreshold_binary_features(data, thresholds):
    """Add one 0/1 column per threshold, in place.

    Column ``is_rain_<t>`` is 1 where ``sum_segment_probs >= t`` and 0
    otherwise.

    Args:
        data: DataFrame with a ``sum_segment_probs`` column; modified in place.
        thresholds: Iterable of threshold values.

    Returns:
        The same ``data`` object, or ``None`` if an error occurs (the error
        is printed).
    """
    try:
        print("Creating multi-threshold binary features...")
        for cutoff in thresholds:
            reaches_cutoff = data['sum_segment_probs'].ge(cutoff)
            data[f'is_rain_{cutoff}'] = reaches_cutoff.astype(int)
        print("Multi-threshold binary features created.")
        return data
    except Exception as e:
        print(f"Error in creating multi-threshold binary features: {e}")
        return None

# Count-based rain score and its reciprocal
def create_rain_probability_feature(data, thresholds):
    """Add ``rain_probability`` and ``rain_probability_inverse``, in place.

    ``rain_probability`` is the number of ``is_rain_<t>`` indicators that are
    set for the row; ``rain_probability_inverse`` is ``1 / rain_probability``
    (infinite where the count is 0).

    Args:
        data: DataFrame holding an ``is_rain_<t>`` column for every threshold.
        thresholds: The thresholds whose indicator columns are summed.

    Returns:
        The same ``data`` object, or ``None`` if an error occurs (the error
        is printed).
    """
    try:
        print("Creating rain probability feature...")
        indicator_cols = [f'is_rain_{threshold}' for threshold in thresholds]
        # Number of thresholds the row reaches
        data['rain_probability'] = data[indicator_cols].sum(axis=1)
        # Reciprocal of that count
        data['rain_probability_inverse'] = 1 / data['rain_probability']
        print("Rain probability feature created.")
        return data
    except Exception as e:
        print(f"Error in creating rain probability feature: {e}")
        return None

# Measure contrasting the "zero" segment with the summed segments
def create_rain_nonrain_diff(data):
    """Add the ``rain_nonrain_diff`` column, in place.

    With ``diff = segment_prob_zero - sum_segment_probs`` the new value is
    ``diff`` where ``diff > 0``, ``1`` where ``diff == 0`` and
    ``1 / abs(diff)`` everywhere else (a NaN ``diff`` therefore stays NaN).

    Args:
        data: DataFrame with ``segment_prob_zero`` and ``sum_segment_probs``
            columns; modified in place.

    Returns:
        The same ``data`` object, or ``None`` if an error occurs (the error
        is printed).
    """
    try:
        print("Creating rain non-rain difference...")
        diff = data['segment_prob_zero'] - data['sum_segment_probs']
        inverse_magnitude = 1 / np.abs(diff)
        data['rain_nonrain_diff'] = np.select(
            [diff > 0, diff == 0],
            [diff, 1],
            default=inverse_magnitude,
        )
        print("Rain non-rain difference created.")
        return data
    except Exception as e:
        print(f"Error in creating rain non-rain difference: {e}")
        return None
        
def create_combined_metric(data):
    """Add ``rain_combined_metric``, in place.

    The value is the product of ``rain_probability_inverse`` and
    ``rain_nonrain_diff``.

    Args:
        data: DataFrame with both input columns; modified in place.

    Returns:
        The same ``data`` object.
    """
    inverse_score = data['rain_probability_inverse']
    margin = data['rain_nonrain_diff']
    data['rain_combined_metric'] = inverse_score * margin
    return data

# Feature standardisation
def normalize_features(data, features, scaler=None):
    """Standardize the given feature columns, in place.

    Args:
        data: DataFrame containing the feature columns; modified in place.
        features: List of column names to scale.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, it is applied as-is, so data can be scaled with the
            statistics of another data set (e.g. the training data). When
            omitted, a new ``StandardScaler`` is fitted on ``data`` itself,
            which is the original behaviour.

    Returns:
        The same ``data`` object, or ``None`` if scaling raises (the error is
        printed).
    """
    try:
        print("Normalizing features...")
        if scaler is None:
            # No fitted scaler supplied: fit one on this data
            scaler = StandardScaler()
            data[features] = scaler.fit_transform(data[features])
        else:
            # Reuse the supplied statistics without refitting
            data[features] = scaler.transform(data[features])
        print("Features normalized.")
        return data
    except Exception as e:
        print(f"Error in normalizing features: {e}")
        return None

# Full preprocessing pipeline for the test data
def preprocess_and_encode_test_data(data, feature_list, thresholds):
    """Run every preprocessing step on the raw test frame and select features.

    Steps, in order: rename columns and add date/time features, compute the
    segment probabilities, add the monthly mean, add the per-threshold
    indicators, add the rain score and its reciprocal, add the
    rain/non-rain difference and the combined metric, add the hourly
    statistics and the rainy-season flag, standardize a fixed list of
    columns, and finally select ``feature_list``.

    Args:
        data: Raw test DataFrame with ``rainfall_test.*`` columns.
        feature_list: Names of the columns to return.
        thresholds: Thresholds for the ``is_rain_<t>`` indicators.

    Returns:
        A DataFrame holding the ``feature_list`` columns, or ``None`` if any
        step fails (a message is printed).
    """
    try:
        print("Starting data preprocessing...")
        data = preprocess_and_create_features(data)
        if data is None:
            print("Failed at preprocessing and creating features.")
            return None

        print("Calculating segment probabilities...")
        segment_prob_df = calculate_segment_probabilities(data)
        if segment_prob_df is None:
            print("Failed at calculating segment probabilities.")
            return None

        # Drop duplicated column names (first occurrence is kept)
        segment_prob_df = segment_prob_df.loc[:, ~segment_prob_df.columns.duplicated()]
        
        data = pd.concat([data, segment_prob_df], axis=1)
        
        # The next helper computes the segment probabilities again and
        # concatenates them; the duplicate removal below keeps one copy.
        print("Calculating monthly average rainfall and trend...")
        data = calculate_monthly_avg_rainfall_and_trend(data)
        if data is None:
            print("Failed at calculating monthly average rainfall and trend.")
            return None

        # Drop duplicated column names
        data = data.loc[:, ~data.columns.duplicated()]
        
        print("Creating multi-threshold binary features...")
        data = create_multithreshold_binary_features(data, thresholds)
        if data is None:
            print("Failed at creating multi-threshold binary features.")
            return None

        # Drop duplicated column names
        data = data.loc[:, ~data.columns.duplicated()]
        
        print("Creating rain probability feature...")
        data = create_rain_probability_feature(data, thresholds)
        if data is None:
            print("Failed at creating rain probability feature.")
            return None

        # Drop duplicated column names
        data = data.loc[:, ~data.columns.duplicated()]
        
        print("Creating rain non-rain difference...")
        data = create_rain_nonrain_diff(data)
        if data is None:
            print("Failed at creating rain non-rain difference.")
            return None
        # Combine the reciprocal rain score with the rain/non-rain difference
        print("Creating create_combined_metric...")
        data = create_combined_metric(data)
        if data is None:
            print("Failed at creating create_combined_metric.")
            return None

        # Mean rain score for each effective hour
        data['hourly_rain_prob'] = data.groupby('ef_hour')['rain_probability'].transform('mean')
        
        # Row-to-row change of that hourly mean (depends on the row order)
        data['hourly_rain_prob_change'] = data['hourly_rain_prob'].diff().fillna(0)
        
        # Change relative to the previous row's hourly mean
        data['hourly_rain_prob_change_rate'] = data['hourly_rain_prob_change'] / data['hourly_rain_prob'].shift(1).fillna(1)
        
        # Rainy-season flag for months 6-9
        data['is_rainy_season'] = np.where(data['ef_month'].isin([6, 7, 8, 9]), 1, 0) 

        print("Normalizing features...")
        # Fixed list of columns to standardize; it is independent of feature_list
        features_to_normalize = [
        'segment_prob_1','segment_prob_2','segment_prob_3',
        'segment_prob_4','segment_prob_5','segment_prob_6',
        'segment_prob_7','segment_prob_8','segment_prob_9',
         'ef_hour_sin', 'ef_hour_cos', 'ef_hour',
        'ef_month', 'monthly_avg_rainfall_prob', 
        'rain_combined_metric','fc_ef_day_diff','fc_ef_hour_diff',
        'hourly_rain_prob_change_rate'
        ]
        data = normalize_features(data, features_to_normalize)
        if data is None:
            print("Failed at normalizing features.")
            return None
        
        print("Removing duplicate columns...")
        data = data.loc[:, ~data.columns.duplicated()]
        
        # Keep only the requested feature columns
        X_test = data[feature_list]
        
        print(f"Selected important columns for X_test with shape: {X_test.shape}")

        print("Data preprocessing completed.")
        return X_test
    except Exception as e:
        print(f"Error in preprocessing and encoding test data: {e}")
        return None

# Prediction routine
def predict_with_rf(model_path, test_file_path, output_file_path):
    """Load a pickled model, predict on a test CSV and write the result.

    Rows whose ``rainfall_test.class_interval`` is missing are filled with
    the model's prediction; the completed frame is written to
    ``output_file_path``. Any exception is caught and printed.

    Args:
        model_path: Path of the pickled model. Unpickling can execute
            arbitrary code, so only trusted files should be loaded.
        test_file_path: Path of the test CSV file.
        output_file_path: Path of the CSV file to write.

    Returns:
        None.
    """
    feature_list = [
        'segment_prob_1', 'segment_prob_2', 'segment_prob_3',
        'segment_prob_4', 'segment_prob_5', 'segment_prob_6',
        'segment_prob_7', 'segment_prob_8', 'segment_prob_9',
        'ef_hour_sin', 'ef_hour_cos', 'ef_hour',
        'ef_month', 'monthly_avg_rainfall_prob',
        'rain_combined_metric', 'fc_ef_day_diff', 'fc_ef_hour_diff',
        'hourly_rain_prob_change_rate',
    ]
    # 0, 5, 10, ..., 85
    thresholds = list(range(0, 90, 5))
    target_col = 'rainfall_test.class_interval'

    try:
        print(f"Loading model from {model_path}...")
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        print("Model loaded successfully.")

        print(f"Loading test data from {test_file_path}...")
        original_test_data = pd.read_csv(test_file_path)
        print(f"Test data shape: {original_test_data.shape}")

        test_data = preprocess_and_encode_test_data(original_test_data, feature_list, thresholds)
        if test_data is None:
            print("Failed to preprocess and encode test data.")
            return

        print("Features used for prediction:")
        print(test_data.columns.tolist())

        print("Performing prediction...")
        y_pred = model.predict(test_data)

        print(f"Prediction completed with shape: {y_pred.shape}")

        # Fill only the rows whose target value is missing
        missing_mask = original_test_data[target_col].isnull()
        original_test_data.loc[missing_mask, target_col] = y_pred[missing_mask]

        print(f"Saving predicted results to {output_file_path}...")
        original_test_data.to_csv(output_file_path, index=False)
        print(f"Predicted results saved to {output_file_path} successfully.")
    except Exception as e:
        print(f"Error during prediction process: {e}")

# Run: load the model and write the predictions
model_path = 'C:/Users/82102/Desktop/rain/pyrain/models/best_extra_trees_model_bayesianf1.pkl'
test_file_path = 'C:/Users/82102/Desktop/rain/pyrain/data/rainfall_test3.csv'
output_file_path = 'C:/Users/82102/Desktop/rain/pyrain/240062.csv'

predict_with_rf(model_path, test_file_path, output_file_path)
