In [3]:
# !pip install --break-system-packages xgboost

# !pip install --break-system-packages seaborn



In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Relative location of the input CSV (a local file path, despite the name `url`).
url = "./make_file/증폭_통합_데이터.csv"

In [6]:
# Load the CSV located at `url` into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

In [7]:
# Show the column labels of the loaded frame.
data.columns

Index(['중점_위도', '중점_경도', '경사각', '최근접_시설의_평균거리', '열선여부', '원본여부'], dtype='object')

In [8]:
# Feature matrix: the two columns used as model inputs.
X = data.loc[:, ['경사각', '최근접_시설의_평균거리']]
# Target labels (1 or 0).
y = data.loc[:, '열선여부']

In [9]:
# Number of rows in the loaded frame.
data.shape[0]

11567

In [10]:
# Stratified split on the label with 70% of the rows going to training;
# the seed is fixed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [11]:
# Scaling (standardization): fit on the training split only, then apply the
# same fitted transform to both splits.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so re-running this cell alone would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 모델 튜닝

In [12]:
# Random forest classifier configuration (fitting happens in the next cell).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=40,
)

In [13]:
# Train the forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [14]:
# Predict on the held-out split and report its accuracy to four decimals.
y_pred = rf_model.predict(X_test)
y_pred_list = y_pred.tolist()  # plain-Python copy of the predictions
print("랜덤 포레스트 정확도: " + format(accuracy_score(y_test, y_pred), ".4f"))

랜덤 포레스트 정확도: 0.9401


In [15]:
# Evaluate the held-out predictions (y_pred vs. y_test) with the standard
# binary-classification metrics and print each one unrounded.

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

# F1-Score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")


Accuracy: 0.9400749063670412
Precision: 0.9549314173742652
Recall: 0.9131792629606496
F1-Score: 0.933588761174968


In [16]:
# Feature columns for every row of `data`, transformed with the scaler that
# was fitted earlier (it is not re-fitted here).
X_full = data.loc[:, ['경사각', '최근접_시설의_평균거리']]
X_full_scaled = scaler.transform(X_full)

In [17]:
# Predict labels for the entire dataset (the rows used for training included).
y_full_pred = rf_model.predict(X=X_full_scaled)

In [18]:
# Attach the predictions to the source frame as a new column, row for row.
data['예측_열선여부'] = pd.Series(y_full_pred, index=data.index)

In [19]:
# Preview the first five rows with the recorded and predicted labels side by side.
print(
    data.loc[
        :, ['중점_위도', '중점_경도', '경사각', '최근접_시설의_평균거리', '열선여부', '예측_열선여부']
    ].head()
)

       중점_위도       중점_경도        경사각  최근접_시설의_평균거리  열선여부  예측_열선여부
0  37.590543  126.993313  11.724227    219.445340   1.0      1.0
1  37.613411  126.977763   5.044559    642.349704   1.0      1.0
2  37.611366  126.975317   7.859327    420.360320   1.0      1.0
3  37.605400  126.957447  10.043141    207.818466   1.0      1.0
4  37.583314  126.987147   1.226563    250.589899   1.0      0.0


In [20]:
# Accuracy of the predictions over the whole dataset.
# NOTE(review): this includes the rows the model was trained on, so it is
# not an independent estimate of generalization (compare the test-split score).
accuracy_full = accuracy_score(y_true=data['열선여부'], y_pred=y_full_pred)
print(f"전체 데이터에 대한 정확도: {accuracy_full:.4f}")

전체 데이터에 대한 정확도: 0.9820


In [21]:
# Confusion matrix of recorded vs. predicted labels over the whole dataset.
conf_matrix = confusion_matrix(y_true=data['열선여부'], y_pred=y_full_pred)
print("혼동 행렬:", conf_matrix, sep="\n")


혼동 행렬:
[[6163   69]
 [ 139 5196]]


In [22]:
# Remaining whole-dataset metrics: precision, recall and F1, printed unrounded.
precision_full = precision_score(y_true=data['열선여부'], y_pred=y_full_pred)
recall_full = recall_score(y_true=data['열선여부'], y_pred=y_full_pred)
f1_full = f1_score(y_true=data['열선여부'], y_pred=y_full_pred)

print(
    f"전체 데이터 정밀도: {precision_full}",
    f"전체 데이터 재현율: {recall_full}",
    f"전체 데이터 F1-Score: {f1_full}",
    sep="\n",
)

전체 데이터 정밀도: 0.9868945868945869
전체 데이터 재현율: 0.973945641986879
전체 데이터 F1-Score: 0.9803773584905661


In [23]:
# Rows the model labels as positive (predicted value 1).
filter_data = data.loc[data['예측_열선여부'].eq(1)]

In [24]:
# Of the predicted positives, keep the rows whose recorded label is 0.
filter_data_result = filter_data.loc[filter_data['열선여부'].eq(0)]

In [25]:
# How many rows were predicted 1 but recorded as 0.
filter_data_result.shape[0]

69

In [26]:
# Drop rows that are identical across every listed column, then renumber the
# index so the de-duplicated frame ends up with a contiguous 0..n-1 index.
# Built as a new frame instead of mutating the filtered slice in place.
filter_data_result = (
    filter_data_result
    .drop_duplicates(subset=['중점_위도', '중점_경도', '경사각', '최근접_시설의_평균거리', '열선여부', '원본여부', '예측_열선여부'])
    .reset_index(drop=True)
)


In [27]:
len(filter_data_result)

38

In [28]:
# Display the rows with a fresh positional index; the result is only shown,
# not assigned back to `filter_data_result`.
filter_data_result.reset_index(drop=True)

Unnamed: 0,중점_위도,중점_경도,경사각,최근접_시설의_평균거리,열선여부,원본여부,예측_열선여부
0,37.567934,127.086574,5.365783,136.280776,0.0,1,1.0
1,37.530054,127.082367,1.336699,124.841825,0.0,1,1.0
2,37.535856,127.090036,3.355348,186.830819,0.0,1,1.0
3,37.552296,127.087409,3.009963,195.349964,0.0,1,1.0
4,37.531759,127.06684,3.227776,398.506344,0.0,1,1.0
5,37.558258,127.092785,1.281472,166.63287,0.0,1,1.0
6,37.551674,127.093318,4.402925,136.938002,0.0,1,1.0
7,37.550993,127.092759,4.330537,171.995816,0.0,1,1.0
8,37.559628,127.089038,3.495281,132.188172,0.0,1,1.0
9,37.559643,127.088677,3.623036,150.408626,0.0,1,1.0


In [29]:
# Write the full frame (including the prediction column) to CSV without the
# index. An OSError from the write is printed rather than raised.
save_file_url = './make_file/랜덤_포레스트로_분석을_해본_결과.csv'
try:
    data.to_csv(save_file_url, encoding="UTF-8", index=False)
    print(f"{save_file_url} 파일 저장 완료")
except OSError as err:
    print(err)

./make_file/랜덤_포레스트로_분석을_해본_결과.csv 파일 저장 완료


In [30]:
# Write the de-duplicated "predicted 1 / recorded 0" rows to their own CSV
# without the index. An OSError from the write is printed rather than raised.
# NOTE(review): the file name spells "포로스트" (vs. "포레스트" elsewhere); kept
# unchanged in case other consumers depend on this exact path.
try:
    filter_data_result_url = './make_file/(최종)_랜덤_포로스트_리스트.csv'
    filter_data_result.to_csv(filter_data_result_url, encoding="UTF-8", index=False)
    # Report the path that was actually written; the original message printed
    # `save_file_url`, i.e. the previous cell's output file.
    print(f"{filter_data_result_url} 파일 저장 완료")
except OSError as e:
    print(e)

./make_file/랜덤_포레스트로_분석을_해본_결과.csv 파일 저장 완료
