In [1]:
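# Optional one-time setup, left disabled. Neither package is imported by the cells visible in this notebook.
# !pip install --break-system-packages xgboost

# !pip install --break-system-packages seaborn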



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# Relative path of the input CSV that the next cell loads with pd.read_csv.
url = './make_file/증폭_종합_데이터.csv'

In [4]:
# Load the dataset at `url` into a DataFrame; feature/label selection and the
# later full-dataset predictions all read from `data`.
data = pd.read_csv(url)

In [5]:
# Inspect the available column names before choosing the features and the label.
data.columns

Index(['위도', '경도', '경사각', '평균_거리차', '열선여부', '원본여부'], dtype='object')

In [6]:
# Feature matrix: the four predictor columns.
feature_columns = ['위도', '경도', '경사각', '평균_거리차']
X = data[feature_columns]

# Label vector (1 or 0).
y = data['열선여부']

In [7]:
# Number of rows in the loaded dataset.
data.shape[0]

6478

In [8]:
# 70/30 train/test split. Stratifying on the label keeps the class ratio the
# same in both parts; the fixed seed makes the split reproducible.
split_options = dict(train_size=0.7, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Scaling (standardization): learn the statistics from the training split only,
# then apply that same fitted scaler to both splits.
# X_train / X_test are rebound to the scaler's output from here on.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [10]:
# Random forest classifier: 5 trees, fixed seed so repeated runs build the same forest.
rf_model = RandomForestClassifier(
    n_estimators=5,
    random_state=42,
)

In [11]:
# Train the forest on the (scaled) training features and their labels.
rf_model.fit(X_train, y_train)

In [12]:
# Predict on the held-out test split and report the accuracy.
y_pred = rf_model.predict(X_test)
y_pred_list = y_pred.tolist()  # plain-Python list copy of the predictions
test_accuracy = accuracy_score(y_test, y_pred)
print(f"랜덤 포레스트 정확도: {test_accuracy:.4f}")

랜덤 포레스트 정확도: 0.9686


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the test-split predictions: compute all four metrics first ...
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ... then report them in a fixed order: accuracy, precision, recall, F1.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


Accuracy: 0.9686213991769548
Precision: 0.6065573770491803
Recall: 0.5
F1-Score: 0.5481481481481482


In [14]:
# Features of every row in `data`, scaled with the scaler that was fitted on
# the training split (transform only, no refit).
full_feature_columns = ['위도', '경도', '경사각', '평균_거리차']
X_full = data[full_feature_columns]
X_full_scaled = scaler.transform(X_full)

In [15]:
# Predict the label for every row of the dataset (not only the test split).
y_full_pred = rf_model.predict(X_full_scaled)

In [16]:
# Attach the predictions to the original DataFrame as a new column.
data['예측_열선여부'] = y_full_pred

In [17]:
# Print a preview (first five rows only) of features, true label and predicted label.
preview_columns = ['위도', '경도', '경사각', '평균_거리차', '열선여부', '예측_열선여부']
print(data[preview_columns].head())

          위도          경도       경사각      평균_거리차  열선여부  예측_열선여부
0  37.553039  127.097008  7.851888  184.137070   1.0      1.0
1  37.553626  127.097029  5.862993  184.374022   1.0      1.0
2  37.567109  127.088453  0.000000  129.476463   1.0      1.0
3  37.570235  127.085884  5.678818  292.900094   1.0      0.0
4  37.551073  127.110615  0.000000  748.250385   1.0      1.0


In [18]:
# Accuracy of the predictions over the whole dataset.
# NOTE: `data` includes the rows the model was trained on, so this figure is
# more optimistic than the held-out test accuracy reported earlier.
y_true_full = data['열선여부']
accuracy_full = accuracy_score(y_true_full, y_full_pred)
print(f"전체 데이터에 대한 정확도: {accuracy_full:.4f}")

전체 데이터에 대한 정확도: 0.9870


In [19]:
# Confusion matrix of true vs. predicted labels over the whole dataset
# (scikit-learn convention: rows are true classes, columns are predicted classes).
conf_matrix = confusion_matrix(data['열선여부'], y_full_pred)
print("혼동 행렬:", conf_matrix, sep="\n")


혼동 행렬:
[[6199   33]
 [  51  195]]


In [20]:
# Remaining metrics over the whole dataset (training rows included):
# precision, recall and F1, all against the same true-label column.
y_true_all = data['열선여부']
precision_full = precision_score(y_true_all, y_full_pred)
recall_full = recall_score(y_true_all, y_full_pred)
f1_full = f1_score(y_true_all, y_full_pred)

print(f"전체 데이터 정밀도: {precision_full}")
print(f"전체 데이터 재현율: {recall_full}")
print(f"전체 데이터 F1-Score: {f1_full}")

전체 데이터 정밀도: 0.8552631578947368
전체 데이터 재현율: 0.7926829268292683
전체 데이터 F1-Score: 0.8227848101265823


In [21]:
# Keep only the rows the model predicted as 1.
predicted_positive = data['예측_열선여부'].eq(1)
filter_data = data[predicted_positive]

In [22]:
# From `filter_data` (rows predicted as 1), keep those whose true label is 0,
# i.e. predicted 1 but labelled 0.
labelled_negative = filter_data['열선여부'].eq(0)
filter_data_result = filter_data[labelled_negative]

In [23]:
# Number of rows predicted 1 but labelled 0 (before de-duplication).
filter_data_result.shape[0]

33

In [24]:
# Drop exact duplicate rows first, THEN renumber the index so it runs 0..n-1.
# Resetting before de-duplicating left gaps in the index wherever a duplicate
# was removed, and that gapped index is what ended up in the saved CSV.
# Chaining also avoids mutating the filtered frame with inplace=True.
dedup_columns = ['위도', '경도', '경사각', '평균_거리차', '열선여부', '원본여부', '예측_열선여부']
filter_data_result = (
    filter_data_result
    .drop_duplicates(subset=dedup_columns)  # keeps the first occurrence of each row
    .reset_index(drop=True)
)


In [25]:
# Number of rows left after de-duplication.
filter_data_result.shape[0]

17

In [26]:
# Display the rows with a fresh 0-based index. The result is not assigned,
# so `filter_data_result` itself is left unchanged by this cell.
filter_data_result.reset_index(drop = True)

Unnamed: 0,위도,경도,경사각,평균_거리차,열선여부,원본여부,예측_열선여부
0,37.531759,127.06684,3.227776,398.506344,0.0,1,1.0
1,37.57007,127.088228,11.565654,338.103674,0.0,1,1.0
2,37.569346,127.088355,9.995126,296.551183,0.0,1,1.0
3,37.570063,127.087623,9.526404,314.444234,0.0,1,1.0
4,37.570814,127.087008,19.886566,369.225618,0.0,1,1.0
5,37.570071,127.087019,9.90958,298.017971,0.0,1,1.0
6,37.549348,127.109951,1.799009,591.910426,0.0,1,1.0
7,37.551337,127.111063,0.113643,796.174498,0.0,1,1.0
8,37.551111,127.110822,4.872838,764.443461,0.0,1,1.0
9,37.55128,127.110261,4.076612,716.738705,0.0,1,1.0


In [27]:
# Output path for the full dataset including the prediction column.
# Extension corrected from the typo ".scv" to ".csv" so the file is recognised as CSV.
save_file_url = './make_file/랜덤_포레스트로_분석을_해본_결과.csv'

In [28]:
# Write the full DataFrame to `save_file_url`. An OS-level failure (e.g. missing
# directory, permissions) is printed instead of raised; success is confirmed
# only when the write went through.
try:
    data.to_csv(save_file_url)
except OSError as err:
    print(err)
else:
    print(f"{save_file_url} 파일 저장 완료")

./make_file/랜덤_포레스트로_분석을_해본_결과.scv 파일 저장 완료


In [29]:
# Output path for the filtered rows (predicted 1, labelled 0).
# Extension corrected from the typo ".scv" to ".csv" so the file is recognised as CSV.
filter_data_result_url = './make_file/(최종)_랜덤_포로스트_리스트.csv'

In [30]:
# Write the filtered rows to `filter_data_result_url`. An OS-level failure is
# printed instead of raised.
try:
    filter_data_result.to_csv(filter_data_result_url)
    # Report the path that was actually written; the original message printed
    # `save_file_url` (the other output file) by copy-paste mistake.
    print(f"{filter_data_result_url} 파일 저장 완료")
except OSError as e:
    print(e)

./make_file/랜덤_포레스트로_분석을_해본_결과.scv 파일 저장 완료
