# **중복 데이터 제거**

In [None]:
import os
import pandas as pd

# CSV 파일이 있는 폴더 경로
folder_path = '/content/sensor_data_csv'  # 폴더 경로를 지정하세요.

# 폴더 내의 모든 파일을 순회합니다.
for filename in os.listdir(folder_path):
    # 파일이 CSV 파일인지 확인합니다.
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)

        # CSV 파일을 읽어옵니다.
        df = pd.read_csv(file_path)

        # 중복된 'sensor_id', 'timestamp', 'value'를 기준으로 중복을 제거합니다.
        df_cleaned = df.drop_duplicates(subset=['sensor_id', 'timestamp', 'value'])

        # 새 파일 이름 생성: 원래 파일 이름 + '_Remove_duplicates.csv'
        new_filename = filename.replace('.csv', '_Remove_duplicates.csv')
        new_file_path = os.path.join(folder_path, new_filename)

        # 중복이 제거된 데이터를 새로운 파일로 저장합니다.
        df_cleaned.to_csv(new_file_path, index=False)
        print(f'Processed and saved as {new_filename}')


Processed and saved as sensor_data_pm2_5_data_Remove_duplicates.csv
Processed and saved as sensor_data_humidity_data_Remove_duplicates.csv
Processed and saved as sensor_data_temperature_data_Remove_duplicates.csv
Processed and saved as sensor_data_tvoc_data_Remove_duplicates.csv
Processed and saved as sensor_data_noise_data_Remove_duplicates.csv


# **센서 값 평가 수치 매핑**

In [4]:
import pandas as pd

# CSV 파일을 읽어들입니다.
df = pd.read_csv('/content/sensor_Data_remove/sensor_data_humidity_data_Remove_duplicates.csv')

# Rating 값을 계산하는 함수
def get_rating(value):
    if value < 10 or value >90:
        return 5
    elif 10 <= value < 20 or 80 < value <= 90:
        return 4
    elif 20 <= value < 30 or 70 < value <= 80:
        return 3
    elif 30 <= value < 40 or 60 < value <= 70:
        return 2
    elif 40 <= value <= 60:
        return 1

# 'value' 열에 대해 Rating을 계산하여 새로운 열 'Rating'에 추가
df['Rating'] = df['value'].apply(get_rating)

# 결과를 새로운 CSV로 저장
df.to_csv('sensor_humidity_data_with_rating.csv', index=False)

# 결과 확인 (처리된 첫 몇 행을 출력)
print(df.head())


   id          sensor_id                   timestamp  value  Rating
0   2  0C:7B:C8:FF:5B:8F  2024-06-04 22:03:30.000000     43       1
1   3  0C:7B:C8:FF:58:8A  2024-06-04 22:03:13.000000     54       1
2   4  0C:7B:C8:FF:5B:06  2024-06-04 22:04:29.000000     43       1
3   5  0C:7B:C8:FF:56:F1  2024-06-04 22:06:40.000000     43       1
4   6  0C:7B:C8:FF:5C:C8  2024-06-04 22:09:27.000000     43       1


In [10]:
import pandas as pd

# CSV 파일을 읽어들입니다.
df = pd.read_csv('/content/sensor_Data_remove/sensor_data_temperature_data_Remove_duplicates.csv')

# Rating 값을 계산하는 함수
def get_rating(value):
    if value < 16.5 or value > 27.5:
        return 5
    elif 16.5 <= value < 17.6 or 26.4 < value <= 27.5:
        return 4
    elif 17.6 <= value < 18.7 or 25.3 < value <= 26.4:
        return 3
    elif 18.7 <= value < 19.8 or 24.2 < value <= 25.3:
        return 2
    elif 19.8 <= value <= 24.2:
        return 1

# 'value' 열에 대해 Rating을 계산하여 새로운 열 'Rating'에 추가
df['Rating'] = df['value'].apply(get_rating)

# 결과를 새로운 CSV로 저장
df.to_csv('/content/sensor_data_temperature_with_rating.csv', index=False)

# 결과 확인 (처리된 첫 몇 행을 출력)
print(df.head())


   id          sensor_id                   timestamp  value  Rating
0   2  0C:7B:C8:FF:5B:8F  2024-06-04 22:03:30.000000   25.8     3.0
1   3  0C:7B:C8:FF:58:8A  2024-06-04 22:03:03.000000   23.6     1.0
2   4  0C:7B:C8:FF:5B:06  2024-06-04 22:04:29.000000   25.8     3.0
3   5  0C:7B:C8:FF:56:F1  2024-06-04 22:06:40.000000   25.9     3.0
4   6  0C:7B:C8:FF:5C:C8  2024-06-04 22:09:17.000000   25.9     3.0


In [12]:
import pandas as pd

# CSV 파일을 읽어들입니다.
df = pd.read_csv('/content/sensor_Data_remove/sensor_data_noise_data_Remove_duplicates.csv')

# Rating 값을 계산하는 함수
def get_rating(value):
    if value > 80:
        return 5
    elif 70 < value <= 80:
        return 4
    elif 60 < value <= 70:
        return 3
    elif 50 < value <= 60:
        return 2
    elif value <= 50:
        return 1

# 'value' 열에 대해 Rating을 계산하여 새로운 열 'Rating'에 추가
df['Rating'] = df['value'].apply(get_rating)

# 결과를 새로운 CSV로 저장
df.to_csv('/content/sensor_data_noise_with_rating.csv', index=False)

# 결과 확인 (처리된 첫 몇 행을 출력)
print(df.head())


   id          sensor_id                   timestamp  value  Rating
0   2  0C:7B:C8:FF:5B:8F  2024-06-04 22:03:30.000000     38       1
1   3  0C:7B:C8:FF:58:8A  2024-06-04 22:00:13.000000     45       1
2   4  0C:7B:C8:FF:5B:06  2024-06-04 22:04:29.000000     36       1
3   5  0C:7B:C8:FF:56:F1  2024-06-04 22:06:40.000000     43       1
4   6  0C:7B:C8:FF:5C:C8  2024-06-04 22:07:17.000000     38       1


In [13]:
import pandas as pd

# CSV 파일을 읽어들입니다.
df = pd.read_csv('/content/sensor_Data_remove/sensor_data_pm2_5_data_Remove_duplicates.csv')

# Rating 값을 계산하는 함수
def get_rating(value):
    if value > 64:
        return 5
    elif 53 < value <= 64:
        return 4
    elif 41 < value <= 53:
        return 3
    elif 23 < value <= 41:
        return 2
    elif value <= 23:
        return 1

# 'value' 열에 대해 Rating을 계산하여 새로운 열 'Rating'에 추가
df['Rating'] = df['value'].apply(get_rating)

# 결과를 새로운 CSV로 저장
df.to_csv('/content/sensor_data_pm2_5_with_rating.csv', index=False)

# 결과 확인 (처리된 첫 몇 행을 출력)
print(df.head())


   id          sensor_id                   timestamp  value  Rating
0   2  0C:7B:C8:FF:5B:06  2024-06-19 17:43:48.000000     23       1
1   3  0C:7B:C8:FF:5B:06  2024-06-19 17:48:38.000000     22       1
2   4  0C:7B:C8:FF:5B:06  2024-06-19 17:53:38.000000     19       1
3   5  0C:7B:C8:FF:5B:06  2024-06-19 17:58:38.000000     19       1
4   6  0C:7B:C8:FF:5B:06  2024-06-19 18:03:38.000000     20       1


In [14]:
import pandas as pd

# CSV 파일을 읽어들입니다.
df = pd.read_csv('/content/sensor_Data_remove/sensor_data_tvoc_data_Remove_duplicates.csv')

# Rating 값을 계산하는 함수
def get_rating(value):
    if value > 10000:
        return 5
    elif 3000 < value <= 10000:
        return 4
    elif 1000 < value <= 3000:
        return 3
    elif 300 < value <= 1000:
        return 2
    elif value <= 300:
        return 1

# 'value' 열에 대해 Rating을 계산하여 새로운 열 'Rating'에 추가
df['Rating'] = df['value'].apply(get_rating)

# 결과를 새로운 CSV로 저장
df.to_csv('/content/sensor_data_tvoc_with_rating.csv', index=False)

# 결과 확인 (처리된 첫 몇 행을 출력)
print(df.head())


   id          sensor_id                   timestamp  value  Rating
0   2  0C:7B:C8:FF:5B:8F  2024-06-04 22:03:30.000000   15.0     1.0
1   3  0C:7B:C8:FF:58:8A  2024-06-04 22:03:03.000000   29.0     1.0
2   4  0C:7B:C8:FF:5B:06  2024-06-04 22:03:59.000000   15.0     1.0
3   5  0C:7B:C8:FF:56:F1  2024-06-04 22:06:10.000000   15.0     1.0
4   6  0C:7B:C8:FF:5C:C8  2024-06-04 22:09:47.000000   15.0     1.0


# **sensor ID에 따라 강의실 매핑**

In [22]:
import pandas as pd
import glob
import os

# classroom 데이터 (sensor_id -> classroom_number 매핑)
classroom_data = {
    'sensor_id': [
        '0C:7B:C8:FF:55:5D', '0C:7B:C8:FF:56:8A', '0C:7B:C8:FF:5B:8F', '0C:7B:C8:FF:5C:C8',
        '0C:7B:C8:FF:57:5A', '0C:7B:C8:FF:5B:06', '0C:7B:C8:FF:56:F1', '0C:7B:C8:FF:58:8A',
        '0C:7B:C8:FF:5A:C6', '0C:7B:C8:FF:59:C0', '0C:7B:C8:FF:5B:80', '0C:7B:C8:FF:51:78',
        'A8:46:9D:76:51:B0'
    ],
    'classroom_number': [
        6144, 6119, 5147, 5145, 4142, 3173, 3115, 4147, 3101, 1120, 1116, 3147, '누수감지센서'
    ]
}

# classroom dataframe 생성
classroom_df = pd.DataFrame(classroom_data)

# 대상 폴더 경로 (현재 디렉토리 내의 모든 .csv 확장자를 가진 파일을 대상으로 처리)
folder_path = '/content/rating_data'  # 폴더 경로를 변경해주세요
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))  # 모든 .csv 파일을 처리

# 모든 CSV 파일에 대해 처리
for file in csv_files:
    # 각 CSV 파일을 읽어들입니다.
    df = pd.read_csv(file)

    # 'sensor_id'와 'classroom_number'를 기준으로 매핑
    merged_df = pd.merge(df, classroom_df[['sensor_id', 'classroom_number']], on='sensor_id', how='left')

    # sensor_id 열을 제거
    merged_df = merged_df.drop(columns=['sensor_id'])

    # classroom_number를 첫 번째 열로 이동
    cols = ['classroom_number'] + [col for col in merged_df.columns if col != 'classroom_number']
    merged_df = merged_df[cols]

    # 결과를 새로운 CSV로 저장 (원본 파일에 '_with_classroom'을 추가하여 저장)
    new_file = file.replace('.csv', '_classroom.csv')  # .csv 파일 이름 뒤에 '_with_classroom' 추가
    merged_df.to_csv(new_file, index=False)

    # 결과 확인 (파일마다 처리된 첫 몇 행 출력)
    print(f"Processed file: {new_file}")
    print(merged_df.head())


Processed file: /content/rating_data/sensor_data_noise_with_rating_classroom.csv
  classroom_number  id                   timestamp  value  Rating
0             5147   2  2024-06-04 22:03:30.000000     38       1
1             4147   3  2024-06-04 22:00:13.000000     45       1
2             3173   4  2024-06-04 22:04:29.000000     36       1
3             3115   5  2024-06-04 22:06:40.000000     43       1
4             5145   6  2024-06-04 22:07:17.000000     38       1
Processed file: /content/rating_data/sensor_data_pm2_5_with_rating_classroom.csv
  classroom_number  id                   timestamp  value  Rating
0             3173   2  2024-06-19 17:43:48.000000     23       1
1             3173   3  2024-06-19 17:48:38.000000     22       1
2             3173   4  2024-06-19 17:53:38.000000     19       1
3             3173   5  2024-06-19 17:58:38.000000     19       1
4             3173   6  2024-06-19 18:03:38.000000     20       1
Processed file: /content/rating_data/sensor_hu

In [24]:
import pandas as pd
import glob
import os

# 대상 폴더 경로 (현재 디렉토리 내의 모든 .csv 확장자를 가진 파일을 대상으로 처리)
folder_path = '/content/classroom_mappig'  # 폴더 경로를 변경해주세요
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))  # 모든 .csv 파일을 처리

# 모든 CSV 파일에 대해 처리
for file in csv_files:
    # 각 CSV 파일을 읽어들입니다.
    df = pd.read_csv(file)

    # 'id' 열만 제거
    df = df.drop(columns=['id'])

    # 결과를 새로운 CSV로 저장 (원본 파일에 '_without_id'을 추가하여 저장)
    new_file = file.replace('_with_rating_classroom.csv', '_without_id.csv')  # .csv 파일 이름 뒤에 '_without_id' 추가
    df.to_csv(new_file, index=False)

    # 결과 확인 (파일마다 처리된 첫 몇 행 출력)
    print(f"Processed file: {new_file}")
    print(df.head())


Processed file: /content/classroom_mappig/sensor_data_tvoc_without_id.csv
   classroom_number                   timestamp  value  Rating
0            5147.0  2024-06-04 22:03:30.000000   15.0     1.0
1            4147.0  2024-06-04 22:03:03.000000   29.0     1.0
2            3173.0  2024-06-04 22:03:59.000000   15.0     1.0
3            3115.0  2024-06-04 22:06:10.000000   15.0     1.0
4            5145.0  2024-06-04 22:09:47.000000   15.0     1.0
Processed file: /content/classroom_mappig/sensor_data_noise_without_id.csv
   classroom_number                   timestamp  value  Rating
0              5147  2024-06-04 22:03:30.000000     38       1
1              4147  2024-06-04 22:00:13.000000     45       1
2              3173  2024-06-04 22:04:29.000000     36       1
3              3115  2024-06-04 22:06:40.000000     43       1
4              5145  2024-06-04 22:07:17.000000     38       1
Processed file: /content/classroom_mappig/sensor_data_temperature_without_id.csv
   classroom_n

# **여러 전처리 과정...**

In [28]:
import pandas as pd
import glob
import os

# 대상 폴더 경로
folder_path = '/content/without_id'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))  # 모든 .csv 파일을 처리

for file in csv_files:
    # CSV 파일 읽기
    df = pd.read_csv(file)

    # 1. 결측값 처리 (예: 제거)
    df.dropna(axis=0, inplace=True)  # 결측값이 있는 행을 삭제


    # 3. timestamp 처리 (시간, 요일 추출)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek

    # 5. 'Rating'을 범주형으로 변환
    df['Rating'] = df['Rating'].astype('category')

    # 6. 처리된 데이터를 새로운 CSV로 저장
    new_file = file.replace('.csv', '_preprocessed.csv')
    df.to_csv(new_file, index=False)

    print(f"Processed file: {new_file}")
    print(df.head())


Processed file: /content/without_id/sensor_data_noise_without_id_preprocessed.csv
   classroom_number           timestamp  value Rating  hour  day_of_week
0              5147 2024-06-04 22:03:30     38      1    22            1
1              4147 2024-06-04 22:00:13     45      1    22            1
2              3173 2024-06-04 22:04:29     36      1    22            1
3              3115 2024-06-04 22:06:40     43      1    22            1
4              5145 2024-06-04 22:07:17     38      1    22            1
Processed file: /content/without_id/sensor_humidity_data_without_id_preprocessed.csv
   classroom_number           timestamp  value Rating  hour  day_of_week
0              5147 2024-06-04 22:03:30     43      1    22            1
1              4147 2024-06-04 22:03:13     54      1    22            1
2              3173 2024-06-04 22:04:29     43      1    22            1
3              3115 2024-06-04 22:06:40     43      1    22            1
4              5145 2024-06-04

# **모델 학습 및 예측 테스트**

In [65]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 센서 데이터 파일 경로 지정
sensor_files = {
    'noise': '/content/without_id/sensor_data_noise_without_id_preprocessed.csv',
    'pm25': '/content/without_id/sensor_data_pm2_5_without_id_preprocessed.csv',
    'temperature': '/content/without_id/sensor_data_temperature_without_id_preprocessed.csv',
    'tvoc': '/content/without_id/sensor_data_tvoc_without_id_preprocessed.csv',
    'humidity': '/content/without_id/sensor_humidity_data_without_id_preprocessed.csv'
}

# 모델 학습 및 저장 함수
def train_and_save_model(sensor_name, file_path):
    # 데이터 로드
    df = pd.read_csv(file_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 날짜와 시간 관련 특성 추출
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['year'] = df['timestamp'].dt.year
    df['day_of_year'] = df['timestamp'].dt.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # 'classroom_number'를 특성에 추가
    df['classroom_number'] = df['classroom_number'].astype(str)  # 강의실 번호를 문자열로 처리

    # 목표 변수와 입력 변수 설정
    X = df.drop(columns=['Rating', 'timestamp', 'value'])
    y = df['Rating']

    # 데이터 분할 (훈련 세트, 테스트 세트)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 모델 학습
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # RMSE 출력
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{sensor_name.capitalize()} Sensor RMSE: {rmse}")

    # 모델 저장
    joblib.dump(model, f'{sensor_name}_model.pkl')
    print(f"{sensor_name.capitalize()} 모델이 {sensor_name}_model.pkl 파일로 저장되었습니다.")

# 각 센서 모델 학습 및 저장
if __name__ == "__main__":
    for sensor, file_path in sensor_files.items():
        train_and_save_model(sensor, file_path)


Noise Sensor RMSE: 0.21987195118722636
Noise 모델이 noise_model.pkl 파일로 저장되었습니다.
Pm25 Sensor RMSE: 0.09877616776661462
Pm25 모델이 pm25_model.pkl 파일로 저장되었습니다.
Temperature Sensor RMSE: 0.15000384076413079
Temperature 모델이 temperature_model.pkl 파일로 저장되었습니다.
Tvoc Sensor RMSE: 0.12454413771282541
Tvoc 모델이 tvoc_model.pkl 파일로 저장되었습니다.
Humidity Sensor RMSE: 0.1345200401806945
Humidity 모델이 humidity_model.pkl 파일로 저장되었습니다.


In [73]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime

# 모델 로드 함수
def load_model(sensor_name):
    model = joblib.load(f'{sensor_name}_model.pkl')
    # 모델에 사용된 특성의 순서 가져오기
    feature_order = model.feature_names_in_
    return model, feature_order

# 예측 함수
def predict_classroom_rating(date, time, model, feature_order, classroom_number):
    # 날짜와 시간 관련 특성 생성
    timestamp = pd.to_datetime(f"{date} {time}")
    features = {
        'hour': timestamp.hour,
        'day_of_week': timestamp.dayofweek,
        'month': timestamp.month,
        'year': timestamp.year,
        'day_of_year': timestamp.dayofyear,
        'hour_sin': np.sin(2 * np.pi * timestamp.hour / 24),
        'hour_cos': np.cos(2 * np.pi * timestamp.hour / 24),
        'day_of_week_sin': np.sin(2 * np.pi * timestamp.dayofweek / 7),
        'day_of_week_cos': np.cos(2 * np.pi * timestamp.dayofweek / 7),
        'classroom_number': str(classroom_number)
    }

    # 예측할 데이터프레임 생성
    X = pd.DataFrame([features])

    # 특성 순서에 맞게 정렬
    X = X[feature_order]

    # 모델 예측 수행
    return model.predict(X)[0]

# 예측을 위한 입력 데이터 설정
classroom_numbers = [1116, 1120, 3101, 3115, 3147, 3173, 4142, 4147, 5145, 5147, 6119, 6144]
current_datetime = datetime.now()
date = current_datetime.strftime('%Y-%m-%d')
time_str = current_datetime.strftime('%H:%M')

# 모델 로드 및 특성 순서 가져오기
model_noise, feature_order_noise = load_model('noise')
model_pm25, feature_order_pm25 = load_model('pm25')
model_temperature, feature_order_temperature = load_model('temperature')
model_tvoc, feature_order_tvoc = load_model('tvoc')
model_humidity, feature_order_humidity = load_model('humidity')

# 예측 수행 및 결과 저장 리스트
exceeding_classrooms = []

# 예측 수행
for classroom_number in classroom_numbers:
    noise = predict_classroom_rating(date, time_str, model_noise, feature_order_noise, classroom_number)
    pm25 = predict_classroom_rating(date, time_str, model_pm25, feature_order_pm25, classroom_number)
    temp = predict_classroom_rating(date, time_str, model_temperature, feature_order_temperature, classroom_number)
    tvoc = predict_classroom_rating(date, time_str, model_tvoc, feature_order_tvoc, classroom_number)
    humidity = predict_classroom_rating(date, time_str, model_humidity, feature_order_humidity, classroom_number)

    # 결과 출력
    print(f"강의실 {classroom_number} 예측값:")
    print(f"  소음 수준: {noise}")
    print(f"  PM2.5 농도: {pm25}")
    print(f"  온도: {temp}")
    print(f"  TVOC 농도: {tvoc}")
    print(f"  습도: {humidity}")
    print("-" * 50)

    # 값이 3을 초과하는 경우에만 추가
    exceeding_values = {}
    if noise > 3:
        exceeding_values['소음'] = noise
    if pm25 > 3:
        exceeding_values['PM2.5'] = pm25
    if temp > 3:
        exceeding_values['온도'] = temp
    if tvoc > 3:
        exceeding_values['TVOC'] = tvoc
    if humidity > 3:
        exceeding_values['습도'] = humidity

    if exceeding_values:
        exceeding_classrooms.append({
            '강의실 번호': classroom_number,
            '이상 수치': exceeding_values
        })

# 이상 수치가 발생한 강의실 출력
if exceeding_classrooms:
    print("\n이상 수치가 발생한 강의실 목록:")
    for entry in exceeding_classrooms:
        print(f"강의실 {entry['강의실 번호']}, 이상 수치: {entry['이상 수치']}")
else:
    print("이상 수치가 발생한 강의실이 없습니다.")




강의실 1116 예측값:
  소음 수준: 4.0
  PM2.5 농도: 1.0
  온도: 4.700759194601637
  TVOC 농도: 1.0
  습도: 1.8007086900204055
--------------------------------------------------
강의실 1120 예측값:
  소음 수준: 1.0
  PM2.5 농도: 1.0
  온도: 2.0
  TVOC 농도: 1.0
  습도: 1.79
--------------------------------------------------
강의실 3101 예측값:
  소음 수준: 1.087007526357372
  PM2.5 농도: 1.0
  온도: 1.0
  TVOC 농도: 1.0
  습도: 1.8129896753432393
--------------------------------------------------
강의실 3115 예측값:
  소음 수준: 2.0
  PM2.5 농도: 1.0
  온도: 3.323622735126345
  TVOC 농도: 1.0
  습도: 1.1690950573267715
--------------------------------------------------
강의실 3147 예측값:
  소음 수준: 1.0963102224572814
  PM2.5 농도: 1.0
  온도: 1.7725359469410555
  TVOC 농도: 1.0
  습도: 1.145616117562783
--------------------------------------------------
강의실 3173 예측값:
  소음 수준: 2.0
  PM2.5 농도: 1.0
  온도: 1.0
  TVOC 농도: 1.0
  습도: 1.0
--------------------------------------------------
강의실 4142 예측값:
  소음 수준: 1.0
  PM2.5 농도: 1.0
  온도: 1.468782021216593
  TVOC 농도: 1.0
  습도: 1.2308

In [74]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import json

# 모델 로드 함수
def load_model(sensor_name):
    model = joblib.load(f'{sensor_name}_model.pkl')
    # 모델에 사용된 특성의 순서 가져오기
    feature_order = model.feature_names_in_
    return model, feature_order

# 예측 함수
def predict_classroom_rating(date, time, model, feature_order, classroom_number):
    # 날짜와 시간 관련 특성 생성
    timestamp = pd.to_datetime(f"{date} {time}")
    features = {
        'hour': timestamp.hour,
        'day_of_week': timestamp.dayofweek,
        'month': timestamp.month,
        'year': timestamp.year,
        'day_of_year': timestamp.dayofyear,
        'hour_sin': np.sin(2 * np.pi * timestamp.hour / 24),
        'hour_cos': np.cos(2 * np.pi * timestamp.hour / 24),
        'day_of_week_sin': np.sin(2 * np.pi * timestamp.dayofweek / 7),
        'day_of_week_cos': np.cos(2 * np.pi * timestamp.dayofweek / 7),
        'classroom_number': str(classroom_number)
    }

    # 예측할 데이터프레임 생성
    X = pd.DataFrame([features])

    # 특성 순서에 맞게 정렬
    X = X[feature_order]

    # 모델 예측 수행
    return model.predict(X)[0]

# 예측을 위한 입력 데이터 설정
classroom_numbers = [1116, 1120, 3101, 3115, 3147, 3173, 4142, 4147, 5145, 5147, 6119, 6144]
current_datetime = datetime.now()
date = current_datetime.strftime('%Y-%m-%d')
time_str = current_datetime.strftime('%H:%M')

# 모델 로드 및 특성 순서 가져오기
model_noise, feature_order_noise = load_model('noise')
model_pm25, feature_order_pm25 = load_model('pm25')
model_temperature, feature_order_temperature = load_model('temperature')
model_tvoc, feature_order_tvoc = load_model('tvoc')
model_humidity, feature_order_humidity = load_model('humidity')

# 예측 수행 및 결과 저장 리스트
exceeding_classrooms = []

# 예측 수행
for classroom_number in classroom_numbers:
    noise = predict_classroom_rating(date, time_str, model_noise, feature_order_noise, classroom_number)
    pm25 = predict_classroom_rating(date, time_str, model_pm25, feature_order_pm25, classroom_number)
    temp = predict_classroom_rating(date, time_str, model_temperature, feature_order_temperature, classroom_number)
    tvoc = predict_classroom_rating(date, time_str, model_tvoc, feature_order_tvoc, classroom_number)
    humidity = predict_classroom_rating(date, time_str, model_humidity, feature_order_humidity, classroom_number)

    # 이상 수치를 넘은 값들을 저장
    exceeding_values = {}
    if noise > 3:
        exceeding_values['소음'] = noise
    if pm25 > 3:
        exceeding_values['PM2.5'] = pm25
    if temp > 3:
        exceeding_values['온도'] = temp
    if tvoc > 3:
        exceeding_values['TVOC'] = tvoc
    if humidity > 3:
        exceeding_values['습도'] = humidity

    if exceeding_values:
        # JSON 형식으로 데이터를 추가
        exceeding_classrooms.append({
            '강의실 번호': classroom_number,
            '이상 수치': exceeding_values
        })

# JSON 데이터 전송을 위한 예시 (아두이노로 보내는 형식)
if exceeding_classrooms:
    # JSON 형식으로 변환
    sensor_data = json.dumps(exceeding_classrooms, ensure_ascii=False)
    print("전송할 JSON 데이터:")
    print(sensor_data)

    # 아래는 클라이언트 소켓으로 데이터를 보내는 코드 (예시)
    # client_socket.sendall(sensor_data.encode())
else:
    print("이상 수치가 발생한 강의실이 없습니다.")


전송할 JSON 데이터:
[{"강의실 번호": 1116, "이상 수치": {"소음": 4.0, "온도": 4.700759194601637}}, {"강의실 번호": 3115, "이상 수치": {"온도": 3.323622735126345}}]


In [62]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime, timedelta

# 각 센서 모델 로드
model_noise = joblib.load('/content/noise_model.pkl')
model_pm25 = joblib.load('/content/pm25_model.pkl')
model_temperature = joblib.load('/content/temperature_model.pkl')
model_tvoc = joblib.load('/content/tvoc_model.pkl')
model_humidity = joblib.load('/content/humidity_model.pkl')

# 예측에 사용할 강의실 목록
classroom_numbers = [1116, 1120, 3101, 3115, 3147, 3173, 4142, 4147, 5145, 5147, 6119, 6144]

# 예측 함수
def predict_classroom_rating(date, time, model):
    # 날짜와 시간 관련 특성 생성
    timestamp = pd.to_datetime(f"{date} {time}")
    features = {
        'hour': timestamp.hour,
        'day_of_week': timestamp.dayofweek,
        'month': timestamp.month,
        'year': timestamp.year,
        'day_of_year': timestamp.dayofyear,
        'hour_sin': np.sin(2 * np.pi * timestamp.hour / 24),
        'hour_cos': np.cos(2 * np.pi * timestamp.hour / 24),
        'day_of_week_sin': np.sin(2 * np.pi * timestamp.dayofweek / 7),
        'day_of_week_cos': np.cos(2 * np.pi * timestamp.dayofweek / 7),
    }
    # 예측 수행
    X = pd.DataFrame([features])
    return model.predict(X)[0]

# 현재 날짜 및 시간 설정
current_time = datetime.now()
#
date = current_time.strftime("%Y-%m-%d")
one_hour_later = current_time + timedelta(hours=1)
time_str = one_hour_later.strftime("%H:%M")

# 예측 결과 출력 함수
def print_predictions(classroom_number, time_label, noise, pm25, temp, tvoc, humidity):
    print(f"\n강의실 {classroom_number} {time_label} 예측값:")
    print(f"  소음 수준: {noise}")
    print(f"  PM2.5 농도: {pm25}")
    print(f"  온도: {temp}")
    print(f"  TVOC 농도: {tvoc}")
    print(f"  습도: {humidity}")

# 각 강의실 및 1시간 뒤 예측 수행 및 출력
predictions_to_filter = []

for classroom_number in classroom_numbers:
    # 1시간 후 예측 수행
    noise = predict_classroom_rating(date, time_str, model_noise)
    pm25 = predict_classroom_rating(date, time_str, model_pm25)
    temp = predict_classroom_rating(date, time_str, model_temperature)
    tvoc = predict_classroom_rating(date, time_str, model_tvoc)
    humidity = predict_classroom_rating(date, time_str, model_humidity)

    # 예측 결과 출력
    print_predictions(classroom_number, "1시간 뒤", noise, pm25, temp, tvoc, humidity)

    # 필터링을 위해 예측 값을 딕셔너리로 저장
    predictions = {
        "소음 수준": noise,
        "PM2.5 농도": pm25,
        "온도": temp,
        "TVOC 농도": tvoc,
        "습도": humidity
    }

    # 예측 값 중 3 이상인 값만 필터링하여 저장
    filtered_predictions = {key: value for key, value in predictions.items() if value >= 3}

    if filtered_predictions:  # 필터링된 값이 있으면
        predictions_to_filter.append({
            '강의실 번호': classroom_number,
            '1시간 뒤 예측값': filtered_predictions
        })

# 필터링된 결과 출력
if predictions_to_filter:
    print("\n필터링된 예측값 (3 이상인 값들):")
    for result in predictions_to_filter:
        print(result)


KeyError: "None of [Index([DecisionTreeRegressor(max_features=1.0, random_state=1608637542),\n       DecisionTreeRegressor(max_features=1.0, random_state=1273642419),\n       DecisionTreeRegressor(max_features=1.0, random_state=1935803228),\n        DecisionTreeRegressor(max_features=1.0, random_state=787846414),\n        DecisionTreeRegressor(max_features=1.0, random_state=996406378),\n       DecisionTreeRegressor(max_features=1.0, random_state=1201263687),\n        DecisionTreeRegressor(max_features=1.0, random_state=423734972),\n        DecisionTreeRegressor(max_features=1.0, random_state=415968276),\n        DecisionTreeRegressor(max_features=1.0, random_state=670094950),\n       DecisionTreeRegressor(max_features=1.0, random_state=1914837113),\n        DecisionTreeRegressor(max_features=1.0, random_state=669991378),\n        DecisionTreeRegressor(max_features=1.0, random_state=429389014),\n        DecisionTreeRegressor(max_features=1.0, random_state=249467210),\n       DecisionTreeRegressor(max_features=1.0, random_state=1972458954),\n       DecisionTreeRegressor(max_features=1.0, random_state=1572714583),\n       DecisionTreeRegressor(max_features=1.0, random_state=1433267572),\n        DecisionTreeRegressor(max_features=1.0, random_state=434285667),\n        DecisionTreeRegressor(max_features=1.0, random_state=613608295),\n        DecisionTreeRegressor(max_features=1.0, random_state=893664919),\n        DecisionTreeRegressor(max_features=1.0, random_state=648061058),\n         DecisionTreeRegressor(max_features=1.0, random_state=88409749),\n        DecisionTreeRegressor(max_features=1.0, random_state=242285876),\n       DecisionTreeRegressor(max_features=1.0, random_state=2018247425),\n        DecisionTreeRegressor(max_features=1.0, random_state=953477463),\n       DecisionTreeRegressor(max_features=1.0, random_state=1427830251),\n       DecisionTreeRegressor(max_features=1.0, random_state=1883569565),\n        DecisionTreeRegressor(max_features=1.0, random_state=911989541),\n          DecisionTreeRegressor(max_features=1.0, random_state=3344769),\n        DecisionTreeRegressor(max_features=1.0, random_state=780932287),\n       DecisionTreeRegressor(max_features=1.0, random_state=2114032571),\n        DecisionTreeRegressor(max_features=1.0, random_state=787716372),\n        DecisionTreeRegressor(max_features=1.0, random_state=504579232),\n       DecisionTreeRegressor(max_features=1.0, random_state=1306710475),\n        DecisionTreeRegressor(max_features=1.0, random_state=479546681),\n        DecisionTreeRegressor(max_features=1.0, random_state=106328085),\n         DecisionTreeRegressor(max_features=1.0, random_state=30349564),\n       DecisionTreeRegressor(max_features=1.0, random_state=1855189739),\n         DecisionTreeRegressor(max_features=1.0, random_state=99052376),\n       DecisionTreeRegressor(max_features=1.0, random_state=1250819632),\n        DecisionTreeRegressor(max_features=1.0, random_state=106406362),\n        DecisionTreeRegressor(max_features=1.0, random_state=480404538),\n       DecisionTreeRegressor(max_features=1.0, random_state=1717389822),\n        DecisionTreeRegressor(max_features=1.0, random_state=599121577),\n        DecisionTreeRegressor(max_features=1.0, random_state=200427519),\n       DecisionTreeRegressor(max_features=1.0, random_state=1254751707),\n       DecisionTreeRegressor(max_features=1.0, random_state=2034764475),\n       DecisionTreeRegressor(max_features=1.0, random_state=1573512143),\n        DecisionTreeRegressor(max_features=1.0, random_state=999745294),\n       DecisionTreeRegressor(max_features=1.0, random_state=1958805693),\n        DecisionTreeRegressor(max_features=1.0, random_state=389151677),\n       DecisionTreeRegressor(max_features=1.0, random_state=1224821422),\n        DecisionTreeRegressor(max_features=1.0, random_state=508464061),\n        DecisionTreeRegressor(max_features=1.0, random_state=857592370),\n       DecisionTreeRegressor(max_features=1.0, random_state=1642661739),\n         DecisionTreeRegressor(max_features=1.0, random_state=61136438),\n       DecisionTreeRegressor(max_features=1.0, random_state=2075460851),\n        DecisionTreeRegressor(max_features=1.0, random_state=396917567),\n       DecisionTreeRegressor(max_features=1.0, random_state=2004731384),\n        DecisionTreeRegressor(max_features=1.0, random_state=199502978),\n       DecisionTreeRegressor(max_features=1.0, random_state=1545932260),\n        DecisionTreeRegressor(max_features=1.0, random_state=461901618),\n        DecisionTreeRegressor(max_features=1.0, random_state=774414982),\n        DecisionTreeRegressor(max_features=1.0, random_state=732395540),\n       DecisionTreeRegressor(max_features=1.0, random_state=1934879560),\n        DecisionTreeRegressor(max_features=1.0, random_state=279394470),\n         DecisionTreeRegressor(max_features=1.0, random_state=56972561),\n       DecisionTreeRegressor(max_features=1.0, random_state=1927948675),\n       DecisionTreeRegressor(max_features=1.0, random_state=1899242072),\n       DecisionTreeRegressor(max_features=1.0, random_state=1999874363),\n        DecisionTreeRegressor(max_features=1.0, random_state=271820813),\n       DecisionTreeRegressor(max_features=1.0, random_state=1324556529),\n       DecisionTreeRegressor(max_features=1.0, random_state=1655351289),\n       DecisionTreeRegressor(max_features=1.0, random_state=1308306184),\n         DecisionTreeRegressor(max_features=1.0, random_state=68574553),\n        DecisionTreeRegressor(max_features=1.0, random_state=419498548),\n        DecisionTreeRegressor(max_features=1.0, random_state=991681409),\n        DecisionTreeRegressor(max_features=1.0, random_state=791274835),\n       DecisionTreeRegressor(max_features=1.0, random_state=1035196507),\n       DecisionTreeRegressor(max_features=1.0, random_state=1890440558),\n        DecisionTreeRegressor(max_features=1.0, random_state=787110843),\n        DecisionTreeRegressor(max_features=1.0, random_state=524150214),\n        DecisionTreeRegressor(max_features=1.0, random_state=472432043),\n       DecisionTreeRegressor(max_features=1.0, random_state=2126768636),\n       DecisionTreeRegressor(max_features=1.0, random_state=1431061255),\n        DecisionTreeRegressor(max_features=1.0, random_state=147697582),\n        DecisionTreeRegressor(max_features=1.0, random_state=744595490),\n       DecisionTreeRegressor(max_features=1.0, random_state=1758017741),\n       DecisionTreeRegressor(max_features=1.0, random_state=1679592528),\n       DecisionTreeRegressor(max_features=1.0, random_state=1111451555),\n        DecisionTreeRegressor(max_features=1.0, random_state=782698033),\n        DecisionTreeRegressor(max_features=1.0, random_state=698027879),\n       DecisionTreeRegressor(max_features=1.0, random_state=1096768899),\n       DecisionTreeRegressor(max_features=1.0, random_state=1338788865),\n       DecisionTreeRegressor(max_features=1.0, random_state=1826030589),\n         DecisionTreeRegressor(max_features=1.0, random_state=86191493),\n        DecisionTreeRegressor(max_features=1.0, random_state=893102645),\n        DecisionTreeRegressor(max_features=1.0, random_state=200619113),\n        DecisionTreeRegressor(max_features=1.0, random_state=290770691),\n        DecisionTreeRegressor(max_features=1.0, random_state=793943861),\n        DecisionTreeRegressor(max_features=1.0, random_state=134489564)],\n      dtype='object')] are in the [columns]"

In [None]:
# 예측값 필터링 함수 (3 이상인 값만 필터링)
def filter_predictions(predictions):
    # 3 이상인 값들만 반환
    return {key: value for key, value in predictions.items() if value >= 3}

# 예측 결과 출력 함수
def print_predictions(classroom_number, time_label, predictions):
    filtered_predictions = filter_predictions(predictions)

    # 필터링된 예측값이 있으면 출력
    if filtered_predictions:
        print(f"\n강의실 {classroom_number} {time_label} 예측값:")
        print(filtered_predictions)

In [32]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 데이터 로드 (CSV 파일 경로를 변경해주세요)
df = pd.read_csv('/content/without_id/sensor_data_temperature_without_id_preprocessed.csv')

# 1. 'timestamp'를 datetime 형식으로 변환
df['timestamp'] = pd.to_datetime(df['timestamp'])

# 2. 날짜와 시간 관련 특성 추출
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year
df['day_of_year'] = df['timestamp'].dt.dayofyear

# 3. 주기적인 시간 특성 sin/cos 변환
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# 4. 목표 변수 'Rating' (혹은 예측할 수치 변수)와 입력 변수 설정
X = df.drop(columns=['Rating', 'timestamp', 'classroom_number', 'value'])  # 'Rating', 'timestamp', 'classroom_number', 'value' 제외
y = df['Rating']  # 예측할 목표 변수 (예: 'value' 혹은 'Rating')

# 5. 데이터 분할 (훈련 세트, 테스트 세트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 랜덤 포레스트 회귀 모델 학습
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 7. 성능 평가 (RMSE)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# 8. 모델 예측 결과 예시
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions.head())

# 9. 사용자 입력을 통한 예측
def predict_classroom_rating(classroom_number, date, time):
    # 입력된 날짜와 시간을 기반으로 예측
    timestamp = pd.to_datetime(f'{date} {time}')  # 'YYYY-MM-DD HH:MM' 형식으로 입력 받음
    hour = timestamp.hour
    day_of_week = timestamp.dayofweek
    month = timestamp.month
    year = timestamp.year
    day_of_year = timestamp.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)
    day_of_week_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_of_week_cos = np.cos(2 * np.pi * day_of_week / 7)

    # 입력 특성 배열 (classroom_number와 value 제외)
    input_data = {
        'hour': hour,
        'day_of_week': day_of_week,
        'month': month,
        'year': year,
        'day_of_year': day_of_year,
        'hour_sin': hour_sin,
        'hour_cos': hour_cos,
        'day_of_week_sin': day_of_week_sin,
        'day_of_week_cos': day_of_week_cos,
    }

    # 사용자 입력값을 데이터프레임으로 변환
    input_df = pd.DataFrame([input_data])

    # 예측
    prediction = model.predict(input_df)
    return prediction[0]

# 사용자 입력 예시
classroom_number = int(input("강의실 번호를 입력하세요: "))
date = input("날짜를 입력하세요 (YYYY-MM-DD 형식): ")
time = input("시간을 입력하세요 (HH:MM 형식): ")

predicted_rating = predict_classroom_rating(classroom_number, date, time)
print(f"예측된 강의실 {classroom_number}의 수치: {predicted_rating}")


RMSE: 1.2073033432474887
        Actual  Predicted
150371     2.0   3.714932
620722     2.0   1.987520
763798     1.0   1.823174
342486     4.0   4.045406
292098     1.0   3.788502
강의실 번호를 입력하세요: 5145
날짜를 입력하세요 (YYYY-MM-DD 형식): 2024-11-15
시간을 입력하세요 (HH:MM 형식): 15:00
예측된 강의실 5145의 수치: 1.4484622136870984


In [36]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 각 센서 데이터 로드 (CSV 파일 경로를 변경해주세요)
df_noise = pd.read_csv('/content/without_id/sensor_data_noise_without_id_preprocessed.csv')
df_pm25 = pd.read_csv('/content/without_id/sensor_data_pm2_5_without_id_preprocessed.csv')
df_temperature = pd.read_csv('/content/without_id/sensor_data_temperature_without_id_preprocessed.csv')
df_tvoc = pd.read_csv('/content/without_id/sensor_data_tvoc_without_id_preprocessed.csv')
df_humidity = pd.read_csv('/content/without_id/sensor_humidity_data_without_id_preprocessed.csv')

# 함수로 모델 학습
def train_model(df):
    # 'timestamp'를 datetime 형식으로 변환
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 날짜와 시간 관련 특성 추출
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['year'] = df['timestamp'].dt.year
    df['day_of_year'] = df['timestamp'].dt.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # 목표 변수 'Rating' (혹은 예측할 수치 변수)와 입력 변수 설정
    X = df.drop(columns=['Rating', 'timestamp', 'classroom_number', 'value'])  # 'Rating', 'timestamp', 'classroom_number', 'value' 제외
    y = df['Rating']  # 예측할 목표 변수 (예: 'value' 혹은 'Rating')

    # 데이터 분할 (훈련 세트, 테스트 세트)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 랜덤 포레스트 회귀 모델 학습
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # 성능 평가 (RMSE)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse}")

    return model

# 모델 학습
model_noise = train_model(df_noise)
model_pm25 = train_model(df_pm25)
model_temperature = train_model(df_temperature)
model_tvoc = train_model(df_tvoc)
model_humidity = train_model(df_humidity)

# 사용자 입력을 통한 예측
def predict_classroom_rating(classroom_number, date, time, model):
    # 입력된 날짜와 시간을 기반으로 예측
    timestamp = pd.to_datetime(f'{date} {time}')  # 'YYYY-MM-DD HH:MM' 형식으로 입력 받음
    hour = timestamp.hour
    day_of_week = timestamp.dayofweek
    month = timestamp.month
    year = timestamp.year
    day_of_year = timestamp.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)
    day_of_week_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_of_week_cos = np.cos(2 * np.pi * day_of_week / 7)

    # 입력 특성 배열 (classroom_number와 value 제외)
    input_data = {
        'hour': hour,
        'day_of_week': day_of_week,
        'month': month,
        'year': year,
        'day_of_year': day_of_year,
        'hour_sin': hour_sin,
        'hour_cos': hour_cos,
        'day_of_week_sin': day_of_week_sin,
        'day_of_week_cos': day_of_week_cos,
    }

    # 사용자 입력값을 데이터프레임으로 변환
    input_df = pd.DataFrame([input_data])

    # 예측
    prediction = model.predict(input_df)
    return prediction[0]

# 사용자 입력 예시
classroom_number = int(input("강의실 번호를 입력하세요: "))
date = input("날짜를 입력하세요 (YYYY-MM-DD 형식): ")
time = input("시간을 입력하세요 (HH:MM 형식): ")

# 각 센서 모델에 대한 예측값 출력
predicted_noise = predict_classroom_rating(classroom_number, date, time, model_noise)
predicted_pm25 = predict_classroom_rating(classroom_number, date, time, model_pm25)
predicted_temperature = predict_classroom_rating(classroom_number, date, time, model_temperature)
predicted_tvoc = predict_classroom_rating(classroom_number, date, time, model_tvoc)
predicted_humidity = predict_classroom_rating(classroom_number, date, time, model_humidity)

print(f"예측된 강의실 {classroom_number}의 센서 값:")
print(f"소음 수준: {predicted_noise}")
print(f"PM2.5 농도: {predicted_pm25}")
print(f"온도: {predicted_temperature}")
print(f"TVOC 농도: {predicted_tvoc}")
print(f"습도: {predicted_humidity}")


RMSE: 0.9758538825938903
RMSE: 0.15709890323804399
RMSE: 1.2073033432474887
RMSE: 0.2993053187539628
RMSE: 0.8897229044660403
강의실 번호를 입력하세요: 5145
날짜를 입력하세요 (YYYY-MM-DD 형식): 2025-08-10
시간을 입력하세요 (HH:MM 형식): 13:00
예측된 강의실 5145의 센서 값:
소음 수준: 1.548027017332685
PM2.5 농도: 1.0
온도: 4.047358128152161
TVOC 농도: 1.0243915910642927
습도: 2.979842456373736


In [None]:
# 사용자 입력 예시
classroom_number = int(input("강의실 번호를 입력하세요: "))
date = input("날짜를 입력하세요 (YYYY-MM-DD 형식): ")
time = input("시간을 입력하세요 (HH:MM 형식): ")

# 각 센서 모델에 대한 예측값 출력
predicted_noise = predict_classroom_rating(classroom_number, date, time, model_noise)
predicted_pm25 = predict_classroom_rating(classroom_number, date, time, model_pm25)
predicted_temperature = predict_classroom_rating(classroom_number, date, time, model_temperature)
predicted_tvoc = predict_classroom_rating(classroom_number, date, time, model_tvoc)
predicted_humidity = predict_classroom_rating(classroom_number, date, time, model_humidity)

print(f"예측된 강의실 {classroom_number}의 센서 값:")
print(f"소음 수준: {predicted_noise}")
print(f"PM2.5 농도: {predicted_pm25}")
print(f"온도: {predicted_temperature}")
print(f"TVOC 농도: {predicted_tvoc}")
print(f"습도: {predicted_humidity}")

In [38]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 각 센서 데이터 로드 (CSV 파일 경로를 변경해주세요)
df_noise = pd.read_csv('/content/without_id/sensor_data_noise_without_id_preprocessed.csv')
df_pm25 = pd.read_csv('/content/without_id/sensor_data_pm2_5_without_id_preprocessed.csv')
df_temperature = pd.read_csv('/content/without_id/sensor_data_temperature_without_id_preprocessed.csv')
df_tvoc = pd.read_csv('/content/without_id/sensor_data_tvoc_without_id_preprocessed.csv')
df_humidity = pd.read_csv('/content/without_id/sensor_humidity_data_without_id_preprocessed.csv')

# 함수로 모델 학습
def train_model(df):
    # 'timestamp'를 datetime 형식으로 변환
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 날짜와 시간 관련 특성 추출
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['year'] = df['timestamp'].dt.year
    df['day_of_year'] = df['timestamp'].dt.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # 목표 변수 'Rating' (혹은 예측할 수치 변수)와 입력 변수 설정
    X = df.drop(columns=['Rating', 'timestamp', 'classroom_number', 'value'])  # 'Rating', 'timestamp', 'classroom_number', 'value' 제외
    y = df['Rating']  # 예측할 목표 변수 (예: 'value' 혹은 'Rating')

    # 데이터 분할 (훈련 세트, 테스트 세트)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 랜덤 포레스트 회귀 모델 학습
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # 성능 평가 (RMSE)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse}")

    # 특성 중요도 추출
    feature_importance = model.feature_importances_
    feature_names = X.columns

    # 중요도와 특성 이름을 데이터프레임으로 변환
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    })

    # 중요도를 기준으로 내림차순 정렬
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    return model, feature_importance_df

# 모델 학습 및 특성 중요도 출력
model_noise, importance_noise = train_model(df_noise)
model_pm25, importance_pm25 = train_model(df_pm25)
model_temperature, importance_temperature = train_model(df_temperature)
model_tvoc, importance_tvoc = train_model(df_tvoc)
model_humidity, importance_humidity = train_model(df_humidity)

# 사용자 입력을 통한 예측
def predict_classroom_rating(classroom_number, date, time, model):
    # 입력된 날짜와 시간을 기반으로 예측
    timestamp = pd.to_datetime(f'{date} {time}')  # 'YYYY-MM-DD HH:MM' 형식으로 입력 받음
    hour = timestamp.hour
    day_of_week = timestamp.dayofweek
    month = timestamp.month
    year = timestamp.year
    day_of_year = timestamp.dayofyear

    # 주기적인 시간 특성 sin/cos 변환
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)
    day_of_week_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_of_week_cos = np.cos(2 * np.pi * day_of_week / 7)

    # 입력 특성 배열 (classroom_number와 value 제외)
    input_data = {
        'hour': hour,
        'day_of_week': day_of_week,
        'month': month,
        'year': year,
        'day_of_year': day_of_year,
        'hour_sin': hour_sin,
        'hour_cos': hour_cos,
        'day_of_week_sin': day_of_week_sin,
        'day_of_week_cos': day_of_week_cos,
    }

    # 사용자 입력값을 데이터프레임으로 변환
    input_df = pd.DataFrame([input_data])

    # 예측
    prediction = model.predict(input_df)
    return prediction[0]

# 특성 중요도 출력
print("\n소음 센서 모델 특성 중요도:")
print(importance_noise)

print("\nPM2.5 센서 모델 특성 중요도:")
print(importance_pm25)

print("\n온도 센서 모델 특성 중요도:")
print(importance_temperature)

print("\nTVOC 센서 모델 특성 중요도:")
print(importance_tvoc)

print("\n습도 센서 모델 특성 중요도:")
print(importance_humidity)

# 사용자 입력 예시
classroom_number = int(input("강의실 번호를 입력하세요: "))
date = input("날짜를 입력하세요 (YYYY-MM-DD 형식): ")
time = input("시간을 입력하세요 (HH:MM 형식): ")

# 각 센서 모델에 대한 예측값 출력
predicted_noise = predict_classroom_rating(classroom_number, date, time, model_noise)
predicted_pm25 = predict_classroom_rating(classroom_number, date, time, model_pm25)
predicted_temperature = predict_classroom_rating(classroom_number, date, time, model_temperature)
predicted_tvoc = predict_classroom_rating(classroom_number, date, time, model_tvoc)
predicted_humidity = predict_classroom_rating(classroom_number, date, time, model_humidity)

print(f"예측된 강의실 {classroom_number}의 센서 값:")
print(f"소음 수준: {predicted_noise}")
print(f"PM2.5 농도: {predicted_pm25}")
print(f"온도: {predicted_temperature}")
print(f"TVOC 농도: {predicted_tvoc}")
print(f"습도: {predicted_humidity}")



RMSE: 0.9758538825938903
특성 중요도:
           Feature  Importance
4      day_of_year    0.421291
0             hour    0.180434
6         hour_cos    0.117054
5         hour_sin    0.089884
7  day_of_week_sin    0.076497
1      day_of_week    0.073837
8  day_of_week_cos    0.031722
2            month    0.009280
3             year    0.000000
RMSE: 0.15709890323804399
특성 중요도:
           Feature  Importance
4      day_of_year    0.393749
0             hour    0.250234
7  day_of_week_sin    0.113279
8  day_of_week_cos    0.065829
6         hour_cos    0.063226
5         hour_sin    0.059049
1      day_of_week    0.051982
2            month    0.002652
3             year    0.000000
RMSE: 1.2073033432474887
특성 중요도:
           Feature  Importance
4      day_of_year    0.840045
0             hour    0.056245
1      day_of_week    0.038886
7  day_of_week_sin    0.028329
6         hour_cos    0.014008
5         hour_sin    0.011626
8  day_of_week_cos    0.008447
2            month    0.002414
3

In [35]:
# 사용자 입력 예시
classroom_number = int(input("강의실 번호를 입력하세요: "))
date = input("날짜를 입력하세요 (YYYY-MM-DD 형식): ")
time = input("시간을 입력하세요 (HH:MM 형식): ")

predicted_rating = predict_classroom_rating(classroom_number, date, time)
print(f"예측된 강의실 {classroom_number}의 수치: {predicted_rating}")

강의실 번호를 입력하세요: 5145
날짜를 입력하세요 (YYYY-MM-DD 형식): 2025-08-10
시간을 입력하세요 (HH:MM 형식): 20:10
예측된 강의실 5145의 수치: 4.09933376805165
