In [1]:
import pandas as pd
import os

# Load the main dataset; each row is looked up in the weather folder by its `id`.
df = pd.read_csv("airpol_testv2.csv")

# Variables whose per-file mean becomes a new "<name>_mean" column on `df`.
# NOTE(review): wind_direction_* are averaged arithmetically like every other
# column; a circular mean may be more appropriate for angles. Left unchanged so
# the features stay consistent with any existing pipeline -- confirm.
weather_columns = [
    "temperature_2m", "wind_speed_10m", "wind_speed_100m",
    "wind_direction_10m", "wind_direction_100m",
    "precipitation", "relative_humidity_2m", "surface_pressure",
    "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
    "pm10", "pm2_5", "carbon_monoxide", "nitrogen_dioxide",
    "sulphur_dioxide", "ozone", "dust", "uv_index"
]

# Folder holding one weather CSV per record, named "id_<id>.0.csv".
WEATHER_DIR = "test_weather_outputs"


def _weather_means(record_id):
    """Return ``{"<col>_mean": value}`` for the weather file of one record.

    Every entry defaults to NaN, so a missing file or a missing column leaves
    NaN in the result. A message is printed in either case; nothing is raised.
    """
    file_id = f"id_{record_id}.0.csv"
    file_path = os.path.join(WEATHER_DIR, file_id)
    means = {col + "_mean": float("nan") for col in weather_columns}

    if not os.path.exists(file_path):
        print(f"Hata: Dosya bulunamadı: {file_path}")
        return means

    weather_df = pd.read_csv(file_path)
    for col in weather_columns:
        if col in weather_df.columns:
            means[col + "_mean"] = weather_df[col].mean()
        else:
            print(f"Uyarı: {col} kolonu {file_id} içinde yok.")
    return means


# Iterate the `id` column directly instead of df.iterrows(): iterrows() casts
# each row to a single dtype, which can turn an integer id into a float and
# silently change the file name that gets looked up.
mean_columns = [col + "_mean" for col in weather_columns]
weather_means = pd.DataFrame(
    [_weather_means(record_id) for record_id in df["id"]],
    index=df.index,
    columns=mean_columns,
).astype(float)

# Attach the means as float columns in one pass (no object-dtype placeholders).
for mean_col in mean_columns:
    df[mean_col] = weather_means[mean_col]

# Final result: make sure the frame has a clean 0..n-1 index.
df = df.reset_index(drop=True)


In [2]:
# Preview the first rows to check the newly added *_mean columns.
df.head()

Unnamed: 0,id,latitude,longitude,day_of_year,day_of_week,hour,month,temperature_2m_mean,wind_speed_10m_mean,wind_speed_100m_mean,...,cloud_cover_mid_mean,cloud_cover_high_mean,pm10_mean,pm2_5_mean,carbon_monoxide_mean,nitrogen_dioxide_mean,sulphur_dioxide_mean,ozone_mean,dust_mean,uv_index_mean
0,0,-42.343,147.372,31,4,15,1,14.423543,18.19803,27.960992,...,22.66129,26.530914,5.218683,3.290726,68.145161,0.546909,0.419758,30.955645,0.012097,1.980242
1,1,37.053,127.406,28,1,12,1,-1.051409,9.170616,15.82732,...,29.323925,26.120968,30.773387,19.953629,507.086022,36.785753,23.763575,28.580645,0.505376,0.355108
2,2,49.105,-123.189,14,1,2,1,3.128764,12.208699,21.194252,...,64.142473,66.490591,9.71922,6.628763,275.232527,30.327016,6.616935,39.486559,0.0,0.07547
3,3,42.014,12.774,10,4,3,1,8.902548,7.639687,12.258187,...,28.061828,43.840054,15.063978,11.918414,230.193548,9.189247,0.805511,49.43414,0.508065,0.222312
4,4,47.778,13.002,31,4,11,1,3.937691,6.914289,11.275967,...,45.842742,49.954301,12.254973,10.309677,228.377688,16.482392,0.495296,46.439516,0.86828,0.211156


In [3]:
# Same preview as the previous cell; no code runs between the two calls.
df.head()

Unnamed: 0,id,latitude,longitude,day_of_year,day_of_week,hour,month,temperature_2m_mean,wind_speed_10m_mean,wind_speed_100m_mean,...,cloud_cover_mid_mean,cloud_cover_high_mean,pm10_mean,pm2_5_mean,carbon_monoxide_mean,nitrogen_dioxide_mean,sulphur_dioxide_mean,ozone_mean,dust_mean,uv_index_mean
0,0,-42.343,147.372,31,4,15,1,14.423543,18.19803,27.960992,...,22.66129,26.530914,5.218683,3.290726,68.145161,0.546909,0.419758,30.955645,0.012097,1.980242
1,1,37.053,127.406,28,1,12,1,-1.051409,9.170616,15.82732,...,29.323925,26.120968,30.773387,19.953629,507.086022,36.785753,23.763575,28.580645,0.505376,0.355108
2,2,49.105,-123.189,14,1,2,1,3.128764,12.208699,21.194252,...,64.142473,66.490591,9.71922,6.628763,275.232527,30.327016,6.616935,39.486559,0.0,0.07547
3,3,42.014,12.774,10,4,3,1,8.902548,7.639687,12.258187,...,28.061828,43.840054,15.063978,11.918414,230.193548,9.189247,0.805511,49.43414,0.508065,0.222312
4,4,47.778,13.002,31,4,11,1,3.937691,6.914289,11.275967,...,45.842742,49.954301,12.254973,10.309677,228.377688,16.482392,0.495296,46.439516,0.86828,0.211156


In [4]:
# List the column names after the per-file weather means were added.
df.columns

Index(['id', 'latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour',
       'month', 'temperature_2m_mean', 'wind_speed_10m_mean',
       'wind_speed_100m_mean', 'wind_direction_10m_mean',
       'wind_direction_100m_mean', 'precipitation_mean',
       'relative_humidity_2m_mean', 'surface_pressure_mean',
       'cloud_cover_mean', 'cloud_cover_low_mean', 'cloud_cover_mid_mean',
       'cloud_cover_high_mean', 'pm10_mean', 'pm2_5_mean',
       'carbon_monoxide_mean', 'nitrogen_dioxide_mean', 'sulphur_dioxide_mean',
       'ozone_mean', 'dust_mean', 'uv_index_mean'],
      dtype='object')

In [5]:
import numpy as np
import pandas as pd

def cyclical_transform(df):
    """Add sine/cosine encodings of the periodic time columns to ``df``.

    For each of ``hour``, ``day_of_week``, ``day_of_year`` and ``month`` the
    value is scaled onto a circle using the cycle length listed below, and
    the sine and cosine are stored as ``<column>_sin`` / ``<column>_cos``.

    Args:
        df: DataFrame containing the four source columns.

    Returns:
        The same DataFrame object, modified in place with eight new columns.

    Raises:
        KeyError: if one of the source columns is missing.
    """
    # (source column, number of steps in one full cycle)
    cycles = (
        ('hour', 24),
        ('day_of_week', 7),
        ('day_of_year', 365),  # day 366 of a leap year goes slightly past one turn
        ('month', 12),
    )

    for name, period in cycles:
        angle = 2 * np.pi * df[name] / period
        df[f'{name}_sin'] = np.sin(angle)
        df[f'{name}_cos'] = np.cos(angle)

    return df

In [6]:
# Add the sin/cos time encodings (cyclical_transform also modifies `df` in place).
df = cyclical_transform(df)

In [7]:
# Preview the frame with the new *_sin / *_cos columns.
df.head()

Unnamed: 0,id,latitude,longitude,day_of_year,day_of_week,hour,month,temperature_2m_mean,wind_speed_10m_mean,wind_speed_100m_mean,...,dust_mean,uv_index_mean,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos,month_sin,month_cos
0,0,-42.343,147.372,31,4,15,1,14.423543,18.19803,27.960992,...,0.012097,1.980242,-0.7071068,-0.707107,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025
1,1,37.053,127.406,28,1,12,1,-1.051409,9.170616,15.82732,...,0.505376,0.355108,1.224647e-16,-1.0,0.781831,0.62349,0.46355,0.886071,0.5,0.866025
2,2,49.105,-123.189,14,1,2,1,3.128764,12.208699,21.194252,...,0.0,0.07547,0.5,0.866025,0.781831,0.62349,0.238673,0.9711,0.5,0.866025
3,3,42.014,12.774,10,4,3,1,8.902548,7.639687,12.258187,...,0.508065,0.222312,0.7071068,0.707107,-0.433884,-0.900969,0.171293,0.98522,0.5,0.866025
4,4,47.778,13.002,31,4,11,1,3.937691,6.914289,11.275967,...,0.86828,0.211156,0.258819,-0.965926,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025


In [8]:
def add_binary_time_features(df):
    """Add 0/1 ``is_weekend`` and ``is_night`` flag columns to ``df``.

    ``is_weekend`` is 1 where ``day_of_week`` is 0 or 6, otherwise 0.
    ``is_night`` is 1 where ``hour`` is 22 or later, or earlier than 7,
    otherwise 0. Missing values yield 0 in both columns.

    NOTE(review): treating 0 and 6 as the weekend assumes a Sunday=0 ...
    Saturday=6 numbering. If ``day_of_week`` follows the pandas convention
    (Monday=0 ... Sunday=6) the weekend would be 5 and 6 instead -- confirm
    against the code that produced the column before changing anything.

    Args:
        df: DataFrame with ``day_of_week`` and ``hour`` columns.

    Returns:
        The same DataFrame object, modified in place with the two new columns.
    """
    # Vectorized membership/comparison instead of a Python lambda per row.
    df['is_weekend'] = df['day_of_week'].isin([0, 6]).astype('int64')

    hour = df['hour']
    df['is_night'] = ((hour >= 22) | (hour < 7)).astype('int64')
    return df

# Add the is_weekend / is_night flags (the function also modifies `df` in place).
df = add_binary_time_features(df)

In [9]:
# Preview the frame with the new is_weekend / is_night flags.
df.head()

Unnamed: 0,id,latitude,longitude,day_of_year,day_of_week,hour,month,temperature_2m_mean,wind_speed_10m_mean,wind_speed_100m_mean,...,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos,month_sin,month_cos,is_weekend,is_night
0,0,-42.343,147.372,31,4,15,1,14.423543,18.19803,27.960992,...,-0.7071068,-0.707107,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025,0,0
1,1,37.053,127.406,28,1,12,1,-1.051409,9.170616,15.82732,...,1.224647e-16,-1.0,0.781831,0.62349,0.46355,0.886071,0.5,0.866025,0,0
2,2,49.105,-123.189,14,1,2,1,3.128764,12.208699,21.194252,...,0.5,0.866025,0.781831,0.62349,0.238673,0.9711,0.5,0.866025,0,1
3,3,42.014,12.774,10,4,3,1,8.902548,7.639687,12.258187,...,0.7071068,0.707107,-0.433884,-0.900969,0.171293,0.98522,0.5,0.866025,0,1
4,4,47.778,13.002,31,4,11,1,3.937691,6.914289,11.275967,...,0.258819,-0.965926,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025,0,0


In [10]:
# List the columns after the binary time flags were added.
df.columns

Index(['id', 'latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour',
       'month', 'temperature_2m_mean', 'wind_speed_10m_mean',
       'wind_speed_100m_mean', 'wind_direction_10m_mean',
       'wind_direction_100m_mean', 'precipitation_mean',
       'relative_humidity_2m_mean', 'surface_pressure_mean',
       'cloud_cover_mean', 'cloud_cover_low_mean', 'cloud_cover_mid_mean',
       'cloud_cover_high_mean', 'pm10_mean', 'pm2_5_mean',
       'carbon_monoxide_mean', 'nitrogen_dioxide_mean', 'sulphur_dioxide_mean',
       'ozone_mean', 'dust_mean', 'uv_index_mean', 'hour_sin', 'hour_cos',
       'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin',
       'day_of_year_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_night'],
      dtype='object')

In [11]:
def assign_season(df):
    """Add a ``season`` column with a season name derived from ``month``.

    Months 12, 1, 2 map to 'winter'; 3-5 to 'spring'; 6-8 to 'summer'.
    Every other value (including anything outside 1-12) maps to 'autumn'.

    NOTE(review): the mapping is the same for every row regardless of
    latitude, i.e. it follows northern-hemisphere seasons -- confirm this is
    intended for southern-hemisphere locations.

    Args:
        df: DataFrame with a ``month`` column.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    # Checked in order; anything that matches no entry falls through to autumn.
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )

    def get_season(month):
        for season_name, months in season_months:
            if month in months:
                return season_name
        return 'autumn'

    df['season'] = df['month'].apply(get_season)
    return df

In [12]:
# Integer code for each season name produced by assign_season.
season_codes = {
    'winter': 0,
    'spring': 1,
    'summer': 2,
    'autumn': 3,
}

# Derive the season name from `month`, then replace the name with its code.
df = assign_season(df)
df['season'] = df['season'].map(season_codes)

In [13]:
# Preview the frame with the integer-coded season column.
df.head()

Unnamed: 0,id,latitude,longitude,day_of_year,day_of_week,hour,month,temperature_2m_mean,wind_speed_10m_mean,wind_speed_100m_mean,...,hour_cos,day_of_week_sin,day_of_week_cos,day_of_year_sin,day_of_year_cos,month_sin,month_cos,is_weekend,is_night,season
0,0,-42.343,147.372,31,4,15,1,14.423543,18.19803,27.960992,...,-0.707107,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025,0,0,0
1,1,37.053,127.406,28,1,12,1,-1.051409,9.170616,15.82732,...,-1.0,0.781831,0.62349,0.46355,0.886071,0.5,0.866025,0,0,0
2,2,49.105,-123.189,14,1,2,1,3.128764,12.208699,21.194252,...,0.866025,0.781831,0.62349,0.238673,0.9711,0.5,0.866025,0,1,0
3,3,42.014,12.774,10,4,3,1,8.902548,7.639687,12.258187,...,0.707107,-0.433884,-0.900969,0.171293,0.98522,0.5,0.866025,0,1,0
4,4,47.778,13.002,31,4,11,1,3.937691,6.914289,11.275967,...,-0.965926,-0.433884,-0.900969,0.508671,0.860961,0.5,0.866025,0,0,0


In [14]:
# List the columns after the season column was added.
df.columns

Index(['id', 'latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour',
       'month', 'temperature_2m_mean', 'wind_speed_10m_mean',
       'wind_speed_100m_mean', 'wind_direction_10m_mean',
       'wind_direction_100m_mean', 'precipitation_mean',
       'relative_humidity_2m_mean', 'surface_pressure_mean',
       'cloud_cover_mean', 'cloud_cover_low_mean', 'cloud_cover_mid_mean',
       'cloud_cover_high_mean', 'pm10_mean', 'pm2_5_mean',
       'carbon_monoxide_mean', 'nitrogen_dioxide_mean', 'sulphur_dioxide_mean',
       'ozone_mean', 'dust_mean', 'uv_index_mean', 'hour_sin', 'hour_cos',
       'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin',
       'day_of_year_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_night',
       'season'],
      dtype='object')

In [15]:
import pickle

In [16]:
# Coordinates passed to the clustering model, copied so `df` itself is not touched.
coords = df.loc[:, ['latitude', 'longitude']].copy()

# Load the previously saved model object (presumably a fitted KMeans -- confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open("kmeans_model.pkl", "rb") as model_file:
    kmeans = pickle.load(model_file)

# Store the model's prediction for each row as the `cluster` column.
df['cluster'] = kmeans.predict(coords)

In [17]:
# List the columns after the cluster column was added.
df.columns

Index(['id', 'latitude', 'longitude', 'day_of_year', 'day_of_week', 'hour',
       'month', 'temperature_2m_mean', 'wind_speed_10m_mean',
       'wind_speed_100m_mean', 'wind_direction_10m_mean',
       'wind_direction_100m_mean', 'precipitation_mean',
       'relative_humidity_2m_mean', 'surface_pressure_mean',
       'cloud_cover_mean', 'cloud_cover_low_mean', 'cloud_cover_mid_mean',
       'cloud_cover_high_mean', 'pm10_mean', 'pm2_5_mean',
       'carbon_monoxide_mean', 'nitrogen_dioxide_mean', 'sulphur_dioxide_mean',
       'ozone_mean', 'dust_mean', 'uv_index_mean', 'hour_sin', 'hour_cos',
       'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin',
       'day_of_year_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_night',
       'season', 'cluster'],
      dtype='object')

In [18]:
# Write the feature-engineered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("nmsairpol_testv2.csv", index=False)