In [1]:
import numpy as np
import pandas as pd

In [2]:
# Path to the raw apartments CSV, relative to the notebook's working directory.
apartments = "../datasets/apartments.csv"
# Load the raw listing into a DataFrame; later cells derive cleaned_df from it.
apartments_df = pd.read_csv(apartments)
# Print the column names, non-null counts and dtypes to see where values are missing.
apartments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21501 entries, 0 to 21500
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    21501 non-null  object 
 1   city                  21501 non-null  object 
 2   type                  17104 non-null  object 
 3   squareMeters          21501 non-null  float64
 4   rooms                 21501 non-null  float64
 5   floor                 17928 non-null  float64
 6   floorCount            21292 non-null  float64
 7   buildYear             18121 non-null  float64
 8   latitude              21501 non-null  float64
 9   longitude             21501 non-null  float64
 10  centreDistance        21501 non-null  float64
 11  poiCount              21501 non-null  float64
 12  schoolDistance        21490 non-null  float64
 13  clinicDistance        21438 non-null  float64
 14  postOfficeDistance    21481 non-null  float64
 15  kindergartenDistanc

In [3]:
# Drop the buildingMaterial and condition columns, then keep only the rows that
# have no missing value in any remaining column. dropna() (axis=0, how="any" by
# default) already removes rows whose `type` is missing, so a separate filter on
# `type` is not needed.
# NOTE(review): process_row_final (defined later in this notebook) looks up
# 'condition'; because the column is dropped here, rows read back from the
# cleaned CSV always take that lookup's fallback code - confirm this is intended.
cleaned_df = apartments_df.drop(columns=["buildingMaterial", "condition"]).dropna()

In [4]:
# Show (rows, columns) of the cleaned frame.
cleaned_df.shape

(11986, 26)

In [5]:
# Count the rows whose id is missing. Summing the boolean mask directly gives the
# real count; collapsing it with .any() first would cap the result at 1.
cleaned_df["id"].isnull().sum()

0

In [6]:
# Print the number of missing values in every column, one "name: count" line each.
missing_per_column = cleaned_df.isnull().sum()
for column_name, missing_count in missing_per_column.items():
    print(f"{column_name}: {missing_count}")

id: 0
city: 0
type: 0
squareMeters: 0
rooms: 0
floor: 0
floorCount: 0
buildYear: 0
latitude: 0
longitude: 0
centreDistance: 0
poiCount: 0
schoolDistance: 0
clinicDistance: 0
postOfficeDistance: 0
kindergartenDistance: 0
restaurantDistance: 0
collegeDistance: 0
pharmacyDistance: 0
ownership: 0
hasParkingSpace: 0
hasBalcony: 0
hasElevator: 0
hasSecurity: 0
hasStorageRoom: 0
price: 0


In [7]:
# Persist the cleaned frame without the index column; a later cell reads this file back.
cleaned_df.to_csv("../datasets/cleaned_apartments.csv", index=False)

In [8]:
# %% 1) Imports and function definitions
import pandas as pd
import numpy as np

def assign_split(uuid: str, train_ids, test_ids, val_ids) -> str:
    """
    Return the name of the split whose id collection contains ``uuid``.

    The collections are checked in the order train, test, val and the first
    match wins. If ``uuid`` is in none of them, 'unknown' is returned.
    """
    ordered_splits = (
        ("train", train_ids),
        ("test", test_ids),
        ("val", val_ids),
    )
    for split_name, members in ordered_splits:
        if uuid in members:
            return split_name
    return "unknown"

def process_row_final(row: pd.Series, normalize: bool = True) -> pd.Series:
    """
    Turn one raw apartment record into a feature row.

    Steps, in order:
      1. amenity flags become booleans (only the string 'yes', compared
         case-insensitively, maps to True; anything else, including a
         missing key, maps to False);
      2. missing values in selected numeric fields get fixed defaults;
      3. ``type``, ``condition`` and ``city`` are encoded as integer codes,
         with a fallback code for absent or unmapped values;
      4. derived features are added: floor_ratio, price_per_sqm,
         is_high_floor, log_price, comfort_score;
      5. identifier, raw categorical and other unused columns are dropped;
      6. if ``normalize`` is true, selected columns are clipped to fixed
         ranges and min-max scaled to [0, 1].

    Parameters
    ----------
    row : pd.Series
        One record. ``price`` must be present (a missing key raises KeyError).
    normalize : bool, default True
        Whether to apply step 6.

    Returns
    -------
    pd.Series
        The remaining and derived fields, followed by ``uuid`` (the row's
        original ``id``) and ``split`` ('unknown' when the row has none).
    """
    flag_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
    numeric_fallbacks = {
        'floor': 2,
        'floorCount': 5,
        'squareMeters': 50,
        'rooms': 2,
        'centreDistance': 5.0,
        'poiCount': 10,
    }
    type_codes = {'blockOfFlats': 0, 'tenement': 1, 'apartmentBuilding': 2}
    condition_codes = {'very good': 4, 'good': 3, 'average': 2, 'poor': 1, 'to renovation': 0}
    city_codes = {
        'warszawa': 0, 'krakow': 1, 'wroclaw': 2,
        'gdansk': 3, 'lodz': 4, 'poznan': 5,
    }
    unused_columns = [
        'id', 'split',
        'buildYear', 'buildingMaterial', 'ownership',
        'schoolDistance', 'clinicDistance', 'kindergartenDistance',
        'restaurantDistance', 'collegeDistance', 'pharmacyDistance', 'postOfficeDistance',
        'type', 'condition', 'city',
    ]

    # Remember the identifier and split before they are dropped below.
    record_id = row.get("id")
    record_split = row.get("split", "unknown")

    # Work on a copy so the caller's row is left untouched.
    feats = row.copy()

    # 1) yes/no flags -> True/False
    for flag in flag_columns:
        feats[flag] = str(feats.get(flag, None)).lower() == 'yes'

    # 2) Fill missing numeric fields with their defaults
    for field, fallback in numeric_fallbacks.items():
        current = feats.get(field, np.nan)
        if pd.isna(current):
            feats[field] = fallback

    # 3) Categories -> integer codes (3 / 2 / 6 are the fallback codes)
    feats['type_numeric'] = type_codes.get(feats.get('type'), 3)
    feats['condition_numeric'] = condition_codes.get(feats.get('condition'), 2)
    city_key = str(feats.get('city', '')).lower()
    feats['city_numeric'] = city_codes.get(city_key, 6)

    # 4) Derived features, computed from the raw (not yet scaled) values.
    #    The max(...) guards keep the divisors away from zero.
    floor = feats['floor']
    floor_count = feats['floorCount']
    feats['floor_ratio'] = floor / max(floor_count, 1)
    feats['price_per_sqm'] = feats['price'] / max(feats['squareMeters'], 0.1)
    feats['is_high_floor'] = int(floor >= 0.75 * floor_count)
    feats['log_price'] = np.log1p(feats['price'])
    feats['comfort_score'] = sum(int(feats[flag]) for flag in flag_columns)

    # 5) Drop the columns that are not kept (absent ones are ignored)
    feats = feats.drop(unused_columns, errors='ignore')

    # 6) Optional clip + min-max scaling with fixed (min, max) bounds
    if normalize:
        scaling_bounds = {
            'squareMeters': (20, 200), 'rooms': (1, 6), 'floor': (0, 20), 'floorCount': (1, 30),
            'centreDistance': (0, 20), 'poiCount': (0, 50),
            'type_numeric': (0, 3), 'condition_numeric': (0, 4), 'city_numeric': (0, 6),
            'floor_ratio': (0, 1), 'price_per_sqm': (20, 500), 'comfort_score': (0, 5),
        }
        for name, (lo, hi) in scaling_bounds.items():
            if name not in feats:
                continue
            value = feats[name]
            # Clamp into [lo, hi] before scaling.
            if value > hi:
                value = hi
            elif value < lo:
                value = lo
            feats[name] = (value - lo) / (hi - lo)

    # Re-attach the identifier (as `uuid`) and the split as the last two fields.
    feats['uuid'] = record_id
    feats['split'] = record_split
    return feats


In [9]:
# %% 2) Read the cleaned data and assign each row to a split

df_raw = pd.read_csv('../datasets/cleaned_apartments.csv')

# Ids are compared as strings on both sides.
train_ids = pd.read_csv('../datasets/train.csv')['id'].astype(str).tolist()
test_ids  = pd.read_csv('../datasets/test.csv')['id'].astype(str).tolist()
val_ids   = pd.read_csv('../datasets/val.csv')['id'].astype(str).tolist()

# `in` on a list scans it linearly, and that scan would repeat for every row.
# Build hash sets once for the lookups; the *_ids lists above stay lists.
train_id_set, test_id_set, val_id_set = set(train_ids), set(test_ids), set(val_ids)

df_raw['split'] = df_raw['id'].astype(str).apply(
    lambda u: assign_split(u, train_id_set, test_id_set, val_id_set)
)


In [10]:
# %% 3) Run the per-row processing over the whole DataFrame and save the result

# Extra keyword arguments given to DataFrame.apply are forwarded to the function,
# so every row is processed with normalize=True.
df_processed = df_raw.apply(process_row_final, axis=1, normalize=True)

df_processed.to_csv('../datasets/processed_cleaned_apartments.csv', index=False)
print("processed_cleaned_apartments.csv ready")


processed_cleaned_apartments.csv ready
