In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import tqdm
from tqdm import tqdm

In [40]:
dtrain = pd.read_csv('upd_train.csv')

In [41]:
data = dtrain

# Преобразование 'no' в 0, 'yes' в 1 для указанных столбцов
columns_to_convert = ['thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion',
                      'radiation_raion', 'railroad_terminal_raion', 'big_market_raion',
                      'nuclear_reactor_raion', 'detention_facility_raion', 'culture_objects_top_25',
                      'water_1line', 'big_road1_1line', 'railroad_1line']

for column in columns_to_convert:
    data[column] = data[column].apply(lambda x: 1 if x == 'yes' else 0)


In [42]:
data.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc,month,year
0,8059,2013-05-21,11,11.0,2.0,5.0,2.0,1907,1.0,12.0,...,26,133,207,1,89,161,10,2750000,5,1907
1,8138,2013-05-25,53,30.0,10.0,16.0,1.0,1980,2.0,8.0,...,0,11,21,1,0,46,5,9000000,5,1980
2,8156,2013-05-27,77,41.0,2.0,17.0,6.0,2014,3.0,12.0,...,0,1,7,1,0,12,1,7011550,5,2014
3,8157,2013-05-27,45,27.0,6.0,9.0,1.0,1970,2.0,6.0,...,0,3,8,1,0,19,3,7100000,5,1970
4,8178,2013-05-28,38,20.0,15.0,16.0,1.0,1982,1.0,8.0,...,1,11,25,1,7,95,4,6450000,5,1982


In [43]:
data['build_year'].head()

0    1907
1    1980
2    2014
3    1970
4    1982
Name: build_year, dtype: int64

In [44]:
# Предположим, что ваш датасет называется data
nan_count = data['build_year'].isna().sum()
print("Количество NaN в столбце build_year:", nan_count)
print("Количество записей в столбце:", len(data['build_year']))


Количество NaN в столбце build_year: 0
Количество записей в столбце: 16866


In [45]:

data = data.dropna(subset=['build_year'])

In [46]:
data['build_year'] = data['build_year'].astype(int)

In [47]:
data['build_year']

0        1907
1        1980
2        2014
3        1970
4        1982
         ... 
16861    2017
16862    1975
16863    1935
16864    2003
16865    1968
Name: build_year, Length: 16866, dtype: int32

In [48]:
target = data['price_doc']
data['year'] = data['build_year']
important_cols = ['material', 'num_room', 'sub_area', 'year', 'full_sq', 'max_floor', 'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000']
num_cat = ['material', 'num_room']
cat = num_cat + ['sub_area']
df = data[important_cols]

In [49]:
data.to_csv('upd_train.csv', index=None)

In [50]:
print(data.columns)


Index(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq',
       ...
       'cafe_count_5000_price_high', 'big_church_count_5000',
       'church_count_5000', 'mosque_count_5000', 'leisure_count_5000',
       'sport_count_5000', 'market_count_5000', 'price_doc', 'month', 'year'],
      dtype='object', length=294)


In [51]:
# Класс для создания кастомного датасета
class CustomDataset(Dataset):
    def __init__(self, dataframe, numeric_cols, categorical_cols, target_col):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols
        self.target_col = target_col
        
        # Используем SimpleImputer для заполнения пропущенных значений в числовых столбцах
        self.numeric_imputer = SimpleImputer(strategy='mean')
        dataframe.loc[:, self.numeric_cols] = self.numeric_imputer.fit_transform(dataframe.loc[:, self.numeric_cols])
        
        # Используем LabelEncoder для кодирования категориальных столбцов
        self.label_encoders = {}
        for col in self.categorical_cols:
            label_encoder = LabelEncoder()
            dataframe.loc[:, col] = label_encoder.fit_transform(dataframe.loc[:, col])
            self.label_encoders[col] = label_encoder
        
        # Масштабирование числовых признаков
        self.scaler = StandardScaler()
        dataframe.loc[:, self.numeric_cols] = self.scaler.fit_transform(dataframe.loc[:, self.numeric_cols])
        
        # Разделение данных на признаки и целевую переменную
        self.X = dataframe.drop(columns=[self.target_col])
        self.y = dataframe[self.target_col]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        # Получение данных для элемента с индексом idx
        features = self.X.iloc[idx].values.astype('float32')
        target = self.y.iloc[idx]
        return torch.tensor(features), torch.tensor(target)

In [52]:
data = pd.read_csv('upd_train.csv')

# Выбор важных столбцов
important_cols = ['material', 'num_room', 'sub_area', 'year', 'full_sq', 'max_floor', 
                  'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000', 'price_doc']
numeric_cols = ['num_room', 'year', 'full_sq', 'max_floor', 'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000']
cat_cols = ['material', 'sub_area']



# Создание кастомного датасета
custom_dataset = CustomDataset(dataframe=data[important_cols], 
                               numeric_cols=numeric_cols, 
                               categorical_cols=cat_cols, 
                               target_col='price_doc')

# Создание даталоадера
batch_size = 32
shuffle = True
data_loader = DataLoader(dataset=custom_dataset, batch_size=batch_size, shuffle=shuffle)


  dataframe.loc[:, self.numeric_cols] = self.scaler.fit_transform(dataframe.loc[:, self.numeric_cols])
 -0.48947706]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataframe.loc[:, self.numeric_cols] = self.scaler.fit_transform(dataframe.loc[:, self.numeric_cols])
  dataframe.loc[:, self.numeric_cols] = self.scaler.fit_transform(dataframe.loc[:, self.numeric_cols])


In [53]:
def split_dataset(dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, shuffle=True):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    if shuffle:
        np.random.shuffle(indices)
        
    train_split = int(np.floor(train_ratio * dataset_size))
    val_split = int(np.floor(val_ratio * dataset_size)) + train_split
    
    train_indices = indices[:train_split]
    val_indices = indices[train_split:val_split]
    test_indices = indices[val_split:]
    
    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)
    test_dataset = Subset(dataset, test_indices)
    
    return train_dataset, val_dataset, test_dataset

# Разделение датасета на тренировочную, валидационную и тестовую выборки
train_dataset, val_dataset, test_dataset = split_dataset(custom_dataset)

# Создание DataLoader для каждого датасета
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [54]:
# Функция для одной эпохи обучения
def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    total_mae = 0.0
    total_mape = 0.0
    
    for inputs, targets in tqdm(train_loader, desc="Training"):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs.squeeze(), targets.float())
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_mae += mean_absolute_error(targets.cpu().numpy(), outputs.detach().cpu().numpy())
        total_mape += mean_absolute_percentage_error(targets.cpu().numpy(), outputs.detach().cpu().numpy())
    
    mean_loss = total_loss / len(train_loader)
    mean_mae = total_mae / len(train_loader)
    mean_mape = total_mape / len(train_loader)
    
    return mean_loss, mean_mae, mean_mape

In [55]:
# Функция для оценки модели на валидационном датасете
def evaluate_model(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_mae = 0.0
    total_mape = 0.0
    
    with torch.no_grad():
        for inputs, targets in tqdm(val_loader, desc="Validation"):
            inputs, targets = inputs.to(device), targets.to(device)
            
            outputs = model(inputs)
            
            loss = criterion(outputs.squeeze(), targets.float())
            total_loss += loss.item()
            total_mae += mean_absolute_error(targets.cpu().numpy(), outputs.cpu().numpy())
            total_mape += mean_absolute_percentage_error(targets.cpu().numpy(), outputs.cpu().numpy())
    
    mean_loss = total_loss / len(val_loader)
    mean_mae = total_mae / len(val_loader)
    mean_mape = total_mape / len(val_loader)
    
    return mean_loss, mean_mae, mean_mape

In [56]:
class CustomSequentialModel(nn.Module):
    def __init__(self, input_dim):
        super(CustomSequentialModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        
    def forward(self, x):
        return self.model(x)
    
    def predict(self, numeric_data, categorical_data, cat_encoders, scaler):
        numeric_data = scaler.transform(numeric_data)
        categorical_data = [encoder.transform([val]) for encoder, val in zip(cat_encoders, categorical_data)]
        numeric_tensor = torch.tensor(numeric_data, dtype=torch.float32).unsqueeze(0)
        categorical_tensor = torch.tensor(categorical_data, dtype=torch.int64)
        with torch.no_grad():
            output = self(numeric_tensor, categorical_tensor)
        return output.item()

In [57]:
# Инициализация модели и отправка на GPU
input_dim = len(numeric_cols) + len(cat_cols)
model = CustomSequentialModel(input_dim).to('cuda')

# Определение функции потерь и оптимизатора
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [58]:
def split_dataset(dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, shuffle=True):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    if shuffle:
        np.random.shuffle(indices)
        
    train_split = int(np.floor(train_ratio * dataset_size))
    val_split = int(np.floor(val_ratio * dataset_size)) + train_split
    
    train_indices = indices[:train_split]
    val_indices = indices[train_split:val_split]
    test_indices = indices[val_split:]
    
    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)
    test_dataset = Subset(dataset, test_indices)
    
    return train_dataset, val_dataset, test_dataset

# Разделение датасета на тренировочную, валидационную и тестовую выборки
train_dataset, val_dataset, test_dataset = split_dataset(custom_dataset)

# Создание DataLoader для каждого датасета
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [59]:
def train_model(model, train_loader, val_loader, test_loader, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()  # set model to training mode
        train_loss, train_mae, train_mape = 0.0, 0.0, 0.0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
        for inputs, targets in progress_bar:
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            optimizer.zero_grad()  # zero the parameter gradients
            
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets.float())
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
            abs_errors = torch.abs(outputs.squeeze() - targets.float())
            train_mae += torch.sum(abs_errors).item()
            train_mape += torch.sum(abs_errors / targets.float()).item()
            
            progress_bar.set_postfix({'loss': train_loss / len(train_loader.dataset), 
                                      'MAE': train_mae / len(train_loader.dataset),
                                      'MAPE': train_mape / len(train_loader.dataset)})
        
        # Evaluation on validation dataset
        val_loss, val_mae, val_mape = evaluate_model(model, val_loader, criterion, device)
        
        # Evaluation on test dataset
        test_loss, test_mae, test_mape = evaluate_model(model, test_loader, criterion, device)
        
        print(f"Train Loss: {train_loss / len(train_loader.dataset):.4f} | Train MAE: {train_mae / len(train_loader.dataset):.4f} | Train MAPE: {train_mape / len(train_loader.dataset):.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val MAE: {val_mae:.4f} | Val MAPE: {val_mape:.4f}")
        print(f"Test Loss: {test_loss:.4f} | Test MAE: {test_mae:.4f} | Test MAPE: {test_mape:.4f}")

def evaluate_model(model, dataloader, criterion, device):
    model.eval()  # set model to evaluation mode
    total_loss, total_mae, total_mape = 0.0, 0.0, 0.0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets.float())
            total_loss += loss.item() * inputs.size(0)
            abs_errors = torch.abs(outputs.squeeze() - targets.float())
            total_mae += torch.sum(abs_errors).item()
            total_mape += torch.sum(abs_errors / targets.float()).item()
    
    avg_loss = total_loss / len(dataloader.dataset)
    avg_mae = total_mae / len(dataloader.dataset)
    avg_mape = total_mape / len(dataloader.dataset)
    return avg_loss, avg_mae, avg_mape


In [60]:
# Параметры обучения
num_epochs = 12


# Обучение модели
train_model(model, train_loader, val_loader, test_loader, optimizer, criterion, 'cuda', num_epochs)

                                                                                                      

Train Loss: 55722201931895.0078 | Train MAE: 5626682.3398 | Train MAPE: 0.8204
Val Loss: 36499227005865.7578 | Val MAE: 4268333.0202 | Val MAPE: 0.7255
Test Loss: 39236911674877.5703 | Test MAE: 4331097.2464 | Test MAPE: 0.7366


                                                                                                      

Train Loss: 35854846074906.7188 | Train MAE: 4254212.0795 | Train MAPE: 0.7541
Val Loss: 30336418086173.4570 | Val MAE: 3863601.7556 | Val MAPE: 0.6730
Test Loss: 32886788990675.1094 | Test MAE: 3931919.1659 | Test MAPE: 0.6950


                                                                                                      

Train Loss: 26560096185951.4766 | Train MAE: 3638976.1714 | Train MAPE: 0.6965
Val Loss: 19124167723000.7109 | Val MAE: 3112226.6050 | Val MAPE: 0.6428
Test Loss: 22061347496673.6680 | Test MAE: 3270148.2844 | Test MAPE: 0.7092


                                                                                                      

Train Loss: 18332288926342.0352 | Train MAE: 2979199.9265 | Train MAPE: 0.6424
Val Loss: 13959035653041.0430 | Val MAE: 2625643.5421 | Val MAPE: 0.5721
Test Loss: 17168890543584.4551 | Test MAE: 2793040.4242 | Test MAPE: 0.6519


                                                                                                      

Train Loss: 16188005964097.1953 | Train MAE: 2742952.7314 | Train MAPE: 0.6192
Val Loss: 12849647242880.1523 | Val MAE: 2491917.0463 | Val MAPE: 0.5428
Test Loss: 15960558553291.8301 | Test MAE: 2639212.9763 | Test MAPE: 0.6183


                                                                                                      

Train Loss: 15552496644485.8066 | Train MAE: 2627674.2289 | Train MAPE: 0.6162
Val Loss: 12364050531781.0859 | Val MAE: 2389875.7319 | Val MAPE: 0.5319
Test Loss: 15455217889260.5879 | Test MAE: 2542018.4123 | Test MAPE: 0.6077


                                                                                                      

Train Loss: 15086228574046.4922 | Train MAE: 2536996.3673 | Train MAPE: 0.6014
Val Loss: 11772485977716.0039 | Val MAE: 2297575.8102 | Val MAPE: 0.5257
Test Loss: 14933015119299.3359 | Test MAE: 2455753.8341 | Test MAPE: 0.6053


                                                                                                      

Train Loss: 14561121673806.1738 | Train MAE: 2467904.6223 | Train MAPE: 0.5957
Val Loss: 11380862259651.8711 | Val MAE: 2216652.0712 | Val MAPE: 0.5238
Test Loss: 14639661253539.7910 | Test MAE: 2384072.5474 | Test MAPE: 0.6073


                                                                                                      

Train Loss: 14423451133480.2246 | Train MAE: 2411921.1017 | Train MAPE: 0.5883
Val Loss: 11195322577629.6836 | Val MAE: 2178022.6833 | Val MAPE: 0.5135
Test Loss: 14363722406344.1895 | Test MAE: 2341432.0616 | Test MAPE: 0.5941


                                                                                                       

Train Loss: 13925756957115.2383 | Train MAE: 2366258.8802 | Train MAPE: 0.5856
Val Loss: 10962242359145.3770 | Val MAE: 2112153.2361 | Val MAPE: 0.5195
Test Loss: 14239975460106.9199 | Test MAE: 2288943.7156 | Test MAPE: 0.6034


                                                                                                       

Train Loss: 13638247005872.8398 | Train MAE: 2335563.1204 | Train MAPE: 0.5809
Val Loss: 11321596359778.3906 | Val MAE: 2236742.8233 | Val MAPE: 0.4937
Test Loss: 14265360173104.5312 | Test MAE: 2380473.0687 | Test MAPE: 0.5668


                                                                                                       

Train Loss: 13525028642803.8574 | Train MAE: 2297836.2238 | Train MAPE: 0.5731
Val Loss: 10807887860980.1562 | Val MAE: 2117557.3238 | Val MAPE: 0.4949
Test Loss: 13873966036632.8711 | Test MAE: 2276626.3744 | Test MAPE: 0.5725


In [61]:
torch.save(model.state_dict(), 'model.pth')

In [63]:
important_cols = ['material', 'num_room', 'sub_area', 'year', 'full_sq', 'max_floor', 
                  'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000', 'price_doc']
numeric_cols = ['num_room', 'year', 'full_sq', 'max_floor', 'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000']
cat_cols = ['material', 'sub_area']

In [64]:
model.load_state_dict(torch.load('model.pth'))

# Выбор важных столбцов
important_cols = ['material', 'num_room', 'sub_area', 'year', 'full_sq', 'max_floor', 
                  'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000', 'price_doc']
numeric_cols = ['num_room', 'year', 'full_sq', 'max_floor', 'build_year', 'school_km', 'kremlin_km', 'floor', 'cafe_avg_price_5000']
cat_cols = ['material', 'sub_area']

# Создание экземпляра кастомного датасета для первой строки
first_row_data = data[important_cols].iloc[[0]]

# Создание экземпляра кастомного датасета для первой строки
first_row_dataset = CustomDataset(first_row_data, numeric_cols, cat_cols, target_col='price_doc')

# Создание DataLoader для этого датасета (batch_size=1, так как у нас только одно наблюдение)
first_row_loader = DataLoader(first_row_dataset, batch_size=1, shuffle=False)

# Перевод модели в режим оценки (evaluation mode)
model.eval()

# Передача данных через модель и получение предсказанного значения
with torch.no_grad():
    for inputs, _ in first_row_loader:
        inputs = inputs.to('cuda')  # отправляем данные на устройство (GPU)
        output = model(inputs)  # получаем выход модели

# Получение предсказанного значения
predicted_price = output.item()
print("Predicted price:", predicted_price)


Predicted price: 5335407.5


In [69]:
dtrain = pd.read_csv('train.csv')
real_price = dtrain['price_doc'][0]
print(f'Реальная цена недвижимости: {real_price} рублей')
print(f'Предсказанная цена: {round(predicted_price)} рублей')

Реальная цена недвижимости: 5850000 рублей
Предсказанная цена: 5335408 рублей


In [71]:
print(f'Разница в предсказании - {real_price - round(predicted_price)} рублей')

Разница в предсказании - 514592 рублей
