In [19]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score


import warnings
warnings.filterwarnings("ignore")

np.random.seed(10)
pd.set_option('display.precision', 2)

In [20]:
df = pd.read_csv('C:/Users/Инна/Desktop/otus/AB_NYC_2019.csv')

In [21]:
#удалим ненужные данные

# Список столбцов для удаления (добавьте все нужные)
columns_to_drop = [col for col in df.columns 
                  if 'ID' in col
                   or 'Name' in col
                   or 'id' in col
                   or 'name' in col
                   or 'last_review' in col
                   or 'availability_365' in col
                  ]

# Удаляем выбранные столбцы
df_filtered = df.drop(columns=columns_to_drop)

print(f'''
Удалены столбцы:
{columns_to_drop}
''')
print(f'''
Оставшиеся столбцы:
{df_filtered.columns.tolist()}
''')


Удалены столбцы:
['id', 'name', 'host_id', 'host_name', 'last_review', 'availability_365']


Оставшиеся столбцы:
['neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']



In [51]:
import math

def euclidean_distance(lat1, lon1, lat2, lon2):
    # Перевод градусов в метры (приближенно)
    dx = (lon2 - lon1) * 111320 * math.cos(math.radians((lat1 + lat2) / 2))
    dy = (lat2 - lat1) * 111320
    return math.sqrt(dx**2 + dy**2)

# Координаты центра (например, Times Square)
center_lat = 40.7580
center_lon = -73.9855

# Добавляем колонку 'distance_to_center' в метрах
df_filtered['distance_to_center'] = df_filtered.apply(
    lambda row: euclidean_distance(
        row['latitude'], 
        row['longitude'], 
        center_lat, 
        center_lon
    ), 
    axis=1
)

# Проверим результат
print(df_filtered[['latitude', 'longitude', 'distance_to_center']].sample(2))

       latitude  longitude  distance_to_center
13900      -0.5       0.56            9.04e+06
31730      -0.5      -0.29            8.97e+06


In [52]:
# Заполняем пропущенные значения в last_review, reviews_per_month  значением 'Unknown'

df_filtered['reviews_per_month'].fillna(0, inplace=True)

# Проверяем, что больше нет пропущенных значений
print(df_filtered.isnull().sum())


neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
distance_to_center                0
dtype: int64


In [53]:
#исключим аномалии

def remove_outliers(df_filtered, column):
    Q1 = df_filtered[column].quantile(0.25)
    Q3 = df_filtered[column].quantile(0.75)
    IQR = Q3 - Q1
    return df_filtered[(df_filtered[column] >= (Q1 - 1.5 * IQR)) & (df[column] <= (Q3 + 1.5 * IQR))]

df_filtered = remove_outliers(df_filtered, 'price')

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
categorical_features_indices_2 = [
    X.columns.get_loc(col) 
    for col in X.columns 
    if X[col].dtype == 'object' and col != 'room_type'
]
categorical_features_indices_2
df_filtered.columns[categorical_features_indices_2]


In [None]:
#преобразуем категориальные признаки (кроме room_type т.к. там необходимо устнаовить класс доставки по возрастанию - от низкого до высокого)
#сделать это до разделения на трейн и тест(!!!!!!!)
# df_filtered = pd.get_dummies(df_filtered, columns=['neighbourhood_group', 'neighbourhood'])
# df_filtered.head(3)

In [7]:
#преобразование категориальной переменной room

# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# # Выбор категориальных столбцов
# categorical_columns = ['room_type']

# # Создание и применение LabelEncoder
# label_encoders = {}
# for col in categorical_columns:
#     le = LabelEncoder()
#     df_filtered[col] = le.fit_transform(df_filtered[col])
#     label_encoders[col] = le  # Сохраняем кодировщики (опционально)

# # Просмотр первых строк преобразованного датафрейма
# print(df_filtered.head())

  neighbourhood_group neighbourhood  latitude  longitude  room_type  price  \
0            Brooklyn    Kensington     40.65     -73.97          1    149   
1           Manhattan       Midtown     40.75     -73.98          0    225   
2           Manhattan        Harlem     40.81     -73.94          1    150   
3            Brooklyn  Clinton Hill     40.69     -73.96          0     89   
4           Manhattan   East Harlem     40.80     -73.94          0     80   

   minimum_nights  number_of_reviews  reviews_per_month  \
0               1                  9               0.21   
1               1                 45               0.38   
2               3                  0               0.00   
3               1                270               4.64   
4              10                  9               0.10   

   calculated_host_listings_count  distance_to_center  
0                               6            12351.78  
1                               2              508.94  
2       

In [8]:
# df_filtered = pd.get_dummies(df_filtered, columns=['neighbourhood_group', 'neighbourhood'])

In [9]:
# df_filtered['room_type'].unique()

In [27]:
# ship_mode_mapping = {'Entire home/apt': 1, 'Private room': int(2), 'Shared room': 3}
# df_filtered['room_type'] = df_filtered['room_type'].map(ship_mode_mapping)
df_filtered.head(3)

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,distance_to_center
0,Brooklyn,Kensington,40.65,-73.97,Private room,149,1,9,0.21,6,12351.78
1,Manhattan,Midtown,40.75,-73.98,Entire home/apt,225,1,45,0.38,2,508.94
2,Manhattan,Harlem,40.81,-73.94,Private room,150,3,0,0.0,1,6764.84


In [28]:
X = df_filtered.drop('price', axis=1)
y = df_filtered['price']

In [29]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [44]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


# Создание модели
model = XGBRegressor(
    enable_categorical=True,
    n_estimators=100,  # Количество деревьев
    max_depth=7,       # Глубина деревьев
    learning_rate=0.1, # Скорость обучения
    random_state=42,
    subsample=0.8
)

# Обучение
model.fit(X_train, y_train)

# Предсказание
y_pred = model.predict(X_test)

# Оценка качества
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print("RMSE (sklearn):", rmse)
r2 = r2_score(y_test, y_pred)
print(f"r2: {r2:.4f}")

MSE: 0.2343
r2: 0.5529


In [45]:
#масштабирование числовых признаков

from sklearn.preprocessing import RobustScaler

# Предположим, что у вас есть данные для масштабирования (например, числовые столбцы)
numerical_cols = df_filtered.select_dtypes(include=['number']).columns

# Создаем объект RobustScaler
scaler = RobustScaler()

# Масштабируем данные
df_filtered[numerical_cols] = scaler.fit_transform(df_filtered[numerical_cols])
df_filtered.sample(3)

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,distance_to_center
30232,Queens,Sunnyside,0.15,0.76,Shared room,-0.74,-0.25,1.35,1.22,2.0,-0.07
33843,Manhattan,Nolita,0.01,-0.83,Private room,-0.17,-0.25,-0.22,-0.24,7.0,-0.49
32215,Brooklyn,Flatbush,-0.96,-0.17,Private room,-0.05,-0.25,0.17,0.22,0.0,0.97


In [None]:
# print(df_scaled[numeric_cols].head())  # Вывод первых строк масштабированных данных
# print("\nСреднее значение после масштабирования:")
# print(df_scaled[numeric_cols].mean())  # Должно быть ~0
# print("\nСтандартное отклонение после масштабирования:")
# print(df_scaled[numeric_cols].std())   # Должно быть ~1

In [None]:
from sklearn.model_selection import GridSearchCV

# Параметры для перебора
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 1.0]
}

# Поиск лучших параметров
grid = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2'
)
grid.fit(X_train, y_train)

# Лучшая модель
best_model = grid.best_estimator_
print("Лучшие параметры:", grid.best_params_)

In [48]:
# Кодирование категориальных переменных
categorical_cols = ['neighbourhood_group', 'neighbourhood', 'room_type']

# Преобразуем категориальные колонки в category dtype
for col in categorical_cols:
    df_filtered[col] = df_filtered[col].astype('category')
for col in categorical_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')


In [49]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb 

# Параметры для перебора

param_grid = {
    'n_estimators': [100, 200],  # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage
    'num_leaves': [20, 30, 40],  # Max number of leaves in one tree
    'max_depth': [-1, 5, 10]  # -1 means no limit.  Limit the max depth of tree to prevent over-fitting
    # 'min_child_samples': [20, 30, 40],  # Minimum number of data need in a child(leaf)
    # 'subsample': [0.8, 0.9, 1.0],  # Subsample ratio of the training instance.
    # 'colsample_bytree': [0.8, 0.9, 1.0], # Subsample ratio of columns when constructing each tree.
    # 'reg_alpha': [0.0, 0.1, 0.2], # L1 regularization term on weights
    # 'reg_lambda': [0.0, 0.1, 0.2], # L2 regularization term on weights
}

# Поиск лучших параметров
grid = GridSearchCV(
    lgb.LGBMRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2'
)
grid.fit(X_train, y_train)

# Лучшая модель
best_model = grid.best_estimator_
print("Лучшие параметры:", grid.best_params_)

Лучшие параметры: {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 40}


In [42]:
from sklearn.model_selection import train_test_split

X = df_filtered.drop('price', axis=1)
y = df_filtered['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [50]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Создаем Dataset
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols)

# Параметры модели
params = {
    'objective': 'regression', # для регрессии (если цель — цена)
    'max_depth': -1,
    'n_estimators': 200,
    'metric': 'rmse',           # метрика качества
    'boosting_type': 'gbdt',    # тип бустинга
    'num_leaves': 40,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1,
    
}

# Обучение модели
model = lgb.train(params, train_data, num_boost_round=100)

# Предсказание на тестовых данных
y_pred = model.predict(X_test)

# Оценка модели
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print(f'RMSE: {rmse}')
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")
r2 = r2_score(y_test, y_pred)
print(f"r2: {r2:.4f}")

MSE: 0.2312
r2: 0.5589


In [40]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Визуализация важности признаков
plot_importance(best_model)
plt.show()

ValueError: tree must be Booster, XGBModel or dict instance