In [None]:
# This notebook contains the main implementation of the real estate price prediction project.
# It includes:
# - Loading and preparing the input data (transformations, feature partitioning)
# - Building regression models:
#   - Baseline model (average)
#   - Linear regression
#   - Random Forest Regressor
#   - XGBoost Regressor
# - Comparison of the RMSE and R² metrics for each model
# - Analysis of prediction errors (residuals)
# - Hyperparameter tuning of the Random Forest and XGBoost models using GridSearchCV
# - Selecting the best model (XGBoost after optimization)
# - Final evaluation of prediction accuracy
# This notebook is a key stage of the project: it combines data preparation with data modeling and inference.

import pandas as pd
import numpy as np

# Load the prepared dataset that every later cell works on.
df = pd.read_csv("gotowe_dane_do_modelu.csv")

# Derive LOG_PRICE = log(1 + PRICE) only when the CSV does not already carry it.
if 'LOG_PRICE' not in df.columns:
    df['LOG_PRICE'] = np.log1p(df['PRICE'])

# LUXURY_HOME is 1 only when all four price/size/room thresholds are met.
if 'LUXURY_HOME' not in df.columns:
    df['LUXURY_HOME'] = (
        (df['PRICE'] > 2_000_000) &
        (df['PROPERTYSQFT'] > 3000) &
        (df['BATH'] >= 4) &
        (df['BEDS'] >= 5)
    ).astype(int)

if 'LOCATION_CATEGORY' not in df.columns:
    def assign_location_category(row):
        """Map a row's LATITUDE/LONGITUDE to a coarse area label.

        Returns 'NorthEast', 'Central' or 'SouthWest' for the coordinate
        boxes tested below, and 'Other' for everything else.
        """
        lat = row['LATITUDE']
        lon = row['LONGITUDE']
        if lat >= 40.77 and lon >= -73.98:
            return 'NorthEast'
        elif 40.72 <= lat < 40.77 and -74.00 <= lon <= -73.93:
            return 'Central'
        elif lat < 40.72 and lon < -73.93:
            return 'SouthWest'
        else:
            return 'Other'
    df['LOCATION_CATEGORY'] = df.apply(assign_location_category, axis=1)

# Snapshot taken AFTER the derived columns exist. A later cell rebuilds df_clean
# from df_encoded_clean and then reads LOG_PRICE from it; copying before the
# columns were added left the snapshot without them whenever the CSV lacked them.
df_encoded_clean = df.copy()

In [None]:
# Count missing values per column of df and print the counts.
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
# Drop the listed broker/address text columns; errors='ignore' skips names that are not in df.
# NOTE(review): df_selected is reassigned in the following cell, so this result looks unused — confirm.
df_selected = df.drop(columns=[
    'BROKERTITLE', 'ADDRESS', 'STATE', 'MAIN_ADDRESS', 
    'ADMINISTRATIVE_AREA_LEVEL_2', 'SUBLOCALITY', 
    'STREET_NAME', 'LONG_NAME', 'FORMATTED_ADDRESS'
], errors='ignore')  

In [None]:
# Keep only the price, room-count, area and coordinate columns in df_selected.
important_columns = ['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE']
df_selected = df[important_columns]

# Working copy of the full frame; no encoding is applied here, so df_encoded starts identical to df.
df_encoded = df.copy()

In [None]:
# Preview the first rows of df_encoded.
print(df_encoded.head())

In [None]:
# Print the column counts of df and df_encoded, labelled as before/after one-hot encoding.
# NOTE(review): df_encoded is created as a plain df.copy(), so both numbers look identical
# unless an encoding step ran elsewhere — confirm.
print("Liczba kolumn przed One-Hot Encoding:", df.shape[1])
print("Liczba kolumn po One-Hot Encoding:", df_encoded.shape[1])

In [None]:
# List the column labels of df_encoded.
print(df_encoded.columns)

In [None]:
# Per-column count of missing values in df_encoded.
print(df_encoded.isnull().sum())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the raw PRICE column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df_encoded['PRICE'], ax=ax)
ax.set_title("Wykres pudełkowy dla ceny")
plt.show()

In [None]:
# Tukey fences (1.5 * IQR beyond the quartiles) for the raw PRICE column.
price_col = df_encoded['PRICE']
Q1, Q3 = price_col.quantile(0.25), price_col.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose price lies outside either fence; they are only counted, not removed.
outside_fences = (price_col < lower_bound) | (price_col > upper_bound)
outliers = df_encoded[outside_fences]
print(f"Liczba outlierów w cenach: {len(outliers)}")

In [None]:
# Write df_encoded to CSV without the index column.
df_encoded.to_csv('przetworzone_dane.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Switch seaborn to the "whitegrid" style.
sns.set(style="whitegrid")

# Histogram (with KDE) of raw prices; the x-axis is cut at the 95th percentile of PRICE.
plt.figure(figsize=(10, 5))
sns.histplot(df['PRICE'], bins=100, kde=True)
plt.title("Rozkład cen nieruchomości (oryginalne)")
plt.xlabel("Cena [USD]")
plt.ylabel("Liczba nieruchomości")
plt.xlim(0, df['PRICE'].quantile(0.95)) 
plt.show()

# (Re)compute LOG_PRICE as log(1 + PRICE), overwriting any existing column of that name.
df['LOG_PRICE'] = np.log1p(df['PRICE'])

# Same histogram for the log-transformed price.
plt.figure(figsize=(10, 5))
sns.histplot(df['LOG_PRICE'], bins=100, kde=True, color='green')
plt.title("Rozkład cen nieruchomości (log-transformed)")
plt.xlabel("log(1 + Cena) [USD]")
plt.ylabel("Liczba nieruchomości")
plt.show()

In [None]:
# Range and 99th percentile of the raw price.
print("Min cena:", df['PRICE'].min())
print("Max cena:", df['PRICE'].max())
print("99 percentyl:", df['PRICE'].quantile(0.99))

# Range of the log-transformed price.
print("\nMin log-cena:", df['LOG_PRICE'].min())
print("Max log-cena:", df['LOG_PRICE'].max())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Tukey fences (1.5 * IQR beyond the quartiles) on the log-transformed price.
log_price = df['LOG_PRICE']
Q1, Q3 = log_price.quantile(0.25), log_price.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")

# Keep rows whose LOG_PRICE lies within the fences (both ends inclusive).
df_clean = df[log_price.between(lower_bound, upper_bound)]
print(f"Liczba obserwacji przed czyszczeniem: {df.shape[0]}")
print(f"Liczba obserwacji po czyszczeniu: {df_clean.shape[0]}")

# Side-by-side box plots: all rows (left) vs. rows kept after filtering (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(14, 6))

sns.boxplot(x=df['LOG_PRICE'], color='skyblue', ax=ax_before)
ax_before.set_title("Boxplot LOG_PRICE przed czyszczeniem")
ax_before.set_xlabel("log(1 + PRICE)")

sns.boxplot(x=df_clean['LOG_PRICE'], color='lightgreen', ax=ax_after)
ax_after.set_title("Boxplot LOG_PRICE po czyszczeniu")
ax_after.set_xlabel("log(1 + PRICE)")

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Columns inspected for outliers in this cell.
cols_to_check = ['BEDS', 'BATH']

for col in cols_to_check:
    # Tukey fences: 1.5 * IQR below Q1 and above Q3.
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Rows outside each fence; they are only counted here, nothing is removed.
    lower_outliers = df_clean[df_clean[col] < lower_bound]
    upper_outliers = df_clean[df_clean[col] > upper_bound]
    
    print(f"=== {col} ===")
    print(f"Q1 = {Q1}, Q3 = {Q3}, IQR = {IQR}")
    print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"Liczba dolnych outlierów: {len(lower_outliers)}")
    print(f"Liczba górnych outlierów: {len(upper_outliers)}\n")

    # One horizontal box plot per column.
    plt.figure(figsize=(8, 2))
    sns.boxplot(x=df_clean[col], color='lightblue', fliersize=5)
    plt.title(f"Boxplot dla {col} (z widocznymi outlierami)")
    plt.xlabel(col)
    plt.show()

In [None]:
# Columns summarised in this cell.
cols_to_check = ['BEDS', 'BATH']

for col in cols_to_check:
    # Descriptive statistics followed by the frequency of each distinct value.
    print(f"=== Statystyki opisowe: {col} ===")
    print(df_clean[col].describe())
    print("\nRozkład wartości:")
    print(df_clean[col].value_counts().sort_index())
    
    # Frequency of the values that lie strictly above the 95th percentile.
    print("\nWartości odstające (powyżej 95. percentyla):")
    threshold_95 = df_clean[col].quantile(0.95)
    outliers = df_clean[df_clean[col] > threshold_95][col]
    print(outliers.value_counts().sort_index())
    
    print("-" * 50)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Keep an unfiltered copy for the before/after plots below.
df_beds_bath_before = df_clean.copy()

cols_to_clean = ['BEDS', 'BATH']

# Filter df_clean column by column; because df_clean is reassigned inside the loop,
# the fences for the second column are computed on rows already filtered by the first.
for col in cols_to_clean:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"=== {col} ===")
    print(f"IQR: {IQR:.2f}, Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    before = df_clean.shape[0]
    df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    after = df_clean.shape[0]
    print(f"Usunięto {before - after} obserwacji dla {col}\n")

# Side-by-side box plots per column: saved copy (left) vs. filtered frame (right).
for col in cols_to_clean:
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    sns.boxplot(x=df_beds_bath_before[col], color='skyblue')
    plt.title(f"{col} - przed czyszczeniem")
    plt.xlabel(col)

    plt.subplot(1, 2, 2)
    sns.boxplot(x=df_clean[col], color='lightgreen')
    plt.title(f"{col} - po czyszczeniu (IQR)")
    plt.xlabel(col)

    plt.tight_layout()
    plt.show()

In [None]:
# Descriptive statistics for PROPERTYSQFT.
print("=== Statystyki opisowe: PROPERTYSQFT ===")
print(df_clean['PROPERTYSQFT'].describe())

# Frequencies of the 20 largest distinct PROPERTYSQFT values.
print("\nRozkład przykładowych wartości:")
print(df_clean['PROPERTYSQFT'].value_counts().sort_index().tail(20))  

# Values strictly above the 95th percentile.
threshold_95 = df_clean['PROPERTYSQFT'].quantile(0.95)
outliers = df_clean[df_clean['PROPERTYSQFT'] > threshold_95]['PROPERTYSQFT']

print(f"\n95. percentyl: {threshold_95}")
print(f"Liczba obserwacji powyżej 95. percentyla: {len(outliers)}")

print("\nNajwiększe wartości (TOP 10):")
print(outliers.sort_values(ascending=False).head(10))

In [None]:
# NOTE(review): this rebuilds df_clean from df_encoded_clean, so the LOG_PRICE / BEDS / BATH
# filtering applied to df_clean in earlier cells is not carried forward — confirm this is intended.
df_clean = df_encoded_clean.copy()

# Unfiltered copy for the before/after plot.
df_sqft_before = df_clean.copy()

# Tukey fences (1.5 * IQR beyond the quartiles) for PROPERTYSQFT.
Q1 = df_clean['PROPERTYSQFT'].quantile(0.25)
Q3 = df_clean['PROPERTYSQFT'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"IQR = {IQR}")
print(f"Lower bound = {lower_bound}, Upper bound = {upper_bound}")

# Keep rows within the fences and report how many were removed.
before = df_clean.shape[0]
df_clean = df_clean[(df_clean['PROPERTYSQFT'] >= lower_bound) & (df_clean['PROPERTYSQFT'] <= upper_bound)]
after = df_clean.shape[0]

print(f"Usunięto {before - after} obserwacji")

import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side box plots: saved copy (left) vs. filtered frame (right).
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
sns.boxplot(x=df_sqft_before['PROPERTYSQFT'], color='skyblue')
plt.title("PROPERTYSQFT - przed czyszczeniem")
plt.xlabel("Metraż [sqft]")

plt.subplot(1, 2, 2)
sns.boxplot(x=df_clean['PROPERTYSQFT'], color='lightgreen')
plt.title("PROPERTYSQFT - po czyszczeniu (IQR)")
plt.xlabel("Metraż [sqft]")

plt.tight_layout()
plt.show()

In [None]:
# Report descriptive statistics and IQR-fence outlier counts for both coordinates;
# nothing is removed in this cell.
for col in ['LATITUDE', 'LONGITUDE']:
    print(f"=== Statystyki opisowe: {col} ===")
    print(df_clean[col].describe())

    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"\nIQR: {IQR:.6f}")
    print(f"Lower bound: {lower_bound:.6f}")
    print(f"Upper bound: {upper_bound:.6f}")

    # Rows below / above the fences.
    outliers_low = df_clean[df_clean[col] < lower_bound]
    outliers_high = df_clean[df_clean[col] > upper_bound]

    print(f"Liczba outlierów poniżej: {len(outliers_low)}")
    print(f"Liczba outlierów powyżej: {len(outliers_high)}")
    print("-" * 50)

In [None]:
# Unfiltered copy for the before/after plot.
df_geo_before = df_clean.copy()

# Hard-coded lower cut-off for LONGITUDE.
# NOTE(review): presumably the IQR lower bound printed by the preceding cell — confirm and
# consider computing it instead of pasting the number.
long_lower_bound = -74.171705
before = df_clean.shape[0]
df_clean = df_clean[df_clean['LONGITUDE'] >= long_lower_bound]
after = df_clean.shape[0]

print(f"Usunięto {before - after} obserwacji z LONGITUDE poniżej {long_lower_bound}")

# Side-by-side box plots: saved copy (left) vs. filtered frame (right).
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
sns.boxplot(x=df_geo_before['LONGITUDE'], color='skyblue')
plt.title("LONGITUDE - przed czyszczeniem")
plt.xlabel("LONGITUDE")

plt.subplot(1, 2, 2)
sns.boxplot(x=df_clean['LONGITUDE'], color='lightgreen')
plt.title("LONGITUDE - po czyszczeniu")
plt.xlabel("LONGITUDE")

plt.tight_layout()
plt.show()

In [None]:
# List the columns currently present in df_clean.
print("Kolumny w df_clean:")
print(df_clean.columns.tolist())

In [None]:
# Report the column sum of every LOCALITY_* column in df_clean.
print("=== Lokalizacje (LOCALITY) ===")
locality_cols = [name for name in df_clean.columns if name.startswith("LOCALITY_")]

for col in locality_cols:
    print(f"{col}: {int(df_clean[col].sum())} obserwacji")

In [None]:
# Report the column sum of every TYPE_* column in df_clean.
print("\n=== Typy nieruchomości (TYPE) ===")
type_cols = [name for name in df_clean.columns if name.startswith("TYPE_")]

for col in type_cols:
    print(f"{col}: {int(df_clean[col].sum())} obserwacji")

In [None]:
# Remove every row flagged in the listed LOCALITY_* columns, then the columns themselves.
# df_clean is reassigned rather than mutated with inplace=True: it was produced by boolean
# filtering, and in-place drops on such a frame can raise SettingWithCopyWarning.
to_remove_locality = ['LOCALITY_United States', 'LOCALITY_The Bronx']
existing_locality_cols = [col for col in to_remove_locality if col in df_clean.columns]

if existing_locality_cols:
    # A row is removed when at least one of the listed indicator columns is set.
    rows_to_remove_locality = df_clean[df_clean[existing_locality_cols].sum(axis=1) > 0].index
    print(f"Usuwamy {len(rows_to_remove_locality)} obserwacji z lokalizacji: {existing_locality_cols}")
    df_clean = df_clean.drop(index=rows_to_remove_locality).drop(columns=existing_locality_cols)

# Same treatment for the listed TYPE_* columns.
to_remove_type = ['TYPE_Foreclosure', 'TYPE_Contingent']
existing_type_cols = [col for col in to_remove_type if col in df_clean.columns]

if existing_type_cols:
    rows_to_remove_type = df_clean[df_clean[existing_type_cols].sum(axis=1) > 0].index
    print(f"Usuwamy {len(rows_to_remove_type)} obserwacji z typem: {existing_type_cols}")
    df_clean = df_clean.drop(index=rows_to_remove_type).drop(columns=existing_type_cols)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Rebuild a single LOCALITY label per row from the one-hot LOCALITY_* columns:
# idxmax(axis=1) picks the first column holding the row maximum.
locality_cols = [col for col in df_clean.columns if col.startswith('LOCALITY_')]
df_locality = df_clean[locality_cols + ['LOG_PRICE']].copy()

df_locality['LOCALITY'] = df_locality[locality_cols].idxmax(axis=1).str.replace('LOCALITY_', '')

# Mean log-price per locality, highest first.
locality_price = df_locality.groupby('LOCALITY')['LOG_PRICE'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
# palette without hue is deprecated in seaborn 0.13; map hue to the categorical axis and
# suppress the redundant legend to get the same colouring.
sns.barplot(
    x=locality_price.values,
    y=locality_price.index,
    hue=locality_price.index,
    palette='Blues_r',
    legend=False
)
plt.title("Średnia log-cena nieruchomości wg lokalizacji")
plt.xlabel("Średnia log(1 + PRICE)")
plt.ylabel("Lokalizacja")
plt.show()

# Same procedure for the one-hot TYPE_* columns.
type_cols = [col for col in df_clean.columns if col.startswith('TYPE_')]
df_type = df_clean[type_cols + ['LOG_PRICE']].copy()
df_type['TYPE'] = df_type[type_cols].idxmax(axis=1).str.replace('TYPE_', '')

type_price = df_type.groupby('TYPE')['LOG_PRICE'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=type_price.values,
    y=type_price.index,
    hue=type_price.index,
    palette='Greens_r',
    legend=False
)
plt.title("Średnia log-cena nieruchomości wg typu")
plt.xlabel("Średnia log(1 + PRICE)")
plt.ylabel("Typ nieruchomości")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric columns included in the correlation matrix.
num_cols = ['LOG_PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE']

# Pairwise correlation (DataFrame.corr with its default method).
corr_matrix = df_clean[num_cols].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    linewidths=0.5
)
plt.title("Macierz korelacji cech numerycznych")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of listings by longitude/latitude, coloured by LOG_PRICE.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    df_clean['LONGITUDE'],
    df_clean['LATITUDE'],
    c=df_clean['LOG_PRICE'],
    cmap='viridis',        
    alpha=0.6,
    edgecolors='k',
    linewidths=0.2
)
# Colour bar explains the LOG_PRICE colour scale.
plt.colorbar(scatter, label='Log(1 + Cena nieruchomości)')
plt.title("Mapa rozmieszczenia nieruchomości w Nowym Jorku (wg ceny)")
plt.xlabel("Długość geograficzna")
plt.ylabel("Szerokość geograficzna")
plt.grid(True)
plt.show()

In [None]:
# Write the filtered frame to CSV without the index column.
df_clean.to_csv("oczyszczone_dane.csv", index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Rebuild a single TYPE label per row from the one-hot TYPE_* columns of df_clean.
type_cols = [col for col in df_clean.columns if col.startswith("TYPE_")]
df_type_geo = df_clean[type_cols + ['LATITUDE', 'LONGITUDE']].copy()

df_type_geo['TYPE'] = df_type_geo[type_cols].idxmax(axis=1).str.replace('TYPE_', '')

# Scatter of listings by coordinates, coloured by property type.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df_type_geo,
    x='LONGITUDE',
    y='LATITUDE',
    hue='TYPE',
    alpha=0.7,
    palette='Set2',
    edgecolor='k',
    linewidth=0.2
)
# Title, axis labels and an explicit show() so the figure stands alone and the cell
# does not end on a bare Axes repr; same labelling as the df_encoded version of this plot.
plt.title("Rozmieszczenie nieruchomości wg typu (LATITUDE vs LONGITUDE)")
plt.xlabel("Długość geograficzna")
plt.ylabel("Szerokość geograficzna")
plt.legend(title='Typ nieruchomości', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Start df_encoded over as a fresh copy of df (any earlier df_encoded contents are replaced).
df_encoded = df.copy()

In [None]:
# One-hot TYPE_* columns present in df_encoded.
type_cols = [col for col in df_encoded.columns if col.startswith('TYPE_')]

# Single TYPE label per row: idxmax(axis=1) picks the first TYPE_* column holding the row
# maximum, then the 'TYPE_' prefix is stripped.
df_encoded['TYPE'] = df_encoded[type_cols].idxmax(axis=1).str.replace('TYPE_', '')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of all df_encoded listings by coordinates, coloured by property type.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df_encoded,
    x='LONGITUDE',
    y='LATITUDE',
    hue='TYPE',
    alpha=0.7,
    palette='Set2',
    edgecolor='k',
    linewidth=0.2
)
plt.title("Rozmieszczenie nieruchomości wg typu (LATITUDE vs LONGITUDE)")
plt.xlabel("Długość geograficzna")
plt.ylabel("Szerokość geograficzna")
# Legend is anchored outside the axes, to the right.
plt.legend(title='Typ nieruchomości', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Mean PRICE and LOG_PRICE per property type, sorted by mean LOG_PRICE (highest first).
type_price_stats = df_encoded.groupby('TYPE')[['PRICE', 'LOG_PRICE']].mean().sort_values(by='LOG_PRICE', ascending=False)

# Round to two decimals for display.
type_price_stats = type_price_stats.round(2)
print(type_price_stats)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart of mean LOG_PRICE per property type.
plt.figure(figsize=(10, 6))
# palette without hue is deprecated in seaborn 0.13; map hue to the categorical axis and
# suppress the redundant legend to get the same colouring.
sns.barplot(
    x=type_price_stats['LOG_PRICE'],
    y=type_price_stats.index,
    hue=type_price_stats.index,
    palette='coolwarm',
    legend=False
)
plt.title("Średnia log-cena wg typu nieruchomości")
plt.xlabel("Średnia LOG_PRICE")
plt.ylabel("Typ nieruchomości")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean LOG_PRICE per property type, highest first (replaces the earlier two-column table).
type_price_stats = df_encoded.groupby('TYPE')['LOG_PRICE'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
# palette without hue is deprecated in seaborn 0.13; map hue to the categorical axis and
# suppress the redundant legend to get the same colouring.
sns.barplot(
    x=type_price_stats.values,
    y=type_price_stats.index,
    hue=type_price_stats.index,
    palette='magma',
    legend=False
)
plt.title("Średnia log-cena nieruchomości wg typu")
plt.xlabel("Średnia LOG_PRICE")
plt.ylabel("Typ nieruchomości")
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean LOG_PRICE per property type, highest first.
type_price_stats = df_encoded.groupby('TYPE')['LOG_PRICE'].mean().sort_values(ascending=False)

# Vertical variant of the bar chart: types on the x-axis.
plt.figure(figsize=(12, 6))
# palette without hue is deprecated in seaborn 0.13; map hue to the categorical axis and
# suppress the redundant legend to get the same colouring.
sns.barplot(
    x=type_price_stats.index,
    y=type_price_stats.values,
    hue=type_price_stats.index,
    palette='viridis',
    legend=False
)
plt.title("Średnia log-cena nieruchomości wg typu")
plt.xlabel("Typ nieruchomości")
plt.ylabel("Średnia log(1 + PRICE)")
# Rotate the type labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean LOG_PRICE per property type, highest first.
type_price_stats = df_encoded.groupby('TYPE')['LOG_PRICE'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=type_price_stats.index,
    y=type_price_stats.values,
    color='steelblue',   
    width=0.6            
)

# Write each bar's value just above it (0.05 above the bar height).
for i, v in enumerate(type_price_stats.values):
    ax.text(i, v + 0.05, f"{v:.2f}", ha='center', va='bottom', fontsize=9)

# Styling
plt.title("Średnia log-cena nieruchomości wg typu", fontsize=14)
plt.xlabel("Typ nieruchomości")
plt.ylabel("Średnia log(1 + PRICE)")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean raw PRICE per property type, highest first.
type_price_stats = df_encoded.groupby('TYPE')['PRICE'].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=type_price_stats.index,
    y=type_price_stats.values,
    color='steelblue',
    width=0.6
)

# Write each bar's dollar value above it (offset by 50,000 on the price axis).
for i, v in enumerate(type_price_stats.values):
    ax.text(i, v + 50000, f"${v:,.0f}", ha='center', va='bottom', fontsize=9)

plt.title("Średnia cena nieruchomości wg typu", fontsize=14)
plt.xlabel("Typ nieruchomości")
plt.ylabel("Średnia cena (USD)")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# TYPE labels that are allowed to carry the luxury flag.
luxury_types = ['For sale', 'Townhouse for sale', 'Mobile house for sale']

# LUXURY_HOME is 1 only when every threshold below is met AND the TYPE is in luxury_types.
# NOTE(review): these thresholds differ from the ones used where LUXURY_HOME is derived
# elsewhere in this notebook (2,000,000 / 3000 / 4 / 5, no type filter) — confirm which applies.
df_encoded['LUXURY_HOME'] = (
    (df_encoded['PRICE'] > 1_500_000) &
    (df_encoded['PROPERTYSQFT'] > 2500) &
    (df_encoded['BATH'] >= 3) &
    (df_encoded['BEDS'] >= 4) &
    (df_encoded['TYPE'].isin(luxury_types))
).astype(int)

print("Liczba luksusowych nieruchomości:", df_encoded['LUXURY_HOME'].sum())
print("Łączna liczba nieruchomości:", len(df_encoded))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of listings by coordinates; luxury homes in dark red, all others in light gray.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df_encoded,
    x='LONGITUDE',
    y='LATITUDE',
    hue='LUXURY_HOME',
    palette={0: 'lightgray', 1: 'darkred'},
    alpha=0.7,
    edgecolor='k',
    linewidth=0.2,
    s=60
)
plt.title("Rozmieszczenie luksusowych nieruchomości (LUXURY_HOME = 1)")
plt.xlabel("Długość geograficzna")
plt.ylabel("Szerokość geograficzna")
plt.legend(title="LUXURY_HOME", loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Mean price, area, bathrooms and bedrooms per LUXURY_HOME group, rounded to whole numbers.
df_encoded.groupby('LUXURY_HOME')[['PRICE', 'PROPERTYSQFT', 'BATH', 'BEDS']].mean().round(0)

In [None]:
# Price per square foot.
# NOTE(review): unlike ROOMS_PER_SQFT further down, the denominator is not guarded, so a
# PROPERTYSQFT of 0 would produce inf/NaN here — confirm the column has no zeros.
df_encoded['PRICE_PER_SQFT'] = df_encoded['PRICE'] / df_encoded['PROPERTYSQFT']

print(df_encoded[['PRICE', 'PROPERTYSQFT', 'PRICE_PER_SQFT']].head())

In [None]:
# Descriptive statistics of PRICE_PER_SQFT.
price_per_sqft_stats = df_encoded['PRICE_PER_SQFT'].describe()
print(price_per_sqft_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tukey fences (1.5 * IQR beyond the quartiles) for PRICE_PER_SQFT.
Q1 = df_encoded['PRICE_PER_SQFT'].quantile(0.25)
Q3 = df_encoded['PRICE_PER_SQFT'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")

# Keep rows within the fences. The explicit .copy() makes df_encoded_clean an independent
# frame: later cells assign new columns to it, which on a filtered slice raises
# SettingWithCopyWarning.
df_encoded_clean = df_encoded[
    (df_encoded['PRICE_PER_SQFT'] >= lower_bound) &
    (df_encoded['PRICE_PER_SQFT'] <= upper_bound)
].copy()

print(f"Liczba obserwacji przed czyszczeniem: {len(df_encoded)}")
print(f"Liczba obserwacji po czyszczeniu: {len(df_encoded_clean)}")
print(f"Usunięto: {len(df_encoded) - len(df_encoded_clean)} obserwacji")

# Overlaid density estimates before and after the filter.
plt.figure(figsize=(12, 6))

sns.kdeplot(df_encoded['PRICE_PER_SQFT'], label='Przed czyszczeniem', fill=True, color='skyblue', linewidth=2)

sns.kdeplot(df_encoded_clean['PRICE_PER_SQFT'], label='Po czyszczeniu', fill=True, color='darkgreen', linewidth=2)

plt.title("Rozkład PRICE_PER_SQFT przed i po czyszczeniu outlierów")
plt.xlabel("Cena za stopę kwadratową")
plt.ylabel("Gęstość")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np

# Log-transform the price per square foot: log(1 + PRICE_PER_SQFT).
df_encoded_clean['LOG_PRICE_PER_SQFT'] = np.log1p(df_encoded_clean['PRICE_PER_SQFT'])

print(df_encoded_clean[['PRICE_PER_SQFT', 'LOG_PRICE_PER_SQFT']].head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (with KDE) of LOG_PRICE_PER_SQFT.
plt.figure(figsize=(10, 6))
sns.histplot(df_encoded_clean['LOG_PRICE_PER_SQFT'], bins=50, kde=True, color='mediumseagreen')
plt.title("Rozkład LOG_PRICE_PER_SQFT (log(1 + cena za stopę kwadratową))")
plt.xlabel("log(1 + PRICE_PER_SQFT)")
plt.ylabel("Liczba nieruchomości")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of LOG_PRICE_PER_SQFT (x) against raw PRICE (y).
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_encoded_clean,
    x='LOG_PRICE_PER_SQFT',
    y='PRICE',
    alpha=0.6,
    color='slateblue'
)
plt.title("Zależność LOG_PRICE_PER_SQFT od ceny nieruchomości (PRICE)")
plt.xlabel("LOG_PRICE_PER_SQFT")
plt.ylabel("PRICE (USD)")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Off-diagonal entry of the 2x2 correlation matrix = correlation between the two columns.
# NOTE(review): PRICE_PER_SQFT is computed from PRICE itself, so a strong correlation is
# expected by construction — keep in mind if it is used as a model feature.
correlation = df_encoded_clean[['LOG_PRICE_PER_SQFT', 'PRICE']].corr().iloc[0, 1]
print(f"Korelacja między LOG_PRICE_PER_SQFT a PRICE: {correlation:.2f}")

In [None]:
# Bedrooms per square foot (only BEDS is counted despite the ROOMS name); the 1e-5 term
# keeps the denominator non-zero when PROPERTYSQFT is 0.
df_encoded_clean['ROOMS_PER_SQFT'] = df_encoded_clean['BEDS'] / (df_encoded_clean['PROPERTYSQFT'] + 1e-5)

print(df_encoded_clean[['BEDS', 'PROPERTYSQFT', 'ROOMS_PER_SQFT']].head())

In [None]:
# Descriptive statistics of ROOMS_PER_SQFT.
print(df_encoded_clean['ROOMS_PER_SQFT'].describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE) of ROOMS_PER_SQFT before outlier removal.
plt.figure(figsize=(10, 6))
sns.histplot(df_encoded_clean['ROOMS_PER_SQFT'], bins=50, kde=True, color='darkcyan')
plt.title("Rozkład ROOMS_PER_SQFT")
plt.xlabel("Liczba sypialni na stopę kwadratową")
plt.ylabel("Liczba nieruchomości")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Tukey fences (1.5 * IQR beyond the quartiles) for ROOMS_PER_SQFT.
Q1 = df_encoded_clean['ROOMS_PER_SQFT'].quantile(0.25)
Q3 = df_encoded_clean['ROOMS_PER_SQFT'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows within the fences. The explicit .copy() makes the result an independent frame:
# later cells assign new columns to df_encoded_clean, which on a filtered slice raises
# SettingWithCopyWarning.
df_encoded_clean = df_encoded_clean[
    (df_encoded_clean['ROOMS_PER_SQFT'] >= lower_bound) &
    (df_encoded_clean['ROOMS_PER_SQFT'] <= upper_bound)
].copy()

print(f"Zakres czyszczenia: {lower_bound:.6f} do {upper_bound:.6f}")

In [None]:
# Row count of df_encoded_clean after the ROOMS_PER_SQFT filter.
print(f"Liczba obserwacji po czyszczeniu: {len(df_encoded_clean)}")

In [None]:
# Histogram (with KDE) of ROOMS_PER_SQFT after outlier removal.
plt.figure(figsize=(10, 6))
sns.histplot(df_encoded_clean['ROOMS_PER_SQFT'], bins=50, kde=True, color='darkslateblue')
plt.title("Rozkład ROOMS_PER_SQFT po usunięciu outlierów")
plt.xlabel("Liczba sypialni na stopę kwadratową")
plt.ylabel("Liczba nieruchomości")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Descriptive statistics of ROOMS_PER_SQFT split by the LUXURY_HOME flag.
luxury_comparison = df_encoded_clean.groupby('LUXURY_HOME')['ROOMS_PER_SQFT'].describe()
print(luxury_comparison)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of ROOMS_PER_SQFT per LUXURY_HOME group; hue mirrors x so each group gets its own
# palette colour, and the redundant legend is switched off.
plt.figure(figsize=(8, 5))
sns.boxplot(
    data=df_encoded_clean,
    x='LUXURY_HOME',
    y='ROOMS_PER_SQFT',
    hue='LUXURY_HOME',
    palette={0: 'lightgray', 1: 'gold'},
    legend=False
)
plt.title("ROOMS_PER_SQFT: Luksusowe vs. pozostałe nieruchomości")
plt.xlabel("LUXURY_HOME")
plt.ylabel("Liczba sypialni na stopę kwadratową")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same box plot as the previous cell, drawn in a single colour instead of a per-group palette.
plt.figure(figsize=(8, 5))
sns.boxplot(
    data=df_encoded_clean,
    x='LUXURY_HOME',
    y='ROOMS_PER_SQFT',
    color='lightgray'  
)
plt.title("ROOMS_PER_SQFT: Luksusowe vs. pozostałe nieruchomości")
plt.xlabel("LUXURY_HOME")
plt.ylabel("Liczba sypialni na stopę kwadratową")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Preview the coordinate columns.
print(df_encoded_clean[['LATITUDE', 'LONGITUDE']].head())

In [None]:
# Descriptive statistics (including min/max) of both coordinate columns.
print("Zakres LATITUDE:")
print(df_encoded_clean['LATITUDE'].describe())

print("\nZakres LONGITUDE:")
print(df_encoded_clean['LONGITUDE'].describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plain scatter of listing coordinates (no colour coding, no marker edges).
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df_encoded_clean,
    x='LONGITUDE',
    y='LATITUDE',
    alpha=0.4,
    edgecolor=None
)
plt.title("Rozmieszczenie nieruchomości wg współrzędnych geograficznych")
plt.xlabel("LONGITUDE")
plt.ylabel("LATITUDE")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
def assign_location_category(row):
    """Classify a listing into a coarse area from its coordinates.

    Args:
        row: mapping-like object with 'LATITUDE' and 'LONGITUDE' entries
            (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        'Central', 'NorthEast' or 'SouthWest' when the coordinates fall in the
        corresponding box (tested in that order), otherwise 'Other'.
    """
    latitude, longitude = row['LATITUDE'], row['LONGITUDE']

    # Central box: both bounds inclusive on each axis.
    in_central_box = 40.70 <= latitude <= 40.80 and -74.00 <= longitude <= -73.93
    if in_central_box:
        return 'Central'

    # Strictly north of the central band and strictly east of -74.00.
    if latitude > 40.80 and longitude > -74.00:
        return 'NorthEast'

    # Strictly south of the central band and strictly west of -73.90.
    if latitude < 40.70 and longitude < -73.90:
        return 'SouthWest'

    return 'Other'

# Label every row with its area category (row-wise apply) and show the category frequencies.
df_encoded_clean['LOCATION_CATEGORY'] = df_encoded_clean.apply(assign_location_category, axis=1)

print(df_encoded_clean['LOCATION_CATEGORY'].value_counts())

In [None]:
def classify_borough(row):
    """Approximate the NYC borough of a listing from coordinate bounding boxes.

    Args:
        row: mapping-like object with 'LATITUDE' and 'LONGITUDE' entries
            (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        'Manhattan', 'Brooklyn', 'Queens', 'Bronx' or 'Staten Island' for the
        first matching test below, otherwise 'Other'. The boxes overlap, so the
        order of the tests decides ties.
    """
    latitude, longitude = row['LATITUDE'], row['LONGITUDE']

    if 40.70 <= latitude <= 40.88 and -74.02 <= longitude <= -73.92:
        return 'Manhattan'
    if 40.57 <= latitude <= 40.73 and -74.05 <= longitude <= -73.85:
        return 'Brooklyn'
    if 40.65 <= latitude <= 40.85 and -73.87 <= longitude <= -73.70:
        return 'Queens'
    # NOTE(review): this only matches longitudes WEST of -73.90; the Bronx presumably lies
    # mostly east of that line, so the comparison may be inverted — confirm against the data.
    if latitude > 40.80 and longitude < -73.90:
        return 'Bronx'
    if latitude < 40.61 and longitude < -74.05:
        return 'Staten Island'
    return 'Other'

# Label every row with its approximate borough (row-wise apply) and show the label frequencies.
df_encoded_clean['BOROUGH'] = df_encoded_clean.apply(classify_borough, axis=1)

print(df_encoded_clean['BOROUGH'].value_counts())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of listing coordinates coloured by the derived BOROUGH label.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df_encoded_clean,
    x='LONGITUDE',
    y='LATITUDE',
    hue='BOROUGH',
    palette='Set2',
    alpha=0.6,
    edgecolor=None
)

plt.title("Rozmieszczenie nieruchomości wg dzielnic Nowego Jorku (BOROUGH)")
plt.xlabel("LONGITUDE")
plt.ylabel("LATITUDE")
plt.grid(True)
plt.legend(title="Dzielnica", loc='best')
plt.tight_layout()
plt.show()

In [None]:
pip install folium

In [None]:
import folium
from folium.plugins import MarkerCluster

# Base map centred on the ny_center coordinates.
ny_center = [40.7128, -74.0060]
m = folium.Map(location=ny_center, zoom_start=11)

# All markers are added to a single cluster layer.
marker_cluster = MarkerCluster().add_to(m)

# Marker colour per BOROUGH label.
borough_colors = {
    'Manhattan': 'red',
    'Brooklyn': 'blue',
    'Queens': 'green',
    'Bronx': 'purple',
    'Staten Island': 'orange',
    'Other': 'gray'
}

# One circle marker per listing; a missing BOROUGH value falls back to 'Other', and an
# unmapped label falls back to gray.
for _, row in df_encoded_clean.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    borough = row.get('BOROUGH', 'Other')
    color = borough_colors.get(borough, 'gray')

    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"{borough}"
    ).add_to(marker_cluster)

# Write the interactive map to an HTML file.
m.save("mapa_nieruchomosci.html")

In [None]:
# NOTE(review): this replaces df_encoded_clean with a copy of df_encoded, discarding the
# PRICE_PER_SQFT / ROOMS_PER_SQFT outlier filtering and the columns added to
# df_encoded_clean afterwards (LOG_PRICE_PER_SQFT, ROOMS_PER_SQFT, LOCATION_CATEGORY,
# BOROUGH) — confirm this reset is intended.
df_encoded_clean = df_encoded.copy()

In [None]:
import folium
from folium.plugins import MarkerCluster

# Base map centred on the ny_center coordinates.
ny_center = [40.7128, -74.0060]
m = folium.Map(location=ny_center, zoom_start=11)

# All markers are added to a single cluster layer.
marker_cluster = MarkerCluster().add_to(m)

# Marker colour per BOROUGH label.
borough_colors = {
    'Manhattan': 'red',
    'Brooklyn': 'blue',
    'Queens': 'green',
    'Bronx': 'purple',
    'Staten Island': 'orange',
    'Other': 'gray'
}

# One circle marker per listing.
# NOTE(review): if df_encoded_clean has no BOROUGH column at this point, row.get falls back
# to 'Other' and every marker is drawn gray — confirm.
for _, row in df_encoded_clean.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    borough = row.get('BOROUGH', 'Other')
    color = borough_colors.get(borough, 'gray')

    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"{borough}"
    ).add_to(marker_cluster)

# Write the map to an HTML file, then display it inline as the cell's output.
m.save("mapa_nieruchomosci.html")
m  


In [None]:
# Write df_encoded_clean to CSV without the index column.
# NOTE(review): this is the same file name the notebook reads with pd.read_csv in its first
# cell, so a re-run consumes its own previous output — confirm this round trip is intended.
df_encoded_clean.to_csv("gotowe_dane_do_modelu.csv", index=False)

In [None]:
import folium
from folium.plugins import MarkerCluster

# Base map centred on the ny_center coordinates, using the light CartoDB tile set.
ny_center = [40.7128, -74.0060]
m = folium.Map(location=ny_center, zoom_start=11, tiles='CartoDB positron')

# All markers are added to a single cluster layer.
marker_cluster = MarkerCluster().add_to(m)

# Marker colour per BOROUGH label.
borough_colors = {
    'Manhattan': 'red',
    'Brooklyn': 'blue',
    'Queens': 'green',
    'Bronx': 'purple',
    'Staten Island': 'orange',
    'Other': 'gray'
}

# One circle marker per listing; the popup shows the borough label and the price.
# (The unused `typ` lookup from the original loop has been removed.)
for _, row in df_encoded_clean.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    borough = row.get('BOROUGH', 'Other')
    color = borough_colors.get(borough, 'gray')

    price = int(row['PRICE'])
    popup = f"""
    <b>{borough}</b><br>
    Price: ${price:,.0f}<br>
    """

    folium.CircleMarker(
        location=[lat, lon],
        radius=4,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        popup=popup
    ).add_to(marker_cluster)

# Static HTML legend pinned to the bottom-left corner of the map.
legend_html = """
<div style="
    position: fixed; 
    bottom: 50px; left: 50px; width: 200px; height: 160px; 
    z-index:9999; font-size:14px;
    background-color: white;
    border:2px solid gray;
    border-radius: 8px;
    padding: 10px;
    box-shadow: 2px 2px 5px rgba(0,0,0,0.3);">
<b>Legenda: dzielnice</b><br>
<span style='color:red;'>●</span> Manhattan<br>
<span style='color:blue;'>●</span> Brooklyn<br>
<span style='color:green;'>●</span> Queens<br>
<span style='color:purple;'>●</span> Bronx<br>
<span style='color:orange;'>●</span> Staten Island<br>
<span style='color:gray;'>●</span> Inne
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Display the map inline as the cell's output.
m

In [None]:
import folium
from folium.plugins import MarkerCluster

# Base map centred on the ny_center coordinates, using the light CartoDB tile set.
ny_center = [40.7128, -74.0060]
m = folium.Map(location=ny_center, zoom_start=11, tiles='CartoDB positron')

# Clustered layer that receives the non-luxury listings.
all_properties = MarkerCluster(name='Wszystkie nieruchomości').add_to(m)

# Separate, unclustered layer for luxury listings, shown by default.
luxury_layer = folium.FeatureGroup(name=' Luksusowe nieruchomości', show=True)
m.add_child(luxury_layer)

# Marker colour per BOROUGH label (non-luxury markers only).
borough_colors = {
    'Manhattan': 'red',
    'Brooklyn': 'blue',
    'Queens': 'green',
    'Bronx': 'purple',
    'Staten Island': 'orange',
    'Other': 'gray'
}

# One circle marker per listing; missing BOROUGH / LUXURY_HOME values fall back to
# 'Other' / 0.
for _, row in df_encoded_clean.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    borough = row.get('BOROUGH', 'Other')
    luxury = row.get('LUXURY_HOME', 0)
    price = int(row['PRICE'])
    
    popup = f"""
    <b>{borough}</b><br>
    Cena: ${price:,.0f}<br>
    Luksus: {' Tak' if luxury == 1 else 'Nie'}
    """

    if luxury == 1:
        # Luxury listings: larger gold marker on the dedicated layer.
        folium.CircleMarker(
            location=[lat, lon],
            radius=6,
            color='gold',
            fill=True,
            fill_color='gold',
            fill_opacity=0.9,
            popup=popup
        ).add_to(luxury_layer)
    else:
        # Everything else: small borough-coloured marker in the cluster layer.
        color = borough_colors.get(borough, 'gray')
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.5,
            popup=popup
        ).add_to(all_properties)

# Static HTML legend pinned to the bottom-left corner of the map.
legend_html = """
<div style="
    position: fixed; 
    bottom: 50px; left: 50px; width: 210px; height: 180px; 
    z-index:9999; font-size:14px;
    background-color: white;
    border:2px solid gray;
    border-radius: 8px;
    padding: 10px;
    box-shadow: 2px 2px 5px rgba(0,0,0,0.3);">
<b>Legenda: dzielnice</b><br>
<span style='color:red;'>●</span> Manhattan<br>
<span style='color:blue;'>●</span> Brooklyn<br>
<span style='color:green;'>●</span> Queens<br>
<span style='color:purple;'>●</span> Bronx<br>
<span style='color:orange;'>●</span> Staten Island<br>
<span style='color:gray;'>●</span> Inne<br>
<span style='color:gold;'>●</span> Luksusowe
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Layer toggle control for the two layers added above.
folium.LayerControl().add_to(m)

# Display the map inline as the cell's output.
m

In [None]:
# Frequency of each LUXURY_HOME value before the flag is recomputed below.
df_encoded_clean['LUXURY_HOME'].value_counts()

In [None]:
# Recompute LUXURY_HOME with the stricter thresholds and without the TYPE condition:
# 1 only when all four comparisons hold.
df_encoded_clean['LUXURY_HOME'] = (
    (df_encoded_clean['PRICE'] > 2_000_000) &
    (df_encoded_clean['PROPERTYSQFT'] > 3000) &
    (df_encoded_clean['BATH'] >= 4) &
    (df_encoded_clean['BEDS'] >= 5)
).astype(int)

In [None]:
# NOTE(review): this cell repeats the preceding cell's LUXURY_HOME assignment verbatim;
# re-running it yields the same column, so it looks redundant — confirm before removing.
df_encoded_clean['LUXURY_HOME'] = (
    (df_encoded_clean['PRICE'] > 2_000_000) &
    (df_encoded_clean['PROPERTYSQFT'] > 3000) &
    (df_encoded_clean['BATH'] >= 4) &
    (df_encoded_clean['BEDS'] >= 5)
).astype(int)

In [None]:
# Frequency of each LUXURY_HOME value after the flag was recomputed.
df_encoded_clean['LUXURY_HOME'].value_counts()

In [None]:
import folium
from folium.plugins import MarkerCluster

# Base map centred on New York City.
ny_center = [40.7128, -74.0060]
m = folium.Map(location=ny_center, zoom_start=11, tiles='CartoDB positron')

# Non-luxury listings go into a marker cluster; luxury listings get their own layer.
all_properties = MarkerCluster(name='Wszystkie nieruchomości').add_to(m)
luxury_layer = folium.FeatureGroup(name='Luksusowe nieruchomości', show=True)
m.add_child(luxury_layer)

# Marker colour per borough; boroughs missing from this mapping fall back to gray at lookup.
borough_colors = {
    'Manhattan': 'red',
    'Brooklyn': 'blue',
    'Queens': 'green',
    'Bronx': 'purple',
    'Staten Island': 'orange',
    'Other': 'gray'
}

for _, row in df_encoded_clean.iterrows():
    lat, lon = row['LATITUDE'], row['LONGITUDE']
    borough = row.get('BOROUGH', 'Other')
    luxury = row.get('LUXURY_HOME', 0)
    price = int(row['PRICE'])

    popup = f"""
    <b>{borough}</b><br>
    Cena: ${price:,.0f}<br>
    Luksus: {' Tak' if luxury == 1 else 'Nie'}
    """

    # Pick the marker styling and the destination layer, then draw a single marker.
    if luxury == 1:
        marker_color, marker_radius, marker_opacity = 'gold', 8, 0.95
        target_layer = luxury_layer
    else:
        marker_color = borough_colors.get(borough, 'gray')
        marker_radius, marker_opacity = 3, 0.5
        target_layer = all_properties

    folium.CircleMarker(
        location=[lat, lon],
        radius=marker_radius,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=marker_opacity,
        popup=popup
    ).add_to(target_layer)

# Fixed-position HTML legend explaining the marker colours.
legend_html = """
<div style="
    position: fixed; 
    bottom: 50px; left: 50px; width: 210px; height: 180px; 
    z-index:9999; font-size:14px;
    background-color: white;
    border:2px solid gray;
    border-radius: 8px;
    padding: 10px;
    box-shadow: 2px 2px 5px rgba(0,0,0,0.3);">
<b>Legenda: dzielnice</b><br>
<span style='color:red;'>●</span> Manhattan<br>
<span style='color:blue;'>●</span> Brooklyn<br>
<span style='color:green;'>●</span> Queens<br>
<span style='color:purple;'>●</span> Bronx<br>
<span style='color:orange;'>●</span> Staten Island<br>
<span style='color:gray;'>●</span> Inne<br>
<span style='color:gold;'>●</span> Luksusowe
</div>
"""
legend_element = folium.Element(legend_html)
m.get_root().html.add_child(legend_element)

# Layer toggle control; the bare `m` renders the map as the cell output.
folium.LayerControl().add_to(m)
m

In [None]:
# Count of rows per LOCATION_CATEGORY value as the column currently stands.
df_encoded_clean['LOCATION_CATEGORY'].value_counts()

In [None]:
def assign_location_category(row):
    """Map a listing's coordinates to a coarse location bucket.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'LATITUDE' and 'LONGITUDE'.

    Returns
    -------
    str
        'NorthEast', 'Central' or 'SouthWest' when the coordinates fall inside
        the corresponding box, otherwise 'Other'. The boxes are checked in that order.
    """
    lat, lon = row['LATITUDE'], row['LONGITUDE']

    if lat >= 40.77 and lon >= -73.98:
        return 'NorthEast'
    if 40.72 <= lat < 40.77 and -74.00 <= lon <= -73.93:
        return 'Central'
    if lat < 40.72 and lon < -73.93:
        return 'SouthWest'
    # Anything outside the three boxes (including NaN coordinates, which fail every comparison).
    return 'Other'

# Row-wise application (axis=1): each listing gets the bucket returned by assign_location_category.
df_encoded_clean['LOCATION_CATEGORY'] = df_encoded_clean.apply(assign_location_category, axis=1)

In [None]:
# Count of rows per LOCATION_CATEGORY value after the column was reassigned.
df_encoded_clean['LOCATION_CATEGORY'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Target: log-transformed price. Features: every remaining column except the two price columns.
# NOTE(review): LUXURY_HOME is computed from PRICE (PRICE > 2_000_000) and stays in X, which
# looks like target leakage — confirm whether it (and any other price-derived column) belongs here.
y = df_encoded_clean['LOG_PRICE']
X = df_encoded_clean.drop(columns=['PRICE', 'LOG_PRICE'])

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report (rows, columns) of the full feature matrix and of both splits.
for split_label, split_frame in (
    ("Rozmiar całego zbioru", X),
    ("Zbiór treningowy", X_train),
    ("Zbiór testowy", X_test),
):
    print(f"{split_label}: {split_frame.shape}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the target histograms of both splits to compare their distributions visually.
plt.figure(figsize=(10,5))
for target_values, split_label, split_color in (
    (y_train, 'Train', 'skyblue'),
    (y_test, 'Test', 'orange'),
):
    sns.histplot(target_values, label=split_label, kde=True, color=split_color)

plt.title("Rozkład LOG_PRICE w zbiorach treningowym i testowym")
plt.xlabel("LOG_PRICE")
plt.ylabel("Liczba nieruchomości")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Naive baseline: predict the training-set mean of the target for every test row.
mean_train = y_train.mean()
y_pred_baseline = np.full_like(y_test, fill_value=mean_train)

# Hold-out metrics for the baseline, in the same units as the target.
baseline_mse = mean_squared_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(baseline_mse)
r2_baseline = r2_score(y_test, y_pred_baseline)

print(
    f"Baseline (średnia) RMSE: {rmse_baseline:.4f}",
    f"Baseline R²: {r2_baseline:.4f}",
    sep="\n",
)

In [None]:
# Names of the feature columns stored with object dtype (candidates for one-hot encoding).
X.select_dtypes(include='object').columns

In [None]:
# One-hot encode the three categorical columns on the full feature matrix, dropping the first level of each.
# NOTE(review): X_encoded is not referenced again in the visible cells (the splits are encoded separately) — confirm it is still needed.
X_encoded = pd.get_dummies(X, columns=['BOROUGH', 'LOCATION_CATEGORY', 'TYPE'], drop_first=True)

In [None]:
# List the raw (not yet encoded) column names of both splits, one heading per split.
print("Kolumny w X_train:", X_train.columns.tolist(), sep="\n")
print("\nKolumny w X_test:", X_test.columns.tolist(), sep="\n")

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Categorical columns expanded into indicator (dummy) columns.
categorical_cols = ['BOROUGH', 'LOCATION_CATEGORY', 'TYPE']

# The training split defines the feature space; one level per categorical column is
# dropped and acts as the baseline.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encode the test split WITHOUT drop_first. get_dummies drops the first level present in the
# frame it receives, so dropping independently on the test split can remove a level that is a
# real column in training (whenever the training baseline level is absent from the test split).
# The reindex below would then zero-fill that column and silently relabel those rows as baseline.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

# Keep only the first occurrence of any repeated column label before aligning.
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

# Align to the training columns: extra test columns (the training baseline level and levels
# unseen in training) are discarded, columns missing from the test split are filled with 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print(" Zgodność kolumn po reindexowaniu:", X_train_encoded.columns.equals(X_test_encoded.columns))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fit a linear regression on the encoded training features and predict the hold-out split.
model_lr = LinearRegression().fit(X_train_encoded, y_train)
y_pred_lr = model_lr.predict(X_test_encoded)

# Hold-out metrics (the target is LOG_PRICE, so RMSE is in log units).
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f" Regresja liniowa RMSE: {rmse_lr:.4f}")
print(f" Regresja liniowa R²: {r2_lr:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Actual vs. predicted LOG_PRICE for the linear model; the dashed diagonal marks perfect predictions.
plt.figure(figsize=(8, 6))
# Both artists carry a label: plt.legend() with no labelled artists only emits a warning
# and draws an empty legend.
sns.scatterplot(x=y_test, y=y_pred_lr, alpha=0.6, color='teal', label="Predykcje")
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    'r--',
    label="Idealna predykcja (y = x)",
)
plt.xlabel("Rzeczywiste wartości LOG_PRICE")
plt.ylabel("Przewidywane LOG_PRICE")
plt.title(" Rzeczywiste vs. Przewidywane")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Residuals of the linear model on the hold-out split (actual minus predicted).
residuals = y_test - y_pred_lr

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True, color='purple', ax=ax)
ax.set_title(" Rozkład residuals (błędów predykcji)")
ax.set_xlabel("Residual = Rzeczywiste - Przewidywane")
ax.set_ylabel("Liczba obserwacji")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Random forest with 100 trees and a fixed seed, trained on the encoded features.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_encoded, y_train)
y_pred_rf = rf_model.predict(X_test_encoded)

# Hold-out metrics.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f" Random Forest RMSE: {rmse_rf:.4f}")
print(f" Random Forest R²: {r2_rf:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Actual vs. predicted LOG_PRICE for the random forest; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, alpha=0.5, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--')
ax.set_xlabel("Rzeczywiste LOG_PRICE")
ax.set_ylabel("Przewidywane LOG_PRICE")
ax.set_title("Random Forest: Rzeczywiste vs. Przewidywane")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Residuals of the random forest on the hold-out split (actual minus predicted).
residuals_rf = y_test - y_pred_rf

fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(residuals_rf, bins=50, kde=True, color="orange", ax=ax)
# Vertical reference line at zero error.
ax.axvline(0, color='red', linestyle='--')
ax.set_title("Random Forest: Rozkład błędów (residuals)")
ax.set_xlabel("Błąd predykcji (Rzeczywiste - Przewidywane)")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same configuration as rf_model above, bound to the name model_rf that the
# feature-importance cells read from.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_encoded, y_train)
y_pred_rf = model_rf.predict(X_test_encoded)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Refit the 100-tree random forest (fixed seed) under the name model_rf and score it.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_encoded, y_train)
y_pred_rf = model_rf.predict(X_test_encoded)

# Hold-out metrics.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f" Random Forest RMSE: {rmse_rf:.4f}")
print(f" Random Forest R²: {r2_rf:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every encoded feature with the forest's importance score and keep the 15 largest.
all_importances = pd.DataFrame({
    'Cecha': X_train_encoded.columns,
    'Ważność': model_rf.feature_importances_
})
importance_df = all_importances.sort_values(by='Ważność', ascending=False).head(15)

sns.set(style="whitegrid")

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Ważność', y='Cecha', data=importance_df, palette='rocket')

# Write the numeric score just to the right of each bar.
for bar_index, score in enumerate(importance_df['Ważność']):
    ax.text(score + 0.005, bar_index, f"{score:.3f}", color='black', va='center')

plt.title(" Random Forest – Top 15 najważniejszych cech", fontsize=14)
plt.xlabel("Ważność cechy")
plt.ylabel("Cecha")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Importance scores of the fitted forest, aligned with the encoded feature names.
importances = model_rf.feature_importances_
features = X_train_encoded.columns

# Top 15 features by importance.
importance_table = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_table.sort_values(by='Importance', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis', ax=ax)
ax.set_title("Random Forest: Najważniejsze cechy")
ax.set_xlabel("Ważność cechy")
ax.set_ylabel("Cecha")
fig.tight_layout()
plt.show()

In [None]:
pip install xgboost

In [None]:
# Columns of the encoded training matrix whose dtype is not numeric.
non_numeric_cols = X_train_encoded.select_dtypes(exclude=['number']).columns
print("Kolumny nienumeryczne:", non_numeric_cols.tolist())

In [None]:
# Convert every column of both matrices to a numeric dtype; values that cannot be parsed become NaN.
X_train_encoded, X_test_encoded = (
    X_train_encoded.apply(pd.to_numeric, errors='coerce'),
    X_test_encoded.apply(pd.to_numeric, errors='coerce'),
)

In [None]:
# Distinct dtypes present in the encoded training matrix after the numeric conversion.
print("Typy danych:", X_train_encoded.dtypes.unique())

In [None]:
# Number of column labels in the training matrix that repeat an earlier label.
print(X_train_encoded.columns.duplicated().sum())

In [None]:
# Keep the first occurrence of every training column label, select the same columns from the
# test matrix, and cast both to float32.
first_occurrence = ~X_train_encoded.columns.duplicated()
X_train_encoded = X_train_encoded.loc[:, first_occurrence]
X_test_encoded = X_test_encoded.loc[:, X_train_encoded.columns]

X_train_encoded, X_test_encoded = (
    X_train_encoded.astype(np.float32),
    X_test_encoded.astype(np.float32),
)

In [None]:
# Keep only the first occurrence of each column label, independently in both matrices.
X_train_encoded = X_train_encoded.loc[:, ~X_train_encoded.columns.duplicated()]
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

In [None]:
# Give the test matrix exactly the training columns, in the same order; missing ones are filled with 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [None]:
# Cast both feature matrices to float32 so they share a single numeric dtype.
X_train_encoded = X_train_encoded.astype(np.float32)
X_test_encoded = X_test_encoded.astype(np.float32)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Gradient-boosted trees with 100 estimators and a fixed seed; other settings are library defaults.
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_encoded, y_train)
y_pred_xgb = xgb_model.predict(X_test_encoded)

# Hold-out metrics.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f" XGBoost RMSE: {rmse_xgb:.4f}")
print(f" XGBoost R²: {r2_xgb:.4f}")

In [None]:
# Keep only the first occurrence of each column label in both matrices
# (the same statements also appear in earlier cells).
X_train_encoded = X_train_encoded.loc[:, ~X_train_encoded.columns.duplicated()]
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Refit the linear regression on the cleaned (de-duplicated, float32) feature matrices.
model_lr = LinearRegression().fit(X_train_encoded, y_train)
y_pred_lr = model_lr.predict(X_test_encoded)

# Hold-out metrics after cleaning; these overwrite the earlier rmse_lr / r2_lr values.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f" Regresja liniowa (po oczyszczeniu) RMSE: {rmse_lr:.4f}")
print(f" Regresja liniowa (po oczyszczeniu) R²: {r2_lr:.4f}")

In [None]:
# Keep only the first occurrence of each column label in both matrices
# (the same statements also appear in earlier cells).
X_train_encoded = X_train_encoded.loc[:, ~X_train_encoded.columns.duplicated()]
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Refit the 100-tree random forest (fixed seed) on the cleaned feature matrices.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_encoded, y_train)
y_pred_rf = rf_model.predict(X_test_encoded)

# Hold-out metrics after cleaning; these overwrite the earlier rmse_rf / r2_rf values.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f" Random Forest RMSE (po oczyszczeniu): {rmse_rf:.4f}")
print(f" Random Forest R² (po oczyszczeniu): {r2_rf:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Actual vs. predicted LOG_PRICE for XGBoost; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(x=y_test, y=y_pred_xgb, alpha=0.5, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set_xlabel("Rzeczywista wartość (LOG_PRICE)")
ax.set_ylabel("Przewidywana wartość (XGBoost)")
ax.set_title("XGBoost: Rzeczywiste vs. Przewidywane")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Residuals of XGBoost on the hold-out split (actual minus predicted).
# This rebinds `residuals`, which earlier held the linear-regression residuals.
residuals = y_test - y_pred_xgb

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, bins=50, kde=True, color='darkorange', ax=ax)
ax.set_title(" XGBoost: Rozkład błędów (Residuals)")
ax.set_xlabel("Błąd predykcji (y_test - y_pred)")
ax.set_ylabel("Liczba nieruchomości")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every encoded feature with XGBoost's importance score and keep the 15 largest.
all_xgb_importances = pd.DataFrame({
    'Cecha': X_train_encoded.columns,
    'Ważność': xgb_model.feature_importances_
})
importance_xgb_df = all_xgb_importances.sort_values(by='Ważność', ascending=False).head(15)

sns.set(style="whitegrid")

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Ważność', y='Cecha', data=importance_xgb_df, palette='rocket')

# Write the numeric score just to the right of each bar.
for bar_index, score in enumerate(importance_xgb_df['Ważność']):
    ax.text(score + 0.002, bar_index, f"{score:.3f}", color='black', va='center')

plt.title(" XGBoost – Top 15 najważniejszych cech", fontsize=14)
plt.xlabel("Ważność cechy")
plt.ylabel("Cecha")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Hold-out residuals (actual minus predicted) of the three models.
residuals_lr = y_test - y_pred_lr
residuals_rf = y_test - y_pred_rf
residuals_xgb = y_test - y_pred_xgb

# Overlay the three residual distributions as densities so they are comparable.
plt.figure(figsize=(10, 6))
for model_residuals, model_color, model_label in (
    (residuals_lr, 'skyblue', 'Regresja Liniowa'),
    (residuals_rf, 'forestgreen', 'Random Forest'),
    (residuals_xgb, 'orange', 'XGBoost'),
):
    sns.histplot(model_residuals, color=model_color, label=model_label, kde=True, stat='density', bins=50)

# Vertical reference line at zero error.
plt.axvline(0, color='black', linestyle='--', linewidth=1)
plt.title(" Porównanie błędów predykcji (Residuals) – 3 modele")
plt.xlabel("Residual (y_true - y_pred)")
plt.ylabel("Gęstość (density)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Search space: 2 x 3 x 2 x 2 = 24 parameter combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated grid search scored by negative RMSE, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_encoded, y_train)

print(" Najlepsze parametry:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the hold-out split.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test_encoded)

mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)

print(f" Random Forest (po optymalizacji) RMSE: {rmse_best_rf:.4f}")
print(f" Random Forest (po optymalizacji) R²: {r2_best_rf:.4f}")

In [None]:
# Keep only the first occurrence of each column label in both matrices
# (the same statements also appear in earlier cells).
X_train_encoded = X_train_encoded.loc[:, ~X_train_encoded.columns.duplicated()]
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

In [None]:
# Print the installed scikit-learn version.
import sklearn
print(sklearn.__version__)

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Search space: 2 x 3 x 2 x 2 = 24 parameter combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# Note: this rebinds xgb_model, which an earlier cell had fitted, to a new estimator.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    verbosity=0
)

# 5-fold cross-validated grid search scored by negative RMSE, using all available cores.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)
grid_search_xgb.fit(X_train_encoded, y_train)

print(" Najlepsze parametry:", grid_search_xgb.best_params_)

# Evaluate the best estimator found by the search on the hold-out split.
best_xgb_model = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb_model.predict(X_test_encoded)

mse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb)
rmse_best_xgb = np.sqrt(mse_best_xgb)
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

print(f" XGBoost (po optymalizacji) RMSE: {rmse_best_xgb:.4f}")
print(f" XGBoost (po optymalizacji) R²: {r2_best_xgb:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Display names, in the same order as the metric lists below.
model_names = [
    "Regresja liniowa",
    "Random Forest",
    "Random Forest (opt)",
    "XGBoost",
    "XGBoost (opt)"
]

# Use the metrics computed by the evaluation cells instead of hand-copied numbers, so the chart
# cannot drift from the actual results when the data, the split or the models change.
rmse_values = [rmse_lr, rmse_rf, rmse_best_rf, rmse_xgb, rmse_best_xgb]
r2_values = [r2_lr, r2_rf, r2_best_rf, r2_xgb, r2_best_xgb]

# Two side-by-side bar charts: RMSE on the left, R² on the right.
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

ax[0].bar(model_names, rmse_values, color='skyblue')
ax[0].set_title("RMSE - Błąd średniokwadratowy (niżej = lepiej)")
ax[0].set_ylabel("RMSE")
ax[0].tick_params(axis='x', rotation=20)

ax[1].bar(model_names, r2_values, color='mediumseagreen')
ax[1].set_title("R² - Dopasowanie modelu (bliżej 1 = lepiej)")
ax[1].set_ylabel("R²")
ax[1].tick_params(axis='x', rotation=20)

plt.suptitle(" Porównanie modeli regresji", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Persist the tuned XGBoost estimator to disk (file is written to the current working directory).
import joblib
joblib.dump(best_xgb_model, "xgb_best_model.pkl")

In [None]:
# Persist the training column names (in order) so inputs can later be aligned to the model's features.
feature_columns = X_train_encoded.columns.tolist()
joblib.dump(feature_columns, "feature_columns.pkl")

In [None]:
# Export the working frame, without the index column, to CSV.
df_encoded_clean.to_csv("prawiekoniec_clean.csv", index=False)

In [None]:
import joblib

# Read the persisted column list back and print one feature name per line.
feature_columns = joblib.load("feature_columns.pkl")

print(" Używane cechy przez model:")
for feature_name in feature_columns:
    print(f"- {feature_name}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 10))

# Correlation matrix over the numeric columns of df; only the LOG_PRICE column is plotted,
# sorted from the strongest positive correlation downwards.
corr_matrix = df.corr(numeric_only=True)
log_price_corr = corr_matrix[['LOG_PRICE']].sort_values(by='LOG_PRICE', ascending=False)

sns.heatmap(log_price_corr, annot=True, cmap='coolwarm', cbar=True, linewidths=0.5)
plt.title(" Korelacja cech z LOG_PRICE")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): re-reading the CSV replaces df, so LOG_PRICE must be present in the file itself — confirm.
df = pd.read_csv("gotowe_dane_do_modelu.csv")

plt.figure(figsize=(14, 5))

# Left panel: raw price distribution.
price_ax = plt.subplot(1, 2, 1)
sns.histplot(df["PRICE"], bins=50, kde=True, color="skyblue", ax=price_ax)
price_ax.set_title("Rozkład ceny (PRICE)")
price_ax.set_xlabel("PRICE")
price_ax.set_ylabel("Liczba")

# Right panel: log-price distribution (y-axis label left at the plotting default).
log_price_ax = plt.subplot(1, 2, 2)
sns.histplot(df["LOG_PRICE"], bins=50, kde=True, color="salmon", ax=log_price_ax)
log_price_ax.set_title("Rozkład logarytmicznej ceny (LOG_PRICE)")
log_price_ax.set_xlabel("LOG_PRICE")

plt.tight_layout()
plt.show()

In [None]:
# Price distribution per luxury flag; the y-axis is logarithmic.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x="LUXURY_HOME", y="PRICE", data=df, ax=ax)
ax.set_title("Cena vs LUXURY_HOME")
ax.set_xlabel("Czy luksusowy dom (1 = Tak, 0 = Nie)")
ax.set_ylabel("Cena (USD)")
ax.set_yscale("log")
ax.grid(True)
plt.show()

In [None]:
import plotly.express as px

# Interactive map of the listings: colour encodes LOG_PRICE, marker size encodes PRICE.
map_options = dict(
    lat="LATITUDE",
    lon="LONGITUDE",
    color="LOG_PRICE",
    size="PRICE",
    hover_name="TYPE",
    hover_data=["PRICE", "BEDS", "BATH", "BOROUGH", "LOCATION_CATEGORY"],
    zoom=10,
    mapbox_style="carto-positron",
    title="Lokalizacja nieruchomości z ceną",
)
fig = px.scatter_mapbox(df, **map_options)
fig.show()

In [None]:
import xgboost as xgb
import joblib
import matplotlib.pyplot as plt

# Reload the persisted model together with the column order it was trained on.
model = joblib.load("xgb_best_model.pkl")
feature_columns = joblib.load("feature_columns.pkl")

importances = model.feature_importances_

# Top 15 features by importance.
importance_table = pd.DataFrame({"Feature": feature_columns, "Importance": importances})
feat_df = importance_table.sort_values(by="Importance", ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=feat_df, palette="viridis", ax=ax)
ax.set_title("Najważniejsze cechy wg XGBoost")
ax.set_xlabel("Ważność cechy")
ax.set_ylabel("Cecha")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

# Reload the persisted model together with the column order it was trained on.
model = joblib.load("xgb_best_model.pkl")
feature_columns = joblib.load("feature_columns.pkl")

# Example listing expressed directly in the encoded feature space.
# NOTE(review): the sample looks internally inconsistent — PROPERTYSQFT=65 is tiny for a value in
# square feet, and (40.71, -74.00) falls into 'SouthWest' under assign_location_category while the
# NorthEast indicator is set. Values are kept as-is; confirm the intended example.
user_input = {
    "BEDS": 2,
    "BATH": 1,
    "PROPERTYSQFT": 65,
    "LATITUDE": 40.71,
    "LONGITUDE": -74.00,
    "LUXURY_HOME": 0,
    "PRICE_PER_SQFT": 750,
    "BOROUGH_Manhattan": 1,
    "LOCATION_CATEGORY_NorthEast": 1,
    "TYPE_Condo for sale": 1,
}

# Align the single-row frame to the training columns: features not supplied become 0 and
# keys that are not training columns are discarded.
input_df = pd.DataFrame([user_input])
input_df = input_df.reindex(columns=feature_columns, fill_value=0)

log_price_pred = model.predict(input_df)[0]
# LOG_PRICE is built with np.log1p(PRICE) where this notebook derives it, so the matching
# inverse is expm1; plain exp would overstate every price by exactly 1.
price_pred = np.expm1(log_price_pred)

print(f"Log price: {log_price_pred:.4f}")
print(f"Cena prognozowana: ${price_pred:,.2f}")

In [None]:
# NOTE(review): re-reading the CSV replaces df, so LOG_PRICE must be present in the file itself — confirm.
df = pd.read_csv("gotowe_dane_do_modelu.csv")

plt.figure(figsize=(10,6))

# Historical listings as a translucent cloud.
plt.scatter(df["PROPERTYSQFT"], df["LOG_PRICE"], alpha=0.3, label="Dane historyczne")

# The single prediction from the previous cell, highlighted in red.
plt.scatter(user_input["PROPERTYSQFT"], log_price_pred, color="red", s=100, label="Predykcja API")

plt.title("Metraż vs Logarytmiczna cena")
# The column holds square feet (PROPERTYSQFT), so the axis must not be labelled as m².
plt.xlabel("PROPERTYSQFT (sq ft)")
plt.ylabel("LOG_PRICE")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()