<a href="https://colab.research.google.com/github/HyVeel/eksploracja-danych/blob/main/zaj8/reg_cat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [67]:
# Load the house-sales table; the file is tab-separated rather than comma-separated.
house = pd.read_csv("house_sales.csv", delimiter="\t")

# Preview the first three rows as the cell's rich output.
house.head(3)

Unnamed: 0,DocumentDate,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,2014-09-16,280000,1000102,Multiplex,2014-09-01,405100,0.930836,300805.0,2,9373,...,3.0,6,7,1991,0,0,70000,229000,98002,False
2,2006-06-16,1000000,1200013,Single Family,2006-06-01,404400,0.929228,1076162.0,1,20156,...,3.75,4,10,2005,0,0,203000,590000,98166,True
3,2007-01-29,745000,1200019,Single Family,2007-01-01,425600,0.977941,761805.0,1,26036,...,1.75,4,8,1947,0,0,183000,275000,98166,False


In [68]:
# Model specification: numeric predictors, categorical predictors and the target column.
predictors_num = ["SqFtTotLiving", "BldgGrade"]
categorical = ["PropertyType"]
outcome = "AdjSalePrice"

# Every column the model needs, in the order: numeric, categorical, target.
columns_needed = [*predictors_num, *categorical, outcome]

# Keep only those columns, drop rows with any missing value among them,
# and renumber the rows so the index is a clean 0..n-1 range.
house_clean = (
    house.loc[:, columns_needed]
    .dropna()
    .reset_index(drop=True)
)

print(f"Kolumny: {columns_needed}", f"Dane: {house_clean.shape[0]} wierszy", sep="\n")
house_clean.head(3)

Kolumny: ['SqFtTotLiving', 'BldgGrade', 'PropertyType', 'AdjSalePrice']
Dane: 22687 wierszy


Unnamed: 0,SqFtTotLiving,BldgGrade,PropertyType,AdjSalePrice
0,2400,7,Multiplex,300805.0
1,3764,10,Single Family,1076162.0
2,2060,8,Single Family,761805.0


In [69]:
# ========================================
# STEP 1: STANDARDIZE THE NUMERIC VARIABLES
# ========================================

# Work on a copy so the cleaned frame itself is left untouched.
X_num = house_clean[predictors_num].copy()

# Fit the scaler first; the fitted object is kept because later cells read
# scaler.scale_ to convert coefficients back to the original units.
scaler = StandardScaler().fit(X_num)

# Wrap the transformed values in a DataFrame so the column names and the
# row index of house_clean are preserved.
X_num_scaled = pd.DataFrame(
    scaler.transform(X_num),
    columns=predictors_num,
    index=house_clean.index,
)

print("Zmienne numeryczne (standaryzowane):")
print(X_num_scaled.describe().round(3))
X_num_scaled.head(3)

Zmienne numeryczne (standaryzowane):
       SqFtTotLiving  BldgGrade
count      22687.000  22687.000
mean          -0.000      0.000
std            1.000      1.000
min           -1.872     -3.965
25%           -0.723     -0.577
50%           -0.186     -0.577
75%            0.503      0.270
max            9.478      4.506


Unnamed: 0,SqFtTotLiving,BldgGrade
0,0.350035,-0.576873
1,1.84283,1.964557
2,-0.02207,0.27027


In [70]:
# ========================================
# STEP 2: DUMMY VARIABLES
# ========================================

# One-hot encode the categorical columns. drop_first=True removes the first
# level of each column (here Multiplex), which then acts as the reference
# category that the remaining dummies are compared against.
cat_source = house_clean[categorical]
X_cat = pd.get_dummies(cat_source, drop_first=True)

print("Zmienne kategoryczne:", X_cat.head(3), sep="\n")

Zmienne kategoryczne:
   PropertyType_Single Family  PropertyType_Townhouse
0                       False                   False
1                        True                   False
2                        True                   False


In [71]:
# ========================================
# STEP 3: COMBINE
# ========================================

# Feature matrix: the scaled numeric columns followed by the dummy columns,
# placed side by side and aligned on the shared row index.
feature_parts = [X_num_scaled, X_cat]
X = pd.concat(feature_parts, axis="columns")

# Target vector.
y = house_clean[outcome]

for caption, preview in (
    ("Zmienne niezależne:", X.head(3)),
    ("\nZmienna zależna:", y.head(3)),
):
    print(caption)
    print(preview)

Zmienne niezależne:
   SqFtTotLiving  BldgGrade  PropertyType_Single Family  \
0       0.350035  -0.576873                       False   
1       1.842830   1.964557                        True   
2      -0.022070   0.270270                        True   

   PropertyType_Townhouse  
0                   False  
1                   False  
2                   False  

Zmienna zależna:
0     300805.0
1    1076162.0
2     761805.0
Name: AdjSalePrice, dtype: float64


In [72]:
# ========================================
# STEP 4: TRAIN THE MODEL
# ========================================

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X, y)

# In-sample predictions: the metrics below are computed on the same rows
# the model was fitted on, not on held-out data.
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

print("Model liniowy:", f"RMSE: ${rmse:.0f}", f"R²: {r2:.4f}", sep="\n")

Model liniowy:
RMSE: $263718
R²: 0.5318


In [73]:
# ========================================
# STEP 5: RESULTS
# ========================================

print("="*60)
print("MODEL REGRESJI")
print("="*60)
print(f"\nRMSE: ${rmse:.0f}")
print(f"R²: {r2:.4f}")
print()

# Look coefficients up by feature name (the columns the model was fitted on)
# instead of by hand-computed position, so a change in column order or in the
# number of numeric predictors cannot silently pair a label with the wrong value.
coef_by_feature = pd.Series(model.coef_, index=X.columns)
position_of = {feature: pos for pos, feature in enumerate(X.columns, start=1)}

# The scaler was fitted on house_clean[predictors_num], so scale_ follows that order.
scale_by_feature = dict(zip(predictors_num, scaler.scale_))

# Map every dummy column to the level that drop_first=True removed for its
# source column. get_dummies orders levels the same way a categorical
# conversion does, so the first category is the reference level.
reference_level = {}
for col in categorical:
    levels = house_clean[col].astype("category").cat.categories
    for level in levels[1:]:
        reference_level[f"{col}_{level}"] = levels[0]

print("WSPÓŁCZYNNIKI (zmienne standaryzowane):")
print(f"b₀ (Intercept): {model.intercept_:.0f}")
# The intercept is the prediction with every feature at 0: numeric features at
# their mean (standardized) and all dummies off, i.e. the reference category.
print(f"  → Bazowa cena (średnie wartości zmiennych) = ${model.intercept_:.0f}")
print()

for feature in predictors_num:
    coef = coef_by_feature[feature]
    std_dev = scale_by_feature[feature]

    print(f"b{position_of[feature]} ({feature}): {coef:.0f}")
    # One standard deviation equals std_dev in the column's original units.
    print(f"\t→ Wzrost o 1 odch.std ({std_dev:.2f} {feature}) = +${coef:.0f}")

print()

for feature in X_cat.columns:
    coef = coef_by_feature[feature]

    print(f"b{position_of[feature]} ({feature}): {coef:.0f}")
    # A dummy coefficient is the price difference relative to the reference level.
    print(f"\t→ {feature} vs {reference_level[feature]}: ${coef:.0f}")

MODEL REGRESJI

RMSE: $263718
R²: 0.5318

WSPÓŁCZYNNIKI (zmienne standaryzowane):
b₀ (Intercept): 603184
  → Bazowa cena (średnie wartości zmiennych) = $603184

b1 (SqFtTotLiving): 161822
	→ Wzrost o 1 odch.std (913.72 SqFtTotLiving) = +$161822
b2 (BldgGrade): 136058
	→ Wzrost o 1 odch.std (1.18 BldgGrade) = +$136058

b3 (PropertyType_Single Family): -36612
	→ PropertyType_Single Family vs Multiplex: $-36612
b4 (PropertyType_Townhouse): -59886
	→ PropertyType_Townhouse vs Multiplex: $-59886


In [74]:
# ========================================
# STEP 6: INTERPRETATION
# ========================================

banner = "=" * 60
print(banner, "INTERPRETACJA (w jednostkach oryginalnych)", banner, "", sep="\n")

# A coefficient on a standardized feature is the price change per one standard
# deviation; dividing by that standard deviation gives the change per one
# original unit. Positions 0 and 1 are SqFtTotLiving and BldgGrade, following
# the order of predictors_num.
coef_sqft, coef_grade = (
    model.coef_[pos] / scaler.scale_[pos] for pos in range(2)
)

print(f"Każdy sqft więcej → +${coef_sqft:.0f}")
print(f"Każdy grade wyżej → +${coef_grade:.0f}")

INTERPRETACJA (w jednostkach oryginalnych)

Każdy sqft więcej → +$177
Każdy grade wyżej → +$115261


In [75]:
# ========================================
# STEP 7: PREDICTION
# ========================================

def _average_house(active_dummy=None):
    """Build a one-row feature frame for a house with average numeric features.

    Every column the model was fitted on (the columns of ``X``) is set to 0:
    for the standardized numeric features 0 is the mean, and for the dummy
    columns 0 means "not this category", so the all-zero row is the
    reference category.

    Parameters
    ----------
    active_dummy : str, optional
        Name of one dummy column to switch to 1. ``None`` leaves every
        dummy at 0.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns match ``X.columns`` in name and order.

    Raises
    ------
    KeyError
        If ``active_dummy`` is not one of the fitted feature columns.
    """
    row = pd.DataFrame({column: [0] for column in X.columns})
    if active_dummy is not None:
        if active_dummy not in row.columns:
            # Assigning an unknown name would add a new column and break predict().
            raise KeyError(f"unknown dummy column: {active_dummy!r}")
        row[active_dummy] = 1
    return row


print("="*60)
print("PREDYKCJA (średnie wartości)")
print("="*60)
print()

# Label for the all-zero row: the level dropped by drop_first=True for each
# categorical column (the first category), derived from the data rather than
# hard-coded.
baseline_label = " / ".join(
    str(house_clean[col].astype("category").cat.categories[0]) for col in categorical
)

# Reference-category house with average features (all standardized values = 0).
new_house_1 = _average_house()
price_1 = model.predict(new_house_1)[0]  # predict() returns an array with one element
print(f"{baseline_label} (średnie cechy): ${price_1:.0f}")

# The other property types: same average house, with exactly one dummy set to 1.
for cat in X_cat.columns:
    new_house = _average_house(cat)

    price_cat = model.predict(new_house)[0]
    diff = price_cat - price_1

    print(f"{cat} (średnie cechy): ${price_cat:.0f} (różnica: ${diff:.0f})")

PREDYKCJA (średnie wartości)

Multiplex (średnie cechy): $603184
PropertyType_Single Family (średnie cechy): $566573 (różnica: $-36612)
PropertyType_Townhouse (średnie cechy): $543298 (różnica: $-59886)
