In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [6]:
from google.colab import files
import pandas as pd

uploaded = files.upload()
house = pd.read_csv(next(iter(uploaded)))
print(house.head(3))

Saving AMES_housing_Price.csv to AMES_housing_Price (5).csv
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD         Normal     208500  
1   2007        WD         Normal     181500  
2   2008        WD         Normal     223500  

[3 rows x 81 columns]


In [None]:
predictors_num = ['GrLivArea', 'OverallQual']
categorical = ['BldgType']
outcome = 'SalePrice'

columns_needed = predictors_num + categorical + [outcome]
house_clean = house[columns_needed].dropna().reset_index(drop=True)

print(f"Dane: {len(house_clean)} wierszy")

Dane: 1460 wierszy


In [None]:
# ========================================
# KROK 1: STANDARYZUJ ZMIENNE NUMERYCZNE
# ========================================
scaler = StandardScaler()
X_num = house_clean[predictors_num].copy()
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled = pd.DataFrame(X_num_scaled, columns=predictors_num, index=house_clean.index)

print("Zmienne numeryczne (standaryzowane):")
print(X_num_scaled.describe().round(3))

Zmienne numeryczne (standaryzowane):
       GrLivArea  OverallQual
count   1460.000     1460.000
mean      -0.000        0.000
std        1.000        1.000
min       -2.249       -3.688
25%       -0.735       -0.795
50%       -0.098       -0.072
75%        0.497        0.651
max        7.856        2.821


In [None]:
# ========================================
# KROK 2: DUMMY VARIABLES
# ========================================
X_cat = pd.get_dummies(house_clean['BldgType'], drop_first=True)

In [None]:
# ========================================
# KROK 3: POŁĄCZ
# ========================================
X = pd.concat([X_num_scaled, X_cat], axis=1)
y = house_clean[outcome]

In [None]:
# ========================================
# KROK 4: TRENUJ MODEL
# ========================================
model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

In [None]:
# ========================================
# KROK 5: WYNIKI
# ========================================
print("="*60)
print("MODEL REGRESJI")
print("="*60)
print(f"\nRMSE: ${rmse:,.0f}")
print(f"R²: {r2:.4f}")
print()

print("WSPÓŁCZYNNIKI (zmienne standaryzowane):")
print(f"b₀ (Intercept): ${model.intercept_:,.0f}")
print("  → Bazowa cena (średnie wartości zmiennych)")
print()

for i, feature in enumerate(predictors_num):
    std_dev = scaler.scale_[i]
    print(f"b{i+1} ({feature}): ${model.coef_[i]:,.0f}")
    print(f"  → Wzrost o 1 odch.std ({std_dev:,.0f} {feature}) = +${model.coef_[i]:,.0f}")

print()

for i, feature in enumerate(X_cat.columns):
    idx = len(predictors_num) + i
    print(f"b{len(predictors_num)+i+1} ({feature}): ${model.coef_[idx]:,.0f}")
    print(f"  → {feature} vs baseline (1Fam): ${model.coef_[idx]:,.0f}")

print()

MODEL REGRESJI

RMSE: $41,850
R²: 0.7223

WSPÓŁCZYNNIKI (zmienne standaryzowane):
b₀ (Intercept): $183,729
  → Bazowa cena (średnie wartości zmiennych)

b1 (GrLivArea): $28,713
  → Wzrost o 1 odch.std (525 GrLivArea) = +$28,713
b2 (OverallQual): $45,325
  → Wzrost o 1 odch.std (1 OverallQual) = +$45,325

b3 (2fmCon): $-19,655
  → 2fmCon vs baseline (1Fam): $-19,655
b4 (Duplex): $-15,721
  → Duplex vs baseline (1Fam): $-15,721
b5 (Twnhs): $-33,782
  → Twnhs vs baseline (1Fam): $-33,782
b6 (TwnhsE): $-10,704
  → TwnhsE vs baseline (1Fam): $-10,704



In [None]:
# ========================================
# KROK 6: INTERPRETACJA
# ========================================
print("="*60)
print("INTERPRETACJA (w jednostkach oryginalnych)")
print("="*60)
print()

coef_sqft = model.coef_[0] / scaler.scale_[0]
coef_grade = model.coef_[1] / scaler.scale_[1]

print(f"Każdy sqft GrLivArea więcej → +${coef_sqft:,.2f}")
print(f"Każdy grade OverralQual wyżej → +${coef_grade:,.0f}")
print()


INTERPRETACJA (w jednostkach oryginalnych)

Każdy sqft GrLivArea więcej → +$54.66
Każdy grade OverralQual wyżej → +$32,784



In [None]:
# ========================================
# KROK 7: PREDYKCJA
# ========================================
print("="*60)
print("PREDYKCJA (średnie wartości)")
print("="*60)
print()

# Dom Multiplex - średnie cechy (standaryzacja = 0)
new_house_1 = pd.DataFrame({
    'GrLivArea': [0],  # 0 = średnia (standaryzowana)
    'OverallQual': [0]
})
for col in X_cat.columns:
    new_house_1[col] = 0

price_1 = model.predict(new_house_1)[0]
print(f"Multiplex (średnie cechy): ${price_1:,.0f}")

# Inne PropertyType
for cat in X_cat.columns:
    new_house = pd.DataFrame({
        'GrLivArea': [0],
        'OverallQual': [0]
    })
    for col in X_cat.columns:
        new_house[col] = 0
    new_house[cat] = 1

    price_cat = model.predict(new_house)[0]
    diff = price_cat - price_1
    print(f"{cat} (średnie cechy): ${price_cat:,.0f} (różnica: ${diff:,.0f})")

PREDYKCJA (średnie wartości)

Multiplex (średnie cechy): $183,729
2fmCon (średnie cechy): $164,074 (różnica: $-19,655)
Duplex (średnie cechy): $168,008 (różnica: $-15,721)
Twnhs (średnie cechy): $149,947 (różnica: $-33,782)
TwnhsE (średnie cechy): $173,025 (różnica: $-10,704)
