In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the raw training and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Quick look at the first 30 training rows.
data = train_df.head(30)
print(data)

selected_features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
target = 'SalePrice'

# Rows without a target can be neither fitted nor scored, so drop them first.
train_df = train_df.dropna(subset=[target])

X = train_df[selected_features]
# The model is fitted on log1p(target); predictions are mapped back with expm1 below.
y = np.log1p(train_df[target])

# Split BEFORE imputing so the hold-out rows cannot influence the fill values.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the imputation medians from the training split only, then apply the
# same values everywhere. Computing them on the full table would leak
# validation statistics into preprocessing (the scaler below already follows
# the same fit-on-train-only rule).
feature_medians = X_train.median()
X_train = X_train.fillna(feature_medians)
X_valid = X_valid.fillna(feature_medians)
# Keep the full feature frame free of NaNs as well, filled with the same values.
X = X.fillna(feature_medians)
for col in selected_features:
    test_df[col] = test_df[col].fillna(feature_medians[col])

# Standardise features using statistics from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(test_df[selected_features])

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Undo the log1p transform so the error is reported on the original target scale.
y_pred_log = model.predict(X_valid_scaled)
y_pred = np.expm1(y_pred_log)
y_valid_actual = np.expm1(y_valid)

rmse = np.sqrt(mean_squared_error(y_valid_actual, y_pred))
print("Validation RMSE:", rmse)

# Predict the test set and map the predictions back from log space.
test_predictions_log = model.predict(X_test_scaled)
test_predictions = np.expm1(test_predictions_log)

# Persist one prediction per test Id.
output = pd.DataFrame({
    'Id': test_df['Id'],
    'PredictedSalePrice': test_predictions
})
output.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")


    Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0    1          60       RL         65.0     8450   Pave   NaN      Reg   
1    2          20       RL         80.0     9600   Pave   NaN      Reg   
2    3          60       RL         68.0    11250   Pave   NaN      IR1   
3    4          70       RL         60.0     9550   Pave   NaN      IR1   
4    5          60       RL         84.0    14260   Pave   NaN      IR1   
5    6          50       RL         85.0    14115   Pave   NaN      IR1   
6    7          20       RL         75.0    10084   Pave   NaN      Reg   
7    8          60       RL          NaN    10382   Pave   NaN      IR1   
8    9          50       RM         51.0     6120   Pave   NaN      Reg   
9   10         190       RL         50.0     7420   Pave   NaN      Reg   
10  11          20       RL         70.0    11200   Pave   NaN      Reg   
11  12          60       RL         85.0    11924   Pave   NaN      IR1   
12  13          20       

In [17]:
# Reload the predictions file from disk and show its first 50 rows
# as a sanity check on what was actually written.
pred = pd.read_csv('predictions.csv')
print(pred.iloc[:50])

      Id  PredictedSalePrice
0   1461       116389.953251
1   1462       139712.430042
2   1463       193635.415523
3   1464       192609.122980
4   1465       180811.931942
5   1466       187933.292792
6   1467       168110.447113
7   1468       179765.287213
8   1469       182952.338358
9   1470       135575.974579
10  1471       192969.446542
11  1472       117936.299531
12  1473       120263.687185
13  1474       165618.773922
14  1475       119436.094213
15  1476       323865.532932
16  1477       243129.392574
17  1478       258531.774300
18  1479       275018.674813
19  1480       431166.214574
20  1481       309618.393326
21  1482       199548.980511
22  1483       189720.711787
23  1484       178374.529237
24  1485       168023.224067
25  1486       203987.042303
26  1487       320532.816003
27  1488       251489.936511
28  1489       187128.190179
29  1490       208737.983302
30  1491       198247.144679
31  1492       102638.565646
32  1493       175941.111836
33  1494      