In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
submission_df = pd.read_csv('dataset/sample_submission.csv')


In [3]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [None]:


features = ['MSSubClass', 'Street']
target = 'SalePrice'

X = train_df[features]
y = train_df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



ValueError: could not convert string to float: 'Pave'

In [None]:

model = LinearRegression()
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
print(f"Validation MSE: {mse:.2f}")
print(f"Model Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

X_test = test_df[features]
test_preds = model.predict(X_test)

submission_df[target] = test_preds
submission_df.to_csv('my_submission.csv', index=False)
print("✅ Submission saved to 'my_submission.csv'")


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
submission_df = pd.read_csv('dataset/sample_submission.csv')

train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

target = 'SalePrice'  
X = train_df.drop(columns=[target])
y = train_df[target]

combined = pd.concat([X, test_df], keys=['train', 'test'])

combined_encoded = pd.get_dummies(combined)
combined_encoded = combined_encoded.fillna(combined_encoded.mean()) 

X_encoded = combined_encoded.xs('train')
X_test_encoded = combined_encoded.xs('test')

X_test_encoded = X_test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
print(f"Validation MSE: {mse:.2f}")

test_preds = model.predict(X_test_encoded)

submission_df[target] = test_preds
submission_df.to_csv('my_submission.csv', index=False)
print("✅ Submission saved as 'my_submission.csv'")


Validation MSE: 4272816756.09
✅ Submission saved as 'my_submission.csv'


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")


In [None]:
X = train_df.drop(["SalePrice", "Id"], axis=1)
y = train_df["SalePrice"]
test_ids = test_df["Id"]
X_test = test_df.drop(["Id"], axis=1)


In [None]:

numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns


In [None]:

numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])


In [None]:
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])


In [None]:
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])


In [None]:
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=0))
])


In [None]:

model_pipeline.fit(X, y)


In [None]:
preds = model_pipeline.predict(X_test)

In [None]:
submission_df["SalePrice"] = preds
submission_df.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' created successfully.")


Submission file 'submission.csv' created successfully.
