In [1]:
#Part4: Experimentation

In [55]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np

In [5]:
#Part2. Naive data processing: binarizing all fields

train_data = pd.read_csv('my_train.csv').astype(str)
dev_data = pd.read_csv('my_dev.csv').astype(str)

encoder = OneHotEncoder(handle_unknown='ignore')
X_train = encoder.fit_transform(train_data.drop(['Id', 'SalePrice'], axis=1))
y_train = np.log(train_data['SalePrice'].astype(float))

X_dev = encoder.transform(dev_data.drop(['Id', 'SalePrice'], axis=1))
y_dev = np.log(dev_data['SalePrice'].astype(float))

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_dev)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))
print("RMSLE on the development set:", rmsle)


('RMSLE on the development set:', 0.1520164459135018)


In [20]:
#Part3.  Smarter binarization: Only binarizing categorical features

train_data = pd.read_csv('my_train.csv').astype(str)
dev_data = pd.read_csv('my_dev.csv').astype(str)

categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = train_data.select_dtypes(exclude=['object']).columns

# Handle missing values (you can choose a suitable imputation strategy)
train_data[numerical_cols] = train_data[numerical_cols].fillna(train_data[numerical_cols].mean())
train_data[categorical_cols] = train_data[categorical_cols].fillna('missing')

dev_data[numerical_cols] = dev_data[numerical_cols].fillna(train_data[numerical_cols].mean())
dev_data[categorical_cols] = dev_data[categorical_cols].fillna('missing')

# One-hot encode only the categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(train_data[categorical_cols])
X_dev_cat = encoder.transform(dev_data[categorical_cols])

# Combine one-hot encoded categorical features with the original numerical features
X_train = np.hstack((X_train_cat.toarray(), train_data[numerical_cols].values))
X_dev = np.hstack((X_dev_cat.toarray(), dev_data[numerical_cols].values))

y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = np.log(dev_data['SalePrice'].astype(float))

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_dev)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))
print("New dev error:", rmsle)

('New dev error:', 0.1424433144859072)


In [21]:
#Part4-1:linear regression
alphas = np.logspace(-2, 2, 100)  

best_alpha = None
best_rmsle = float('inf')

# Tuning alpha
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    predictions = model.predict(X_dev)
    rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))
    
    if rmsle < best_rmsle:
        best_rmsle = rmsle
        best_alpha = alpha

# Retrain with best alpha
model = Ridge(alpha=best_alpha)
model.fit(X_train, y_train)
predictions = model.predict(X_dev)
num1_rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))

print("Best alpha:", best_alpha)
print("4-1 RMSLE on the development set:", num1_rmsle)


('Best alpha:', 6.135907273413176)
('4-1 RMSLE on the development set:', 0.14031919940125231)


In [28]:
#Part4-2: non-linear regression with PolynomialFeatures

train_data = pd.read_csv('my_train.csv')
dev_data = pd.read_csv('my_dev.csv')

categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = train_data.select_dtypes(exclude=['object']).columns

# Handle missing values
train_data[numerical_cols] = train_data[numerical_cols].fillna(train_data[numerical_cols].mean())
train_data[categorical_cols] = train_data[categorical_cols].fillna('missing')

dev_data[numerical_cols] = dev_data[numerical_cols].fillna(train_data[numerical_cols].mean())
dev_data[categorical_cols] = dev_data[categorical_cols].fillna('missing')

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(train_data[categorical_cols])
X_dev_cat = encoder.transform(dev_data[categorical_cols])

# Generate polynomial features for numerical features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(train_data[numerical_cols])
X_dev_poly = poly.transform(dev_data[numerical_cols])

# Combine one-hot encoded categorical features with polynomial numerical features
X_train = np.hstack((X_train_cat.toarray(), X_train_poly))
X_dev = np.hstack((X_dev_cat.toarray(), X_dev_poly))

# target variable
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = np.log(dev_data['SalePrice'].astype(float))

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_dev)
rmsle_2 = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))

print("non-linear regression with PolynomialFeatures: ",rmsle_2)


('non-linear regression with PolynomialFeatures: ', 0.2993164748931549)


In [54]:
#4-4: Ridge regression model with a preprocessing pipeline
train_data = pd.read_csv('my_train.csv')
dev_data = pd.read_csv('my_dev.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X_train = train_data.drop(['Id', 'SalePrice'], axis=1)
y_train = np.log(train_data['SalePrice'])  # Log transform the target for normality
X_dev = dev_data.drop(['Id', 'SalePrice'], axis=1)
y_dev = np.log(dev_data['SalePrice'])
X_test = test_data.drop(['Id'], axis=1)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Pipeline for numerical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler()),  # Scale features
    ('poly', PolynomialFeatures(degree=2, include_bias=False))  # Add polynomial features
])

# Pipeline for categorical features
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical variables
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

# Create a Ridge regression model with preprocessor
ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', Ridge(alpha=1.0))])

ridge_pipeline.fit(X_train, y_train)

dev_predictions = ridge_pipeline.predict(X_dev)
dev_rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(dev_predictions)))

test_predictions = np.exp(ridge_pipeline.predict(X_test))  # Inverse of log transform

submission_final = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})

submission_final.to_csv('submission_final.csv', index=False)

dev_rmsle, submission_final



(0.1727581847643204,         Id      SalePrice
 0     1461  128633.029473
 1     1462    8689.928732
 2     1463  195572.120419
 3     1464  212989.881972
 4     1465  195708.112526
 5     1466  182178.596630
 6     1467  168941.957295
 7     1468  166005.509156
 8     1469  196550.373525
 9     1470  125819.185801
 10    1471  202819.768557
 11    1472  103506.123919
 12    1473  156805.532156
 13    1474  145160.981194
 14    1475  120800.089309
 15    1476  397496.684628
 16    1477  262854.279940
 17    1478  332177.319005
 18    1479  316146.348014
 19    1480  330147.376839
 20    1481  318048.758481
 21    1482  221713.507288
 22    1483  180630.277712
 23    1484  168980.036541
 24    1485  193194.645806
 25    1486  198370.406773
 26    1487  396530.117477
 27    1488  250368.206033
 28    1489  216055.607666
 29    1490  236953.128322
 ...    ...            ...
 1429  2890   82815.631399
 1430  2891  145618.637742
 1431  2892   41991.918301
 1432  2893   73171.457388
 1433  2