In [219]:
#Part3. Smarter binarization: Only binarizing categorical features
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import numpy as np

In [73]:

train_data = pd.read_csv('my_train.csv')
dev_data = pd.read_csv('my_dev.csv')

categorical_columns = train_data.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_data.select_dtypes(include=['number']).drop(columns=['Id','SalePrice']).columns

# Impute missing values in categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent', fill_value='Missing')
imputed_categorical_data = cat_imputer.fit_transform(train_data[categorical_cols])

# Convert the imputed data to a DataFrame
imputed_categorical_df = pd.DataFrame(imputed_categorical_data, columns=categorical_cols)

# Binarize categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
binarized_categorical_data = encoder.fit_transform(imputed_categorical_df)

# Impute missing values in numerical columns
num_imputer = SimpleImputer(strategy='mean')
for col in numerical_cols:
     imputed_numerical_data = num_imputer.fit_transform(train_data[[col]])
  
# Combine categorical and numerical data
X_train = np.hstack((binarized_categorical_data.toarray(), imputed_numerical_data))

total_features = X_train.shape[1]
print("Total features after smarter binarization:", total_features)



('Total features after smarter binarization:', 252L)


In [220]:

train_data = pd.read_csv('my_train.csv').astype(str)
dev_data = pd.read_csv('my_dev.csv').astype(str)

categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = train_data.select_dtypes(exclude=['object']).columns

train_data[numerical_cols] = train_data[numerical_cols].astype(float).fillna(train_data[numerical_cols].astype(float).mean())
train_data[categorical_cols] = train_data[categorical_cols].fillna('missing')

dev_data[numerical_cols] = dev_data[numerical_cols].astype(float).fillna(train_data[numerical_cols].astype(float).mean())
dev_data[categorical_cols] = dev_data[categorical_cols].fillna('missing')

# One-hot encode only the categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(train_data[categorical_cols])
X_dev_cat = encoder.transform(dev_data[categorical_cols])

# Combine one-hot encoded categorical features with the original numerical features
X_train = np.hstack((X_train_cat.toarray(), train_data[numerical_cols].values.astype(float)))
X_dev = np.hstack((X_dev_cat.toarray(), dev_data[numerical_cols].values.astype(float)))

# Transform target variable (SalePrice) using logarithm
y_train = np.log(train_data['SalePrice'].astype(float))
y_dev = np.log(dev_data['SalePrice'].astype(float))

model = LinearRegression()
model.fit(X_train, y_train)

dev_predictions = model.predict(X_dev)
rmsle_dev = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(dev_predictions)))
print("new dev error:", rmsle_dev)



('new dev error:', 0.1424433144859072)


In [221]:


coefficients = model.coef_

ohe_feature_names = encoder.get_feature_names(input_features=categorical_cols)

all_feature_names = np.concatenate((ohe_feature_names, numerical_cols))

coef_df = pd.DataFrame({'Feature': all_feature_names, 'Coefficient': coefficients})

top_positive_features = coef_df.sort_values(by='Coefficient', ascending=False).head(10)
top_negative_features = coef_df.sort_values(by='Coefficient', ascending=True).head(10)

print("Top 10 most positive features:")
print(top_positive_features)

print("\nTop 10 most negative features:")
print(top_negative_features)


Top 10 most positive features:
      Coefficient               Feature
7216     0.126330            FullBath_3
2518     0.124723         OverallQual_9
2476     0.115180  Neighborhood_StoneBr
2517     0.099363         OverallQual_8
6131     0.084518          2ndFlrSF_472
7236     0.084471       TotRmsAbvGrd_10
2460     0.082575  Neighborhood_Crawfor
2469     0.082453  Neighborhood_NoRidge
7376     0.077604          GarageCars_3
2712     0.077213      RoofMatl_WdShngl

Top 10 most negative features:
      Coefficient            Feature
1329    -0.167008   MSZoning_C (all)
2512    -0.111988      OverallQual_3
7191    -0.105522      GrLivArea_968
2521    -0.099578      OverallCond_3
8326    -0.088475  EnclosedPorch_236
7242    -0.087289     TotRmsAbvGrd_4
7374    -0.084055       GarageCars_1
3759    -0.075352     BsmtFinSF2_311
2221    -0.075352       LotArea_8281
9110    -0.075352    SalePrice_62383


In [163]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

train_df = pd.read_csv('my_train.csv')
dev_df = pd.read_csv('my_dev.csv')

# Handling missing values: Replace missing values in numerical features with their mean
numerical_features_train = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in numerical_features_train:
    mean_value = train_df[feature].mean()
    train_df[feature].fillna(mean_value, inplace=True)

numerical_features_dev = dev_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in numerical_features_dev:
    mean_value = dev_df[feature].mean()
    dev_df[feature].fillna(mean_value, inplace=True)

# One-hot encode categorical features
categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()
train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)
dev_df_encoded = pd.get_dummies(dev_df, columns=categorical_features)

# Align the columns of the development set with those of the training set
final_columns = list(set(train_df_encoded.columns) & set(dev_df_encoded.columns))
dev_df_encoded = dev_df_encoded[final_columns]

# Add missing columns in the development dataset with default value of 0
missing_cols = set(final_columns) - set(dev_df_encoded.columns)
for col in missing_cols:
    dev_df_encoded[col] = 0

# Ensure the order of columns is consistent across both datasets
train_df_encoded = train_df_encoded[final_columns]
dev_df_encoded = dev_df_encoded[final_columns]

# Separate features and target variable
X_train = train_df_encoded.drop(['SalePrice'], axis=1)
y_train = train_df_encoded['SalePrice']
X_dev = dev_df_encoded.drop(['SalePrice'], axis=1)
y_dev = dev_df_encoded['SalePrice']

model = LinearRegression()
model.fit(X_train, y_train)

y_dev_pred = model.predict(X_dev)
y_dev_pred = np.maximum(0, y_dev_pred)  # Ensuring predictions are non-negative

rmsle = np.sqrt(mean_squared_log_error(y_dev, y_dev_pred))
print("new dev error: ", rmsle)


('new dev error: ', 0.16692642333015212)


In [144]:

number_of_features = len(X_train.columns)

print("total number of features:", number_of_features)



('total number of features:', 224)


In [159]:
coefficients = model.coef_

feature_coefficients = zip(X_train.columns, coefficients)

sorted_feature_coefficients = sorted(feature_coefficients, key=lambda x: x[1], reverse=True)

top_positive_features = sorted_feature_coefficients[:10]
top_negative_features = sorted_feature_coefficients[-10:]

print("Top 10 Most Positive Features:")
for feature, coeff in top_10_positive_features:
    print(feature,format(coeff, '.2f'))

print("\nTop 10 Most Negative Features:")
for feature, coeff in top_10_negative_features:
    print(feature,coeff)


Top 10 Most Positive Features:
('RoofMatl_WdShngl', '144827.63')
('Condition2_Artery', '69761.48')
('ExterCond_Ex', '62696.14')
('RoofMatl_CompShg', '49069.11')
('Utilities_AllPub', '48518.35')
('Exterior2nd_ImStucc', '44205.54')
('BsmtQual_Ex', '33893.71')
('BsmtExposure_Gd', '30699.31')
('Condition2_Norm', '28714.75')
('HouseStyle_1Story', '22876.59')

Top 10 Most Negative Features:
('Neighborhood_Sawyer', '-48742.75')
('Neighborhood_Gilbert', '-48899.67')
('Neighborhood_NWAmes', '-49106.39')
('Neighborhood_ClearCr', '-49580.20')
('RoofStyle_Gambrel', '-49672.15')
('Neighborhood_NAmes', '-50094.46')
('Neighborhood_IDOTRR', '-55675.36')
('Neighborhood_OldTown', '-57657.56')
('Neighborhood_Mitchel', '-59590.80')
('Neighborhood_Edwards', '-61843.18')


In [213]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np



train_df = pd.read_csv('my_train.csv')
dev_df = pd.read_csv('my_dev.csv')
test_df = pd.read_csv('test.csv')

combined_df = pd.concat([train_df, dev_df], ignore_index=True)

X = combined_df.drop('SalePrice', axis=1)
y = combined_df['SalePrice']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

numerical_imputer = SimpleImputer(strategy='mean')
numerical_transformer = Pipeline(steps=[
    ('imputer', numerical_imputer)
])

categorical_imputer = SimpleImputer(strategy='most_frequent', fill_value='Missing')
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = LinearRegression()

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])
pipeline.fit(X, y)

X_test = test_df
test_preds = pipeline.predict(X_test)

print(test_preds)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],  
    'SalePrice': test_preds  
})


submission_df.to_csv('submission_p3.csv', index=False)

submission_df.head()

[113661.21177995 152587.60044628 188085.25694171 ... 181621.63722197
 109974.13410105 223994.16789228]


Unnamed: 0,Id,SalePrice
0,1461,113661.21178
1,1462,152587.600446
2,1463,188085.256942
3,1464,199011.971488
4,1465,203652.973986
