In [1]:
# Part 2. Naive data processing: binarize (one-hot encode) every field

In [14]:
!pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading https://files.pythonhosted.org/packages/92/ea/82a9ae87428ce6fb0956c89b1947cc6a70f6c6548b1b2b9da34c4511fe0d/scikit_learn-0.20.4-cp27-cp27m-win_amd64.whl (4.9MB)
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.20.3
    Uninstalling scikit-learn-0.20.3:
      Successfully uninstalled scikit-learn-0.20.3
Successfully installed scikit-learn-0.20.4


DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support


In [13]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import numpy as np

In [14]:
# Load the train and dev splits and turn every column into strings so that
# each distinct value can later be treated as its own category.
# NOTE(review): the cast happens after pandas has inferred dtypes, so the same
# number can render as '2' in one file and '2.0' in another if only one of them
# has missing values in that column -- confirm the splits infer matching dtypes.
train_data, dev_data = (
    pd.read_csv(csv_path).astype(str)
    for csv_path in ('my_train.csv', 'my_dev.csv')
)

In [15]:
# One binary column per (field, value) pair. Values never seen during fit are
# ignored at transform time instead of raising an error.
encoder = OneHotEncoder(
    handle_unknown='ignore',
)

In [16]:
# Targets: the model is trained on log(SalePrice). The column was stringified
# on load, so it is cast back to float before taking the log.
y_train = train_data['SalePrice'].astype(float).pipe(np.log)
y_dev = dev_data['SalePrice'].astype(float).pipe(np.log)

# Features: every column except the row identifier and the target. The encoder
# learns its categories from the training split only and is reused for dev.
X_train = encoder.fit_transform(
    train_data.drop(labels=['Id', 'SalePrice'], axis='columns')
)
X_dev = encoder.transform(
    dev_data.drop(labels=['Id', 'SalePrice'], axis='columns')
)

In [17]:
# Fit an ordinary least-squares model on the one-hot features against log-price.
model = LinearRegression()
# Left as the cell's last expression so the fitted estimator's repr is shown.
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:
# Score the dev split. Targets and predictions are both in log space, so they
# are exponentiated back to prices before being passed to
# mean_squared_log_error, which applies its own log(1 + x) transform.
predictions = model.predict(X_dev)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_dev), np.exp(predictions)))
# Build one string: under Python 2, print("label", value) prints the repr of a
# tuple rather than the intended message.
print("RMSLE on the development set: {:.6f}".format(rmsle))

('RMSLE on the development set:', 0.1520164459135018)


In [20]:
#2-4
# Rank the one-hot features by their learned coefficient.

# Column labels that were fed to the encoder, used as prefixes in the names.
feature_columns = train_data.drop(['Id', 'SalePrice'], axis=1).columns

# Newer scikit-learn exposes get_feature_names_out and drops get_feature_names;
# use whichever the installed release provides.
name_getter = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
feature_names = name_getter(input_features=feature_columns)

# One coefficient per one-hot column, in the same order as feature_names.
coefficients = model.coef_

# Pair every feature name with its coefficient.
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

top_positive_features = feature_importance.sort_values(by='Coefficient', ascending=False).head(10)
top_negative_features = feature_importance.sort_values(by='Coefficient', ascending=True).head(10)

# Print label and table separately: under Python 2, print("label", frame)
# prints the repr of a tuple, which mangles the table layout.
print("Top 10 Positive Features:")
print(top_positive_features)
print("Top 10 Negative Features:")
print(top_negative_features)


('Top 10 Positive Features:',       Coefficient               Feature
5902     0.141680            FullBath_3
1204     0.138959         OverallQual_9
1162     0.125190  Neighborhood_StoneBr
4817     0.113036          2ndFlrSF_472
1203     0.108436         OverallQual_8
1398     0.093044      RoofMatl_WdShngl
5185     0.090791        GrLivArea_1192
6062     0.088006          GarageCars_3
1155     0.087118  Neighborhood_NoRidge
878      0.085485          LotArea_8029)
('Top 10 Negative Features:',       Coefficient            Feature
15      -0.190855   MSZoning_C (all)
5877    -0.126895      GrLivArea_968
7012    -0.122757  EnclosedPorch_236
1198    -0.113671      OverallQual_3
2445    -0.108055     BsmtFinSF2_311
907     -0.108055       LotArea_8281
1207    -0.100481      OverallCond_3
6060    -0.091607       GarageCars_1
1195    -0.088998      OverallQual_1
698     -0.087902       LotArea_5000)


In [113]:
#2-5
# The intercept is the model's output (in log-price space) when every one-hot
# column is zero.
bias = model.intercept_
# Build one string: under Python 2, print("label", value) prints a tuple repr.
print("Bias of the model: {:.6f}".format(bias))

('Bias of the model:', 12.02978365780805)


In [114]:
# 2-6. Predict SalePrice for test.csv

In [39]:
# Load the test file and stringify every column, mirroring how the train and
# dev splits were loaded so the fitted encoder sees the same kind of values.
test_data = (
    pd.read_csv('test.csv')
    .astype(str)
)

In [40]:
# Preview only the first rows instead of rendering the whole frame.
test_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [41]:
# Encode the test rows with the encoder fitted on the training split. Only the
# identifier is dropped here -- presumably test.csv has no SalePrice column.
X_test = encoder.transform(
    test_data.drop(labels=['Id'], axis='columns')
)

# The model predicts log-price; exponentiate to get prices back.
test_predictions = model.predict(X_test)
test_predictions_exp = np.exp(test_predictions)

In [42]:
# Two-column submission table: the original Id next to its predicted price.
submission = test_data[['Id']].assign(SalePrice=test_predictions_exp)
# index=False keeps the DataFrame's row index out of the written file.
submission.to_csv('submission.csv', index=False)


In [43]:
# Preview only the first rows instead of rendering the whole frame.
submission.head(10)

Unnamed: 0,Id,SalePrice
0,1461,144203.350084
1,1462,158373.425724
2,1463,189955.072455
3,1464,217767.956985
4,1465,213163.055482
5,1466,193050.619210
6,1467,175873.949277
7,1468,191376.109510
8,1469,167496.697738
9,1470,137842.248749
