# Import Libraries and Data

In [26]:
import helper

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
real_estate = pd.read_csv('../datasets/processed_train.csv')

In [29]:
def simple_interaction_feature(df, list_of_old_features):
    """Multiply several columns of ``df`` together, element by element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns. It is only read, never modified.
    list_of_old_features : iterable of str
        Labels of the columns to multiply.

    Returns
    -------
    pandas.Series
        Row-wise product of the requested columns. When
        ``list_of_old_features`` is empty, a Series of ones indexed like
        ``df`` is returned (the empty product), so the return type is
        always a Series.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``df``.
    """
    feature_names = list(list_of_old_features)
    if not feature_names:
        # Without this guard the function would hand back the bare int 1.
        return pd.Series(1, index=df.index)

    # Start from the multiplicative identity so that a single feature still
    # yields a new Series rather than a reference to the original column.
    new_feature_val = 1
    for feature in feature_names:
        new_feature_val = new_feature_val * df[feature]
    return new_feature_val

# Engineering Features

pool_score
garage_score
fireplace_score
basement_score
kitchen_score
overall_score

Pool = Pool Area * Pool QC
Garage = Garage Area * Garage Qual * Garage Cars * Garage Cond
Fireplace = Fireplaces * Fireplace Qu
Basement = Total Bsmt SF * Bsmt Qual * Bsmt Cond
Kitchen = Kitchen AbvGr * Kitchen Qual
Overall = Overall Qual * Overall Cond

## Pool

In [30]:
# pool_score: element-wise product of pool_area and pool_qc.
real_estate['pool_score'] = simple_interaction_feature(
    real_estate, ['pool_area', 'pool_qc']
)

In [31]:
real_estate['pool_score'].value_counts()

0       2042
1038       1
1104       1
1140       1
1296       1
1920       1
2304       1
2952       1
3200       1
1683       1
Name: pool_score, dtype: int64

## Garage

In [32]:
# garage_score: element-wise product of the four garage columns listed below.
real_estate['garage_score'] = simple_interaction_feature(
    real_estate,
    ['garage_area', 'garage_qual', 'garage_cars', 'garage_cond'],
)

In [33]:
real_estate['garage_score'].value_counts()

0.0        114
7920.0      68
10368.0     68
8712.0      51
9504.0      43
          ... 
8226.0       1
12348.0      1
2925.0       1
22032.0      1
10872.0      1
Name: garage_score, Length: 641, dtype: int64

## Fireplace

In [34]:
# fireplace_score: element-wise product of fireplaces and fireplace_qu.
real_estate['fireplace_score'] = simple_interaction_feature(
    real_estate, ['fireplaces', 'fireplace_qu']
)

In [35]:
real_estate['fireplace_score'].value_counts()

0     1000
4      456
3      342
8       74
6       62
2       49
1       31
5       29
12       3
9        3
10       2
Name: fireplace_score, dtype: int64

## Basement

In [36]:
# basement_score: element-wise product of total_bsmt_sf, bsmt_qual and bsmt_cond.
real_estate['basement_score'] = simple_interaction_feature(
    real_estate,
    ['total_bsmt_sf', 'bsmt_qual', 'bsmt_cond'],
)

In [37]:
real_estate['basement_score'].value_counts()

0.0        55
7776.0     44
9072.0     20
8208.0     17
6048.0     16
           ..
4200.0      1
10632.0     1
7533.0      1
3180.0      1
9243.0      1
Name: basement_score, Length: 1156, dtype: int64

## Kitchen

In [38]:
# kitchen_score: element-wise product of kitchen_abvgr and kitchen_qual.
real_estate['kitchen_score'] = simple_interaction_feature(
    real_estate, ['kitchen_abvgr', 'kitchen_qual']
)

In [39]:
real_estate['kitchen_score'].value_counts()

3    965
4    809
5    151
6     79
2     41
8      3
0      2
9      1
Name: kitchen_score, dtype: int64

## Overall

In [40]:
# overall_score: element-wise product of overall_qual and overall_cond.
real_estate['overall_score'] = simple_interaction_feature(
    real_estate, ['overall_qual', 'overall_cond']
)

In [41]:
real_estate['overall_score'].value_counts()

35    440
30    399
40    265
25    212
42    125
36    124
45     81
20     75
48     48
24     46
49     34
28     31
16     27
56     24
50     21
15     21
12     12
63     11
18      8
9       6
54      5
32      5
72      4
6       4
8       4
3       3
10      3
4       3
90      2
5       2
64      2
21      2
1       1
27      1
Name: overall_score, dtype: int64

# Modeling

In [42]:
from sklearn.linear_model import LinearRegression

In [43]:
linreg = LinearRegression()

In [44]:
# Engineered interaction columns used as the model's inputs.
features = [
    'pool_score',
    'garage_score',
    'fireplace_score',
    'basement_score',
    'kitchen_score',
    'overall_score',
]

In [45]:
X = real_estate[features]

In [46]:
# Select the target as a 1-D Series. With a single-column DataFrame
# (double brackets) LinearRegression keeps the extra dimension and predict()
# returns an array of shape (n_samples, 1) instead of (n_samples,).
y = real_estate['saleprice']

In [47]:
linreg.fit(X,y)

LinearRegression()

# Scoring

In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [49]:
scores = cross_val_score(linreg, X, y, cv=5)

In [50]:
scores.mean()

0.7449649168471161

In [51]:
predicts = linreg.predict(X)

In [52]:
r2_score(real_estate['saleprice'],predicts)

0.7564316965542196

In [56]:
# Root-mean-squared error of the in-sample predictions. The `squared=False`
# flag of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the square root is taken explicitly; this gives the same value
# on every scikit-learn version.
np.sqrt(mean_squared_error(real_estate['saleprice'], predicts))

39106.70343626019

In [54]:
mean_squared_error(real_estate['saleprice'],predicts)

1529334253.6516044

# To Submission

In [28]:
test = pd.read_csv('../datasets/test.csv')

In [29]:
# Normalise the test headers: lower-case them and swap spaces for underscores.
test.columns = test.columns.str.lower().str.replace(' ', '_')

## Feature Engineering

In [30]:
# Columns handed to helper.mass_ordinal_to_numerical in the next cell.
ordinals = [
    'exter_qual',
    'exter_cond',
    'bsmt_qual',
    'bsmt_cond',
    'heating_qc',
    'kitchen_qual',
    'fireplace_qu',
    'garage_qual',
    'garage_cond',
    'pool_qc',
]

In [31]:
helper.mass_ordinal_to_numerical(test, ordinals)

### Pool

In [32]:
# pool_score on the test frame: pool_area * pool_qc, same recipe as for training.
test['pool_score'] = simple_interaction_feature(
    test, ['pool_area', 'pool_qc']
)

In [33]:
test['pool_score'].value_counts()

0.0       874
2560.0      1
2775.0      1
1332.0      1
720.0       1
Name: pool_score, dtype: int64

### Garage

In [34]:
# garage_score on the test frame: product of the four garage columns below.
test['garage_score'] = simple_interaction_feature(
    test,
    ['garage_area', 'garage_qual', 'garage_cars', 'garage_cond'],
)

In [35]:
test['garage_score'].value_counts()

0.0        45
10368.0    26
7920.0     24
8712.0     23
2160.0     20
           ..
1600.0      1
516.0       1
14310.0     1
16464.0     1
8190.0      1
Name: garage_score, Length: 428, dtype: int64

### Fireplace

In [36]:
# fireplace_score on the test frame: fireplaces * fireplace_qu.
test['fireplace_score'] = simple_interaction_feature(
    test, ['fireplaces', 'fireplace_qu']
)

In [37]:
test['fireplace_score'].value_counts()

0.0     422
4.0     180
3.0     158
8.0      38
6.0      33
2.0      15
1.0      15
5.0       8
10.0      3
12.0      3
9.0       2
15.0      1
Name: fireplace_score, dtype: int64

### Basement

In [38]:
# basement_score on the test frame: total_bsmt_sf * bsmt_qual * bsmt_cond.
test['basement_score'] = simple_interaction_feature(
    test,
    ['total_bsmt_sf', 'bsmt_qual', 'bsmt_cond'],
)

In [39]:
test['basement_score'].value_counts()

0.0        25
7776.0     18
6912.0      9
7020.0      8
9072.0      8
           ..
10980.0     1
9558.0      1
8541.0      1
13800.0     1
10272.0     1
Name: basement_score, Length: 610, dtype: int64

### Kitchen

In [40]:
# kitchen_score on the test frame: kitchen_abvgr * kitchen_qual.
test['kitchen_score'] = simple_interaction_feature(
    test, ['kitchen_abvgr', 'kitchen_qual']
)

In [41]:
test['kitchen_score'].value_counts()

3    409
4    355
5     53
6     36
2     20
8      2
0      1
1      1
9      1
Name: kitchen_score, dtype: int64

### Overall

In [47]:
# overall_score on the test frame: overall_qual * overall_cond.
test['overall_score'] = simple_interaction_feature(
    test, ['overall_qual', 'overall_cond']
)

In [48]:
test['overall_score'].value_counts()

35    194
30    178
40    105
25     88
42     57
36     52
20     44
45     33
48     23
24     21
28     15
12      9
49      9
18      7
15      6
50      6
56      5
72      5
16      4
10      4
63      2
32      2
9       2
4       1
54      1
6       1
60      1
5       1
64      1
2       1
Name: overall_score, dtype: int64

## Predict

In [50]:
# predict() returns an (n, 1) array when the model was fitted on a 2-D
# target; np.ravel hands the DataFrame column 1-D values regardless of how
# the target was shaped at fit time.
test['SalePrice'] = np.ravel(linreg.predict(test[features]))

In [53]:
# Keep only the identifier and the predicted price for the export.
submission = test[['id', 'SalePrice']]

In [54]:
# Relabel the two columns as 'Id' and 'SalePrice' for the exported CSV header.
submission.columns = ['Id', 'SalePrice']

In [57]:
# Write the submission without the DataFrame index column.
submission.to_csv('../submissions/iter2.csv', index=False)