## Simple Kaggle submission
You may use this notebook to test making a Kaggle submission.

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
# Load train.csv into `df` and test.csv into `kaggle_sub` (in that order)
# from the local datasets/ folder.

df, kaggle_sub = [
    pd.read_csv(csv_path)
    for csv_path in ('datasets/train.csv', 'datasets/test.csv')
]

In [5]:
# Preview the first two rows of the training data.
df.head(n=2)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000


In [6]:
# Normalise the headers of both frames the same way: lower-case every column
# name and swap spaces for underscores (e.g. 'Lot Area' -> 'lot_area').

df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
kaggle_sub.columns = kaggle_sub.columns.str.lower().str.replace(' ', '_', regex=False)

### Modeling

In [7]:
# Feature set picked purely because these columns can be used without any
# EDA first; the resulting model is not meant to be a good one.

features = [
    'lot_area',
    'fireplaces',
    '1st_flr_sf',
    'full_bath',
    'half_bath',
    'totrms_abvgrd',
]

# Design matrix and target taken from the training frame.
X = df.loc[:, features]
y = df.loc[:, 'saleprice']

In [8]:
# Hold out part of the training data for validation. No test_size is given,
# so scikit-learn's default split is used; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# Fit a linear regression on the training split, then report the score
# (R^2) on the training and held-out splits, in that order.

lr = LinearRegression().fit(X_train, y_train)
(lr.score(X_train, y_train), lr.score(X_test, y_test))

(0.586908421365044, 0.6437361531661305)

### Predictions

In [10]:
# Predict on the Kaggle test set using exactly the columns the model was
# fitted on, and keep the predictions for the submission file.

preds = lr.predict(kaggle_sub.loc[:, features])

Remember we got an error because our column names were different. Any change we make to the features we modeled on must also be made to the data from test.csv.

Here we had simply lower-cased the column names and replaced spaces with `_`, but even superficial changes like that matter.

In [11]:
# Give the test.csv frame the same header clean-up as the training frame
# (lower-case, spaces -> underscores). Re-applying it to names that are
# already cleaned leaves them unchanged.

kaggle_sub.columns = kaggle_sub.columns.str.lower().str.replace(' ', '_', regex=False)

In [12]:
# Attach the model's predictions to the test frame as a 'saleprice' column.
# `preds` holds the values returned by lr.predict on kaggle_sub[features].
# This modifies kaggle_sub in place.

kaggle_sub['saleprice'] = preds

In [13]:
# Sanity check: 'saleprice' should now appear as the last column.

kaggle_sub.head(n=2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,159449.713528
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,251304.668573


## Save out

In [14]:
# Keep only the two columns that go into the submission file: the id and
# the predicted sale price.
# NOTE(review): the headers stay lower-case here ('id', 'saleprice') - confirm
# the competition accepts that casing.

kaggle = kaggle_sub.loc[:, ['id', 'saleprice']]

In [16]:
# Write the submission file; index=False leaves the row index out of the CSV.

kaggle.to_csv(path_or_buf='datasets/kaggle_mlr.csv', index=False)