# Imports

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

%pylab inline
%config InlineBackend.figure_formats = ['retina']
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


# Load Data

In [12]:
df = pd.read_csv('/Users/johnmetzger/Desktop/Coding/Projects/Project2/df.csv')

# Take a look at the datatypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2878 entries, 0 to 2877
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             2878 non-null   int64
 1   hours             2878 non-null   int64
 2   day               2878 non-null   int64
 3   title_char_count  2878 non-null   int64
 4   has_pics          2878 non-null   int64
dtypes: int64(5)
memory usage: 112.5 KB


---
QC check on dataframe

In [13]:
df

Unnamed: 0,price,hours,day,title_char_count,has_pics
0,65,18,19,37,0
1,700,18,19,46,0
2,75,18,19,28,1
3,10,18,19,16,1
4,775,18,19,27,1
...,...,...,...,...,...
2873,40,9,17,66,1
2874,900,9,17,17,1
2875,250,8,17,43,1
2876,1800,8,17,32,1


# Initial 80-20 split

* This will set aside 80% of the data for training and validation in various models. 
* The other 20% of the data is set aside as 'test' data and will only be used in the final steps to test a chosen model.
* Use 'Train' data as the input for the validation step.

In [14]:
X = df[['hours','day','title_char_count','has_pics']]
y = df['price']
X_core, X_test, y_core, y_test =train_test_split(X, y, test_size=0.2, 
                                                 random_state=42)

X_train, X_val, y_train, y_val =train_test_split(X_core, y_core, test_size=0.2, 
                                                 random_state=42)

In [15]:
def split_and_validate(X, y):
    '''
    For a set of features and target X, y, perform a 80/20 train/val split, 
    fit and validate a linear regression model, and report results
    '''
    # Train-test split makes a random subsample
    # Then performs train-val split
    X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=0.2, random_state=42)
    
    ## Random state is initializing from a random state.  This
    ## helps make results reproducible.
    
    ### Test size = 0.2 means reserve 20% of data for testing
    
    #### DO ONE HOT ENCODING BETWEEN THIS STEP
        # But only do it on specific columns 
    
    # Fit linear regression to training data
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    # Score fit model on validation data
    val_score = lr_model.score(X_val, y_val)
    
    # Report results
    print('\nValidation R^2 score was:', val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}')

---
The Validation score here is the r^2 for the "baseline feature set" (a la 'feature_engineering' lesson).

In [16]:
split_and_validate(X_train, y_train)


Validation R^2 score was: -0.0081870876555592
Feature coefficient results: 

hours : 3.54
day : -10.48
title_char_count : 1.33
has_pics : 177.43


# Polynomial Model
**NOTE**
* Make sure to train on same training set the linear model used

## Training score

* In general, don't mess with the 'degree' much.  2 is best for now.

In [17]:
m = LinearRegression()
p = PolynomialFeatures(degree=2,interaction_only=True)
p.fit(X_train)
x_train_poly = p.transform(X_train)

m.fit(x_train_poly,y_train)
m.score(x_train_poly,y_train)
print('Training R^2 score: ', m.score(x_train_poly,y_train))
print('Training coef: ', m.coef_)

Training R^2 score:  0.0030017034863207837
Training coef:  [  0.         209.93179737 127.54224666 -20.09956244 506.30265369
 -12.66433904   0.59616572   2.27264402   0.6166668  -30.71195179
   2.97589299]


## Validation score

**NOTE:** Do not re-fit poly

In [18]:
x_val_poly = p.transform(X_val)
m.score(x_val_poly,y_val)

0.002672751067739232

# Validation comparison 
- compare validation scores (t^2) 
- choose the best value
- take the 'X_test' data and make sure you do any transformations needed to match the chosen model features (e.g., polynomial transformation)
    - then you can score it with the linear regression to get an R^2.

---
For this study, the polynomial model is a better fit so I need to transform the X_test
and then score it.

## Polynomial is a better fit so I need to transform the X_test

In [19]:
x_val_poly_test = p.transform(X_test)
m.score(x_val_poly_test, y_test)

0.004119661954509524

Check feature names to see what operations are being done.

In [20]:
p.get_feature_names()

['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x1 x2',
 'x1 x3',
 'x2 x3']

# Check to see if polynomial model gives a reasonable (root mean squared error) RMSE and mean absolute error (MAE)

You need to make prediction data first

In [21]:
lr = LinearRegression()
lr.fit(x_val_poly_test, y_test)
y_pred = lr.predict(x_val_poly_test)
y_pred

array([ 600.33772628,  561.48262471,  872.71044794,  162.30299401,
         54.97363867,  695.42456456,  857.06571441, 1083.25985663,
        946.31490046,  567.53697778,  507.88859358,  703.38633658,
        592.50019257,  747.72783033,  595.92128248,  635.76109851,
        596.92162194,  567.73579192,  585.72159561,  765.64547463,
        866.68943826,  689.51168943,  683.5988143 ,  615.86794849,
        671.85799768,  641.84004432,  586.00644733,  705.78705969,
        482.18341605,  755.9019156 ,  747.56365247,  983.83465776,
       1078.1844475 ,  612.95552897, 1055.71744783,  712.02462624,
       1411.56047724,  565.24651078, 1305.16354835, 1414.41618479,
        691.72589266,  680.57448248,  585.22550284, 1129.55294739,
        749.89354027, 1069.71146379, 1013.27224462,  585.69451068,
        804.25338302,  742.72756562, 1003.78851553,  790.20376619,
        666.732511  ,  788.00184116, 1095.58021687,  744.86256502,
        637.47274089,  938.56632474,  618.16427704,  771.61764

Check predicted and test sizes using len().  They should match.

In [22]:
len(y_pred)

576

In [23]:
len(y_test)

576

# Mean Absolute Error

This gives you the error in units of your test variable, which is useful for explaining how useful your model is for an intended purpose.

In [25]:
mean_absolute_error(y_test,y_pred)

781.7045453489694

# ROOT MEAN SQUARED ERROR

In [None]:
Similar to MAE, this can be used to understand the usefulness of a model for a given task by using the same units at the target variable. 

In [27]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse             

2039.105334233029

In [11]:
p.get_feature_names()

['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x1 x2',
 'x1 x3',
 'x2 x3']

# Mean and Median Prices

In [46]:
mean(df['price'])

685.1567060458652

In [47]:
median(df['price'])

200.0