## Attempts Will Be Made

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

X_train = pd.read_csv('../src/Xtrain.csv')
y_train = pd.read_csv('../src/ytrain.csv')

X_test = pd.read_csv('../src/Xtest.csv')

In [2]:
X_train.shape, y_train.shape, X_test.shape


((16197, 19), (16197, 1), (5400, 19))

In [3]:
X_train.columns, y_train.columns

(Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
        'sqft_living15', 'sqft_lot15'],
       dtype='object'),
 Index(['price'], dtype='object'))

In [4]:
X_train.dtypes, X_train.isna().sum()

(date              object
 bedrooms           int64
 bathrooms        float64
 sqft_living        int64
 sqft_lot           int64
 floors           float64
 waterfront       float64
 view             float64
 condition          int64
 grade              int64
 sqft_above         int64
 sqft_basement     object
 yr_built           int64
 yr_renovated     float64
 zipcode            int64
 lat              float64
 long             float64
 sqft_living15      int64
 sqft_lot15         int64
 dtype: object,
 date                0
 bedrooms            0
 bathrooms           0
 sqft_living         0
 sqft_lot            0
 floors              0
 waterfront       1756
 view               49
 condition           0
 grade               0
 sqft_above          0
 sqft_basement       0
 yr_built            0
 yr_renovated     2879
 zipcode             0
 lat                 0
 long                0
 sqft_living15       0
 sqft_lot15          0
 dtype: int64)

In [5]:
##Joining dataframes in order to drop values, then split them back up

training_df = pd.concat([y_train, X_train], axis = 1)
training_df

Unnamed: 0,price,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,529000.0,3/4/2015,3,2.50,1880,4499,2.0,0.0,0.0,3,8,1880,0.0,1993,0.0,98029,47.5664,-121.999,2130,5114
1,253000.0,10/7/2014,3,2.50,2020,6564,1.0,0.0,0.0,3,7,1310,710.0,1994,0.0,98042,47.3545,-122.158,1710,5151
2,745000.0,1/16/2015,5,4.00,4720,493534,2.0,0.0,0.0,5,9,3960,760.0,1975,0.0,98027,47.4536,-122.009,2160,219542
3,545000.0,3/30/2015,2,2.00,1430,3880,1.0,0.0,0.0,4,7,1430,0.0,1949,0.0,98117,47.6844,-122.392,1430,3880
4,390000.0,10/14/2014,3,2.25,2270,32112,1.0,0.0,0.0,4,8,1740,530.0,1980,0.0,98042,47.3451,-122.094,2310,41606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16192,440000.0,9/15/2014,3,2.50,2230,5800,2.0,0.0,0.0,3,7,2230,0.0,2004,0.0,98065,47.5308,-121.847,2230,6088
16193,572000.0,10/2/2014,4,2.75,2770,3852,2.0,0.0,0.0,3,8,2770,0.0,2014,,98178,47.5001,-122.232,1810,5641
16194,299800.0,7/21/2014,4,1.50,1530,9000,1.0,0.0,0.0,4,6,1530,0.0,1976,0.0,98014,47.6492,-121.908,1520,8500
16195,245000.0,6/20/2014,1,0.75,380,15000,1.0,0.0,0.0,3,5,380,0.0,1963,0.0,98168,47.4810,-122.323,1170,15000


In [6]:
##Calculated the percent of waterfront homes in the existing data to determine what to do with NaN vals
##Because they accounted for less than 1 percent of properties, I replaced NaN values with 0. 

len(training_df[training_df['waterfront'] == 1.0]), len(training_df[training_df['waterfront'] == 0.0])
percent_waterfront = 111/14330
percent_waterfront, training_df['waterfront'].isna().sum()

X_test.isna().sum()

date               0
bedrooms           0
bathrooms          0
sqft_living        0
sqft_lot           0
floors             0
waterfront       620
view              14
condition          0
grade              0
sqft_above         0
sqft_basement      0
yr_built           0
yr_renovated     963
zipcode            0
lat                0
long               0
sqft_living15      0
sqft_lot15         0
dtype: int64

In [7]:
training_df['waterfront'].replace(np.nan, 0, inplace = True)

training_df.isna().sum()
##

X_test['waterfront'].replace(np.nan, 0, inplace = True)

In [8]:
##Did the same thing with renovations but turned it into a categorical variable - Only 550 or so had reported renovations,
## So reported renovations became 1, unreported or reported as 0.0 all became 0

training_df['yr_renovated'].replace(np.nan, 0, inplace = True)

training_df['renovated'] = np.where((training_df['yr_renovated'] > 0.0), 1, 0)

training_df['renovated'].sum()/len(training_df['renovated'])


##

X_test['yr_renovated'].replace(np.nan, 0, inplace = True)
X_test['renovated'] = np.where((X_test['yr_renovated'] > 0), 1, 0)

In [9]:
#training_df.head()

In [10]:
## Converted str dates to datetime objects

x_date = pd.DataFrame(pd.to_datetime(training_df['date'], format='%m/%d/%Y'))

x_test_date = pd.DataFrame(pd.to_datetime(X_test['date'], format='%m/%d/%Y'))

In [11]:
training_df['date'] = x_date

X_test['date'] = x_test_date

In [12]:
training_df.view.replace(np.nan, 0, inplace = True)
##
X_test.view.replace(np.nan, 0, inplace = True)

In [13]:
training_df.drop(columns = ['yr_renovated'], axis = 1, inplace = True)
##
X_test.drop(columns = ['yr_renovated'], axis = 1, inplace = True)

In [14]:
training_df['quarter'] = training_df['date'].dt.quarter
##
X_test['quarter'] = X_test['date'].dt.quarter

In [15]:
training_df.info(), X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16197 entries, 0 to 16196
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   price          16197 non-null  float64       
 1   date           16197 non-null  datetime64[ns]
 2   bedrooms       16197 non-null  int64         
 3   bathrooms      16197 non-null  float64       
 4   sqft_living    16197 non-null  int64         
 5   sqft_lot       16197 non-null  int64         
 6   floors         16197 non-null  float64       
 7   waterfront     16197 non-null  float64       
 8   view           16197 non-null  float64       
 9   condition      16197 non-null  int64         
 10  grade          16197 non-null  int64         
 11  sqft_above     16197 non-null  int64         
 12  sqft_basement  16197 non-null  object        
 13  yr_built       16197 non-null  int64         
 14  zipcode        16197 non-null  int64         
 15  lat            1619

(None, None)

In [16]:
## Plotted distributions for continuous variables 
#for cat in x_cont:
#    fig, ax = plt.subplots()
#    sns.histplot(training_df[cat])

##Plotted categorical variables

#for cat in x_cat:
#    fig, ax = plt.subplots()
#    sns.histplot(training_df[cat])

In [17]:
##Selecting the features I want to use
x_date = ['date']
x_cat = ['bedrooms', 'bathrooms', 'waterfront', 'condition', 'grade', 'zipcode', 'renovated', 'quarter']
x_cont = ['sqft_living', 'sqft_lot', 'sqft_living15']
y_cont = ['price']


In [18]:
#sns.histplot(training_df['price']);

In [19]:
##Splitting the dataframe back up into X_train and y_train

X_train = pd.concat([training_df[x_cont], training_df[x_cat]], axis = 1)
y_train = pd.DataFrame(training_df['price'])

X_test = pd.concat([X_test[x_cont], X_test[x_cat]], axis = 1)

len(X_test.columns), len(X_train.columns)

(11, 11)

In [20]:
##OHE on selected categoricals
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
x_cat_enc = ohe.fit_transform(X_train[x_cat])

x_cat_enc_df = pd.DataFrame(x_cat_enc)


## OHE on X_test
x_test_enc = ohe.transform(X_test[x_cat])

x_test_enc_df = pd.DataFrame(x_test_enc)

len(x_test_enc_df.columns), len(x_cat_enc_df.columns)

(131, 131)

In [21]:
#log transform on sqft variables
log_sqft_living = np.log(X_train['sqft_living'])
log_sqft_lot = np.log(X_train['sqft_lot'])
log_sqft_living15 = np.log(X_train['sqft_living15'])

log_y_train = np.array(np.log(y_train['price']))



#standardizing lat and long to coordinates
#small_x = np.array(np.cos(X_train['lat']) * np.cos(X_train['long']))
#small_y = np.array(pd.DataFrame(np.cos(X_train['lat']) * np.sin(X_train['long'])))
#small_z = np.array(pd.DataFrame(np.sin(X_train['lat'])))



##Power Transform on continuous variables
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer()

power_sqft_living = power.fit_transform(np.array(log_sqft_living).reshape(-1,1)).flatten()
power_sqft_lot = power.fit_transform(np.array(log_sqft_lot).reshape(-1,1)).flatten()
power_sqft_living15 = power.fit_transform(np.array(log_sqft_living15).reshape(-1,1)).flatten()



##Assembly of final DF
final_df = pd.DataFrame([])

final_df['price'] = log_y_train
final_df['sqft_living'] = power_sqft_living
final_df['sqft_lot'] = power_sqft_lot
final_df['sqft_living15'] = power_sqft_living15
#final_df['x'] = small_x
#final_df['y'] = small_y
#final_df['z'] = small_z

final_df = pd.concat([final_df, x_cat_enc_df], axis = 1)


##X_test transforms
tlog_sqft_living = np.log(X_test['sqft_living'])
tlog_sqft_lot = np.log(X_test['sqft_lot'])
tlog_sqft_living15 = np.log(X_test['sqft_living15'])


tpower_sqft_living = power.fit_transform(np.array(tlog_sqft_living).reshape(-1,1)).flatten()
tpower_sqft_lot = power.fit_transform(np.array(tlog_sqft_lot).reshape(-1,1)).flatten()
tpower_sqft_living15 = power.fit_transform(np.array(tlog_sqft_living15).reshape(-1,1)).flatten()

X_test['sqft_living'] = tpower_sqft_living
X_test['sqft_lot'] = tpower_sqft_lot
X_test['sqft_living15'] = tpower_sqft_living15
X_test.drop(columns = x_cat, inplace = True)

X_test = pd.concat([X_test, x_test_enc_df], axis = 1)


In [25]:
X_train = final_df.drop(columns = 'price', axis = 1)
y_train = pd.DataFrame(final_df['price'])


Unnamed: 0,sqft_living,sqft_lot,sqft_living15,0,1,2,3,4,5,6,...,121,122,123,124,125,126,127,128,129,130
0,-0.036255,-0.614087,0.409697,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.133192,-0.152342,-0.260119,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2.155565,3.505455,0.451269,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.679181,-0.804264,-0.830224,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.409042,1.482668,0.649084,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16192,0.366966,-0.300013,0.545596,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
16193,0.881296,-0.813710,-0.083662,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
16194,-0.520719,0.209509,-0.633037,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
16195,-3.733445,0.753849,-1.498335,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [26]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [27]:
# instantiate a kfold object and specify number of splits

kf = KFold(n_splits = 8, shuffle = False) ## 4 to 1 ratio of train to test

# designate 5 folds for each split
val_r2 = []
# loop through each fold
kf.get_n_splits(X_train)
print (kf)

for train_ind, test_ind in kf.split(X_train, y_train):
    # logistic regression performed for each fold
    lr = LinearRegression()
    # instantiate a scaler for each fold
    ss = StandardScaler()
    # using the indices, create the split associated with each loop
    X_tt = X_train.iloc[train_ind]
    y_tt =  y_train.iloc[train_ind]
    X_ts =  X_train.iloc[test_ind]
    y_ts = y_train.iloc[test_ind]
    # fit transform the scaler on tt
    X_tt_scaled = ss.fit_transform(X_tt)
    # fit model on tt
    lr.fit(X_tt_scaled, y_tt)
    # score both training and validation
    
    X_ts_scaled = ss.transform(X_ts)
    val_r2.append(lr.score(X_ts_scaled, y_ts))

KFold(n_splits=8, random_state=None, shuffle=False)


In [28]:
val_r2

[0.873918723324306,
 -9.0283578266401e+21,
 0.8722674112480374,
 0.8826836335631796,
 0.8798513451108505,
 0.8738637509266423,
 0.8800455941241699,
 0.8741439672676341]

In [29]:
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.8798108126206002

In [31]:
first_preds_log = lr.predict(X_test)

In [65]:
first_preds = np.exp(first_preds_log)
first_preds

  first_preds = np.exp(first_preds_log)


array([[ 208621.26478469],
       [ 485988.74047303],
       [ 397036.96571575],
       ...,
       [1408778.59543864],
       [ 958331.68040576],
       [ 236721.9248688 ]])

In [37]:
import numpy as np
y_test_fake = np.full((5400,1), 0)

In [41]:
from sklearn.metrics import r2_score

# fake predictions using the mean of y_train.
first_preds = np.full((5400,1), np.mean(y_train))

r2_score(first_preds, y_test_fake)

-3.3730650215815687e+30

In [42]:
np.savetxt('Svitlana_Jamie_FirstPreds.csv', first_preds, delimiter=',')

In [59]:
# gradient boosting for classification in scikit-learn
from sklearn.svm import SVR

# define dataset
X = X_train
y = np.array(y_train).flatten()

# fit the model on the whole dataset
model = SVR()
model.fit(X, y)

# make a single prediction

second_preds_log = model.predict(X_test)


In [66]:
second_preds = np.exp(second_preds_log)
second_preds

array([ 203615.11479163,  455275.52840028,  371976.47983475, ...,
       1719372.82900262, 1039211.71260435,  252654.59004289])

In [67]:
np.savetxt('Svitlana_Jamie_SecondPreds.csv', second_preds, delimiter=',')

In [61]:
from sklearn.linear_model import SGDRegressor

X = X_train
y = np.array(y_train).flatten()

grad = SGDRegressor()
grad.fit(X, y)

third_preds_log = grad.predict(X_test)

In [68]:
third_preds = np.exp(third_preds_log)
third_preds

array([ 211357.56523732,  498256.53443616,  388178.40906444, ...,
       1374005.11284913,  778794.11815575,  259790.53516605])

In [69]:
np.savetxt('Svitlana_Jamie_ThirdPreds.csv', third_preds, delimiter=',')