### Import Packages 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
%matplotlib inline

### Import Data

In [2]:
# Import csv files
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Transform to dataframes
df_test = pd.DataFrame(test)
df_train = pd.DataFrame(train)

# Take a look
df_train.head()

Unnamed: 0,id,host_id,host_name,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,33400953,3400827,Nicole,40.79595,-73.97485,Entire home/apt,3,2,2019-04-21,0.74,1,0,380
1,16965705,3483600,Joshua,40.83498,-73.94214,Private room,2,0,,,1,0,63
2,34819184,156850005,Sharon,40.756,-73.87756,Private room,1,1,2019-05-29,0.73,2,355,77
3,22579551,3231509,Annamaria,40.71775,-74.00607,Entire home/apt,14,13,2019-05-08,0.77,4,365,450
4,14571451,32454701,Maria,40.72824,-73.97824,Entire home/apt,1,198,2019-07-06,5.63,1,21,129


### Clean Data

In [3]:
# Drop id, host_id, host_name, calculated_host_listings_count, availability_365 that is == 0

df_test = df_test.drop(['id','host_id','host_name','calculated_host_listings_count','last_review'], axis=1)
df_train = df_train.drop(['id', 'host_id','host_name', 'calculated_host_listings_count','last_review'], axis=1)

#df_test = df_test.dropna(subset=['reviews_per_month'])
df_train = df_train.dropna(subset=['reviews_per_month'])

#df_test = df_test[df_test.availability_365 != 0]
df_train = df_train[df_train.availability_365 != 0]

In [4]:
# Check to see what types of features/variables we're working with
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16685 entries, 2 to 31290
Data columns (total 8 columns):
latitude             16685 non-null float64
longitude            16685 non-null float64
room_type            16685 non-null object
minimum_nights       16685 non-null int64
number_of_reviews    16685 non-null int64
reviews_per_month    16685 non-null float64
availability_365     16685 non-null int64
price                16685 non-null int64
dtypes: float64(3), int64(4), object(1)
memory usage: 1.1+ MB


In [5]:
# Change availability_365 into a percentage of days out of 365

#df_test.availability_365 = df_test.availability_365/365
#df_train.availability_365 = df_train.availability_365/365

In [6]:
#df_test.availability_365.describe()

In [7]:
# One-hot encode room_type

# Let's take a look at the categories present
#print(df_test.room_type.unique())

# Label encode    
room_type_series_test = pd.Series(df_test.room_type)
room_type_series_train = pd.Series(df_train.room_type)

# Change type to category
cat_room_type_series_test = room_type_series_test.astype('category')
cat_room_type_series_train = room_type_series_train.astype('category')

# Create dummy variables & drop first to prevent multicollinearity

room_type_dummies_test = pd.get_dummies(cat_room_type_series_test) #, drop_first=True
room_type_dummies_train = pd.get_dummies(cat_room_type_series_train) #, drop_first=True

# Remove original column from data set
df_test = df_test.drop(['room_type'], axis=1)
df_train = df_train.drop(['room_type'], axis=1)
                                            
# Add new columns in
df_test = pd.concat([df_test, room_type_dummies_test], axis=1)
df_train = pd.concat([df_train, room_type_dummies_train], axis=1) 

# Remove original column from data set
df_test = df_test.drop(['Shared room'], axis=1)
df_train = df_train.drop(['Shared room'], axis=1)
                                            
# Take a look
df_train.head()                                           

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,availability_365,price,Entire home/apt,Private room
2,40.756,-73.87756,1,1,0.73,355,77,0,1
3,40.71775,-74.00607,14,13,0.77,365,450,1,0
4,40.72824,-73.97824,1,198,5.63,21,129,1,0
9,40.74846,-73.97611,2,5,0.99,62,263,1,0
13,40.69474,-73.83038,1,7,0.15,344,55,0,1


In [8]:
# fig = plt.figure(figsize = (8,8))
# ax = fig.gca()
# df_test.hist(ax = ax);

In [9]:
# features = ['minimum_nights', 'number_of_reviews', 'reviews_per_month']

# for feature in features:
#     print(f"{feature}: \n")
#     print(df_test[feature].describe())
#     print(f"# of zeros: {df_test[feature].isin([0]).sum()}")
#     print(f"\n")

In [10]:
# Bin minimum_nights, number_of_reviews (because they contain zeros)

bins_min_nights = [0,1,3,5,500]
bins_num_reviews = [0,2,10,39,458]

# Bin data & return dummies
def binned_dummies(data, features, bins):
    data_bins = pd.cut(data, bins)
    data_bins = data_bins.cat.as_unordered()
    dummies = pd.get_dummies(data_bins, prefix = features, drop_first=True)
    return dummies

dummies_min_nights_test = binned_dummies(df_test.minimum_nights,'minimum_nights', bins_min_nights)
dummies_min_nights_train = binned_dummies(df_train.minimum_nights,'minimum_nights', bins_min_nights)

dummies_num_reviews_test = binned_dummies(df_test.number_of_reviews,'number_of_reviews', bins_num_reviews)
dummies_num_reviews_train = binned_dummies(df_train.number_of_reviews,'number_of_reviews', bins_num_reviews)

# Remove original column from data set
df_test = df_test.drop(['minimum_nights', 'number_of_reviews'], axis=1)
df_train = df_train.drop(['minimum_nights', 'number_of_reviews'], axis=1)
                                            
# Add new columns in
df_test = pd.concat([df_test, dummies_min_nights_test, dummies_num_reviews_test], axis=1)
df_train = pd.concat([df_train, dummies_min_nights_train, dummies_num_reviews_train], axis=1)

In [11]:
# Log Transform 

#features = ['availability_365','reviews_per_month']
features = ['reviews_per_month']
#features = ['reviews_per_month','minimum_nights', 'number_of_reviews',]

# Continuous variables
df_test_features = df_test[features]
df_train_features = df_train[features]

# Add '_log' to continuous variable column names
log_names1 = [f'{column}_log' for column in df_test_features.columns]
log_names2 = [f'{column}_log' for column in df_train_features.columns]

# Log transform continuous variables
# Test
df_test_features_log = np.log(df_test_features)
df_test_features_log.columns = log_names1
# Train
df_train_features_log = np.log(df_train_features)
df_train_features_log.columns = log_names2

### Normalize (subract mean and divide by std)

# Define function to normalize
def normalize(feature):
    return (feature - feature.mean()) / feature.std()

# Apply function to normalize
df_test_features_log_norm = df_test_features_log.apply(normalize)
df_train_features_log_norm = df_train_features_log.apply(normalize)

# Remove original column from data set
df_test = df_test.drop(features, axis=1)
df_train = df_train.drop(features, axis=1)

# Add new columns in
df_test = pd.concat([df_test, df_test_features_log_norm], axis=1)
df_train = pd.concat([df_train, df_train_features_log_norm], axis=1)

In [12]:
df_train.head()

Unnamed: 0,latitude,longitude,availability_365,price,Entire home/apt,Private room,"minimum_nights_(1, 3]","minimum_nights_(3, 5]","minimum_nights_(5, 500]","number_of_reviews_(2, 10]","number_of_reviews_(10, 39]","number_of_reviews_(39, 458]",reviews_per_month_log
2,40.756,-73.87756,355,77,0,1,0,0,0,0,0,0,-0.269363
3,40.71775,-74.00607,365,450,1,0,0,0,1,0,1,0,-0.226242
4,40.72824,-73.97824,21,129,1,0,0,0,0,0,0,1,1.381898
9,40.74846,-73.97611,62,263,1,0,1,0,0,1,0,0,-0.023099
13,40.69474,-73.83038,344,55,0,1,0,0,0,1,0,0,-1.548463


In [13]:
abs(df_train.corr()) > 0.75

Unnamed: 0,latitude,longitude,availability_365,price,Entire home/apt,Private room,"minimum_nights_(1, 3]","minimum_nights_(3, 5]","minimum_nights_(5, 500]","number_of_reviews_(2, 10]","number_of_reviews_(10, 39]","number_of_reviews_(39, 458]",reviews_per_month_log
latitude,True,False,False,False,False,False,False,False,False,False,False,False,False
longitude,False,True,False,False,False,False,False,False,False,False,False,False,False
availability_365,False,False,True,False,False,False,False,False,False,False,False,False,False
price,False,False,False,True,False,False,False,False,False,False,False,False,False
Entire home/apt,False,False,False,False,True,True,False,False,False,False,False,False,False
Private room,False,False,False,False,True,True,False,False,False,False,False,False,False
"minimum_nights_(1, 3]",False,False,False,False,False,False,True,False,False,False,False,False,False
"minimum_nights_(3, 5]",False,False,False,False,False,False,False,True,False,False,False,False,False
"minimum_nights_(5, 500]",False,False,False,False,False,False,False,False,True,False,False,False,False
"number_of_reviews_(2, 10]",False,False,False,False,False,False,False,False,False,True,False,False,False


In [15]:
X_train = df_train.drop(columns=['price'])
X_test = df_test

y_train = df_train.price

In [16]:
X_train = X_train.loc[:,~X_train.columns.duplicated()]
X_test = X_test.loc[:,~X_test.columns.duplicated()]

#y_train = y_train.loc[:,~y_train.columns.duplicated()]

In [17]:
X_train = X_train.drop(columns=['minimum_nights_(1, 3]','minimum_nights_(3, 5]'])
X_test = X_test.drop(columns=['minimum_nights_(1, 3]','minimum_nights_(3, 5]'])

In [18]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

baseline_R2 = np.mean(cross_val_score(linreg, X_train, y_train, scoring='r2', cv=crossvalidation))


from itertools import combinations

# code to find top interactions by R^2 value here

# Use combinations from itertools to create all possible combinations of two features
feat_combinations = combinations(X_train.columns, 2)

# Empty list to fill for interactons values
interactions = []

# for i, (feature1,feature2) in feature_combinations:
for i, (a, b) in enumerate(feat_combinations):
    # fill interatctions list with feature a * feature b
    X_train['interaction'] = X_train[a] * X_train[b]
    R2 = np.mean(cross_val_score(linreg, X_train, y_train, scoring='r2', cv=crossvalidation))
    if R2 > baseline_R2:
        interactions.append((a, b, round(R2,5)))
            
print('Top 3 interactions: %s' %sorted(interactions, key=lambda inter: inter[2], reverse=True)[:3])

Top 3 interactions: [('latitude', 'longitude', 0.1829), ('availability_365', 'Entire home/apt', 0.17356), ('availability_365', 'Private room', 0.17264)]


In [20]:
X_train.drop(['interaction'],axis=1)

Unnamed: 0,latitude,longitude,availability_365,Entire home/apt,Private room,"minimum_nights_(5, 500]","number_of_reviews_(2, 10]","number_of_reviews_(10, 39]","number_of_reviews_(39, 458]",reviews_per_month_log
2,40.75600,-73.87756,355,0,1,0,0,0,0,-0.269363
3,40.71775,-74.00607,365,1,0,1,0,1,0,-0.226242
4,40.72824,-73.97824,21,1,0,0,0,0,1,1.381898
9,40.74846,-73.97611,62,1,0,0,1,0,0,-0.023099
13,40.69474,-73.83038,344,0,1,0,1,0,0,-1.548463
...,...,...,...,...,...,...,...,...,...,...
31278,40.84024,-73.85836,54,0,1,0,0,1,0,-0.047972
31280,40.73309,-74.00482,3,1,0,0,0,0,1,1.033796
31282,40.76383,-73.96870,7,1,0,0,1,0,0,0.772785
31284,40.69348,-73.95927,298,0,1,0,0,1,0,-1.548463


In [21]:
# code here
linreg = LinearRegression()
crossval = KFold(n_splits=10, shuffle=True, random_state=1)
final_train = X_train.copy()
final_test = X_test.copy()

final_train['lat*lon'] = final_train['latitude'] * final_train['longitude']
final_train['avail_365*entire_home'] = final_train['availability_365'] * final_train['Entire home/apt']

final_test['lat*lon'] = final_test['latitude'] * final_test['longitude']
final_test['avail_365*entire_home'] = final_test['availability_365'] * final_test['Entire home/apt']

final_model_R2 = np.mean(cross_val_score(linreg, final_train, y_train, scoring='r2', cv=crossval))

final_model_R2 # Same as the answer above for this interaction! 

0.18616901943251113

In [22]:
import statsmodels.api as sm
X_int = sm.add_constant(final_train)
model = sm.OLS(y_train,X_int).fit()
model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,price,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.138
Method:,Least Squares,F-statistic:,206.5
Date:,"Sun, 26 Apr 2020",Prob (F-statistic):,0.0
Time:,23:37:15,Log-Likelihood:,-110820.0
No. Observations:,16685,AIC:,221700.0
Df Residuals:,16671,BIC:,221800.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.803e+07,1.32e+06,13.654,0.000,1.54e+07,2.06e+07
latitude,-4.44e+05,3.25e+04,-13.683,0.000,-5.08e+05,-3.8e+05
longitude,2.44e+05,1.79e+04,13.660,0.000,2.09e+05,2.79e+05
availability_365,0.0677,0.017,4.088,0.000,0.035,0.100
Entire home/apt,113.4422,10.149,11.177,0.000,93.548,133.336
Private room,32.2123,9.209,3.498,0.000,14.162,50.262
"minimum_nights_(5, 500]",-43.2465,4.184,-10.336,0.000,-51.448,-35.045
"number_of_reviews_(2, 10]",-13.7679,4.756,-2.895,0.004,-23.090,-4.446
"number_of_reviews_(10, 39]",-25.0895,4.936,-5.083,0.000,-34.765,-15.414

0,1,2,3
Omnibus:,38287.069,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,486542040.04
Skew:,22.059,Prob(JB):,0.0
Kurtosis:,838.407,Cond. No.,2780000000.0


In [23]:
final_train = final_train.drop(['interaction'],axis=1)

In [24]:
final_test.head()

Unnamed: 0,latitude,longitude,availability_365,Entire home/apt,Private room,"minimum_nights_(5, 500]","number_of_reviews_(2, 10]","number_of_reviews_(10, 39]","number_of_reviews_(39, 458]",reviews_per_month_log,lat*lon,avail_365*entire_home
0,40.68742,-73.91628,341,1,0,0,0,1,0,0.583681,-3007.462729,341
1,40.70865,-73.96673,333,1,0,1,1,0,0,1.152836,-3011.085723,333
2,40.70716,-74.01155,0,1,0,0,0,0,0,,-3012.800008,0
3,40.73198,-74.00526,219,1,0,1,1,0,0,0.788151,-3014.38077,219
4,40.62515,-73.94409,82,1,0,0,0,1,0,0.89613,-3003.989748,82


In [39]:
final_test.reviews_per_month_log = final_test.reviews_per_month_log.fillna(0)

In [41]:
linreg.fit(final_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [42]:
y_hat = linreg.predict(final_test)
len(final_test)

7824

In [43]:
ids = test.id.values.reshape(-1,1)
prices = y_hat.reshape(-1,1)

In [44]:
len(ids)
len(prices)

7824

In [45]:
data = np.concatenate((ids,prices),axis=1)
df_final = pd.DataFrame(data=data, columns=['id','price'])
df_final = df_final.astype({'id': 'int32'})
df_final.head()

Unnamed: 0,id,price
0,21399885,227.992868
1,35037136,220.856896
2,13330602,224.75394
3,33951037,230.305907
4,28692911,158.841037


In [46]:
# File for submittion
df_final.to_csv('submission3.csv', index=False)