In [9]:
import csv
import pandas as pd
import numpy as np
df = pd.read_csv("train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
id                        74111 non-null int64
log_price                 74111 non-null float64
property_type             74111 non-null object
room_type                 74111 non-null object
amenities                 74111 non-null object
accommodates              74111 non-null int64
bathrooms                 73911 non-null float64
bed_type                  74111 non-null object
cancellation_policy       74111 non-null object
cleaning_fee              74111 non-null bool
city                      74111 non-null object
description               74111 non-null object
first_review              58247 non-null object
host_has_profile_pic      73923 non-null object
host_identity_verified    73923 non-null object
host_response_rate        55812 non-null object
host_since                73923 non-null object
instant_bookable          74111 non-null object
last_review               582

In [10]:
# parsing the amenities variable
df['amenities'] = df['amenities'].str.replace('{', '')
df['amenities'] = df['amenities'].str.replace('}', '')
df['amenities'] = df['amenities'].str.replace('"', '')

# make a tuple to store all the amenities
amenities_all = np.unique(np.concatenate(df['amenities'].map(lambda x: x.split(","))))

# match each record's amenities
for amenity in amenities_all:
    df[amenity] = df['amenities'].map(lambda x: True if x.find(amenity) > 0 else False)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 160 entries, id to translation missing: en.hosting_amenity_50
dtypes: bool(132), float64(7), int64(3), object(18)
memory usage: 25.2+ MB


In [12]:
# exclude variables that are meaningless or are out of this project's scope
df = df.drop(['id', 'thumbnail_url', 'amenities', 'latitude', 'longitude', 'name', 'description', 'neighbourhood'], axis=1)

In [7]:
# exclude variables that do not exist at the initial listing
df = df.drop(['first_review', 'last_review', 'number_of_reviews', 'review_scores_rating', 'host_response_rate', 'host_since'], axis=1)

KeyError: "labels ['first_review' 'last_review' 'number_of_reviews' 'review_scores_rating'\n 'host_response_rate' 'host_since'] not contained in axis"

In [13]:
# transform true/false features to binary
df['host_has_profile_pic'] = df['host_has_profile_pic'].map(lambda x: 0 if x == "f" else 1)
df['host_identity_verified'] = df['host_identity_verified'].map(lambda x: 0 if x == "f" else 1)
df['instant_bookable'] = df['instant_bookable'].map(lambda x: 0 if x == "f" else 1)
df['cleaning_fee'] = df['cleaning_fee'].map(lambda x: 0 if x == False else 1)

In [14]:
# clean zipcode
df['zipcode'] = df['zipcode'].astype(str)
df['zipcode']=df['zipcode'].map(lambda x: x.strip('.0'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('lm'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('Near'))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\n", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\r", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("-", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace(" ", ""))

df['zipcode']=df['zipcode'].map(lambda x: x[:5])
df['zipcode']=df['zipcode'].map(lambda x: x.zfill(5))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("00nan", "NA"))

In [15]:
# transform categorical variables to dummies
df['property_type'] = df['property_type'].astype('category')
df['room_type'] = df['room_type'].astype('category')
df['bed_type'] = df['bed_type'].astype('category')
df['cancellation_policy'] = df['cancellation_policy'].astype('category')

for i in ['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city']:
    df = pd.concat([df, pd.get_dummies(df[i])], axis=1)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 858 entries, log_price to SF
dtypes: bool(131), category(4), float64(5), int64(6), object(6), uint8(706)
memory usage: 69.1+ MB


In [17]:
df.head()

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,first_review,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
0,5.010635,Apartment,Entire home/apt,3,1.0,Real Bed,strict,1,NYC,2016-06-18,...,0,0,0,0,0,0,0,0,1,0
1,5.129899,Apartment,Entire home/apt,7,1.0,Real Bed,strict,1,NYC,2017-08-05,...,0,0,0,0,0,0,0,0,1,0
2,4.976734,Apartment,Entire home/apt,5,1.0,Real Bed,moderate,1,NYC,2017-04-30,...,0,0,0,0,0,0,0,0,1,0
3,6.620073,House,Entire home/apt,4,1.0,Real Bed,flexible,1,SF,,...,0,0,0,0,0,0,0,0,0,1
4,4.744932,Apartment,Entire home/apt,2,1.0,Real Bed,moderate,1,DC,2015-05-12,...,0,0,0,0,0,0,1,0,0,0


In [18]:
df1 = df.dropna()

In [19]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47786 entries, 1 to 74110
Columns: 858 entries, log_price to SF
dtypes: bool(131), category(4), float64(5), int64(6), object(6), uint8(706)
memory usage: 44.9+ MB


In [20]:
df1 = df1.drop(['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city'], axis=1)

In [21]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47786 entries, 1 to 74110
Columns: 852 entries, log_price to SF
dtypes: bool(131), float64(5), int64(6), object(4), uint8(706)
memory usage: 44.0+ MB


In [22]:
df1.head()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
1,5.129899,7,1.0,1,2017-08-05,1,0,100%,2017-06-19,1,...,0,0,0,0,0,0,0,0,1,0
2,4.976734,5,1.0,1,2017-04-30,1,1,100%,2016-10-25,1,...,0,0,0,0,0,0,0,0,1,0
4,4.744932,2,1.0,1,2015-05-12,1,1,100%,2015-03-01,1,...,0,0,0,0,0,0,1,0,0,0
5,4.442651,2,1.0,1,2017-08-27,1,1,100%,2017-06-07,1,...,0,0,0,0,0,0,0,0,0,1
6,4.418841,3,1.0,1,2017-03-10,1,0,100%,2017-03-03,1,...,0,0,0,0,0,0,0,1,0,0


In [23]:
df1.describe()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,number_of_reviews,review_scores_rating,bedrooms,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
count,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,...,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0,47786.0
mean,4.751305,3.325032,1.239767,0.822082,0.998472,0.73103,0.293266,30.783514,94.227472,1.28161,...,0.0,2.1e-05,0.0,0.011259,0.051668,0.064224,0.073557,0.321015,0.405998,0.083539
std,0.675713,2.236295,0.582534,0.382448,0.039056,0.443429,0.455264,43.375286,7.215772,0.865018,...,0.0,0.004575,0.0,0.105508,0.221358,0.245154,0.261052,0.466871,0.491089,0.276698
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.304065,2.0,1.0,1.0,1.0,0.0,0.0,5.0,92.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.70048,2.0,1.0,1.0,1.0,1.0,0.0,14.0,96.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.170484,4.0,1.0,1.0,1.0,1.0,1.0,39.0,100.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,7.600402,16.0,8.0,1.0,1.0,1.0,1.0,605.0,100.0,10.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
X = df1.values[:,1:846]
y = df1.values[:,0]
X.shape

(47786, 845)

In [18]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesRegressor()
clf = clf.fit(X, y)
clf.feature_importances_  

array([4.51407397e-02, 4.56800489e-02, 8.60439675e-03, 8.70265661e-04,
       6.69432777e-03, 5.90275792e-03, 5.37635123e-02, 1.76046245e-02,
       0.00000000e+00, 8.31292644e-05, 4.35362016e-03, 4.18162349e-05,
       3.86334315e-05, 4.89999660e-03, 5.17195369e-06, 3.02742941e-04,
       9.94792156e-05, 4.06303567e-05, 1.64370047e-04, 0.00000000e+00,
       1.12275281e-03, 2.80072524e-06, 4.91562212e-05, 7.47101248e-05,
       6.03177623e-04, 0.00000000e+00, 3.94423015e-03, 4.69556614e-03,
       6.64514777e-03, 5.17853408e-03, 1.69231792e-03, 7.23651385e-05,
       4.10500801e-04, 1.77666017e-04, 8.40527860e-05, 3.79123691e-04,
       3.23001444e-04, 1.20826487e-04, 7.88157846e-06, 2.24988560e-04,
       5.62339220e-04, 1.81974039e-03, 3.31842026e-03, 2.25994399e-04,
       4.41041395e-03, 1.24252359e-05, 8.89536251e-03, 1.27741717e-03,
       4.46908271e-03, 2.81843811e-04, 5.37467759e-04, 5.39332280e-03,
       5.12833936e-03, 1.08505111e-04, 7.03225015e-07, 3.95159062e-05,
      

In [19]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

(73766, 94)

In [20]:
#Standardizing the features
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_new)
X_std = sc.transform(X_new)
print('Shape before standardizing:', X_new.shape)
print('Shape after standardizing:', X_std.shape)

Shape before standardizing: (73766, 94)
Shape after standardizing: (73766, 94)


In [19]:
# To write a Python 2/3 compatible codebase, the first step is to add this line to the top of each module
from __future__ import division, print_function, unicode_literals

%matplotlib inline
# Import necessary libraries and specify that graphs should be plotted inline. 
from sklearn import linear_model # The sklearn.linear_model module implements generalized linear models
import numpy as np # NumPy is the package for scientific computing with Python


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()                    #display figure
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples") #y label title
    plt.ylabel("Score")             #x label title
    
    # Class learning_curve determines cross-validated training and test scores for different training set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    
    # Cross validation statistics for training and testing data (mean and standard deviation)
    train_scores_mean = np.mean(train_scores, axis=1) # Compute the arithmetic mean along the specified axis.
    train_scores_std = np.std(train_scores, axis=1) # Compute the standard deviation along the specified axis.
    test_scores_mean = np.mean(test_scores, axis=1) # Compute the arithmetic mean along the specified axis.
    test_scores_std = np.std(test_scores, axis=1)   # Compute the standard deviation along the specified axis.

    plt.grid() # Configure the grid lines

    # Fill the area around the line to indicate the size of standard deviations for the training data
    # and the test data, the smaller the area the lesser the standard deviation hence the better the model
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r") # train data performance indicated with red
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g") # test data performance indicated with green
    
    # Cross-validation means indicated by dots
    # Train data performance indicated with red
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    # Test data performance indicated with green
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best") #Show legend of the plot at the best location possible
    return plt # function that returns the plot as an output

In [None]:
# Determines cross-validated training and test scores for different training set sizes
#from sklearn.model_selection import learning_curve 
# Random permutation cross-validator
#from sklearn.model_selection import ShuffleSplit
#importing the Knn Regressor
#from sklearn.neighbors import KNeighborsRegressor
# plots some lines in a plotting area, decorates the plot with labels, etc
#import matplotlib.pyplot as plt

#title = "Learning Curve (kNN)"

# Plots the learning curve based on the previously defined function for the kNN classifier. Uses the 
# random permutation cross-validator
#cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
#estimator = KNeighborsRegressor()
#plot_learning_curve(estimator, title, X_std, y, (0.5, 1.01), cv=cv, n_jobs=4)

#plt.show() # Display the figure

In [22]:
# grid-search over a parameter grid.
from sklearn.model_selection import GridSearchCV #http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.neighbors import KNeighborsRegressor

# Choosing k for kNN AND type of distance
gs_knn = GridSearchCV(estimator=KNeighborsRegressor(p=2, 
                           metric='minkowski'),
                  param_grid=[{'n_neighbors': [15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51],
                               'weights':['uniform','distance']}],
                  scoring='neg_mean_squared_error',
                  cv=5,
                  n_jobs=20)
gs_knn = gs_knn.fit(X_std, y)          
print("\nBest knn model with no. of neighbors & type of weight")
print(gs_knn.best_score_)
print(gs_knn.best_params_)
print(gs_knn.best_estimator_)


Best knn model with no. of neighbors & type of weight
-0.21767571847128386
{'n_neighbors': 19, 'weights': 'distance'}
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=19, p=2,
          weights='distance')


In [25]:
f=open('results.txt','w')


In [26]:
gs_knn.cv_results_



{'mean_fit_time': array([13.3823041 , 15.92011433, 18.50915394, 18.81133962, 12.69866304,
        13.12635894, 15.04856677, 14.46600909, 12.17481003, 13.13518744,
        13.53155808, 13.28531847, 11.88633356, 12.43348484, 12.97781444,
        13.32771158, 11.93871741, 12.48673267, 12.41730165, 12.46122036,
        12.19207768, 12.01080594, 12.53212085, 12.4685328 , 12.15120339,
        12.14850302, 12.40192103, 12.08336649, 12.1863214 , 12.20965343,
        12.29046307, 12.15842204, 12.09757953, 12.50708976, 12.54766078,
        12.58938251, 11.84578285, 11.93269701]),
 'std_fit_time': array([0.73218147, 1.79853146, 0.68789456, 0.40537846, 0.86631292,
        0.31790659, 1.11853715, 0.33158628, 0.33761048, 0.45021601,
        0.82896402, 0.60719606, 0.39494082, 0.58321047, 0.46949394,
        0.35224705, 0.39142227, 0.64728016, 0.42489308, 0.67104767,
        0.38221128, 0.40948014, 0.77365683, 0.52914529, 0.52895317,
        0.65520444, 0.28420666, 0.25229721, 0.47958974, 0.37880668,

In [27]:
gs_knn.error_score

'raise'

In [26]:
gs_knn.best_index_

NameError: name 'gs_knn' is not defined

In [30]:
gs_knn.error_score

'neg_mean_squared_error'

In [25]:
from sklearn.neighbors import KNeighborsRegressor

airbnb_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=19, p=2,
          weights='distance')