In [1]:
import csv
import pandas as pd
import numpy as np
df = pd.read_csv("train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
id                        74111 non-null int64
log_price                 74111 non-null float64
property_type             74111 non-null object
room_type                 74111 non-null object
amenities                 74111 non-null object
accommodates              74111 non-null int64
bathrooms                 73911 non-null float64
bed_type                  74111 non-null object
cancellation_policy       74111 non-null object
cleaning_fee              74111 non-null bool
city                      74111 non-null object
description               74111 non-null object
first_review              58247 non-null object
host_has_profile_pic      73923 non-null object
host_identity_verified    73923 non-null object
host_response_rate        55812 non-null object
host_since                73923 non-null object
instant_bookable          74111 non-null object
last_review               582

In [2]:
# parsing the amenities variable
df['amenities'] = df['amenities'].str.replace('{', '')
df['amenities'] = df['amenities'].str.replace('}', '')
df['amenities'] = df['amenities'].str.replace('"', '')

# make a tuple to store all the amenities
amenities_all = np.unique(np.concatenate(df['amenities'].map(lambda x: x.split(","))))

# match each record's amenities
for amenity in amenities_all:
    df[amenity] = df['amenities'].map(lambda x: True if x.find(amenity) > 0 else False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 160 entries, id to translation missing: en.hosting_amenity_50
dtypes: bool(132), float64(7), int64(3), object(18)
memory usage: 25.2+ MB


In [4]:
# exclude variables that are meaningless or are out of this project's scope
df = df.drop(['id', 'thumbnail_url', 'amenities', 'latitude', 'longitude', 'name', 'description', 'neighbourhood'], axis=1)

In [5]:
# exclude variables that do not exist at the initial listing
df = df.drop(['first_review', 'last_review', 'number_of_reviews', 'review_scores_rating', 'host_response_rate', 'host_since'], axis=1)

In [6]:
# transform true/false features to binary
df['host_has_profile_pic'] = df['host_has_profile_pic'].map(lambda x: 0 if x == "f" else 1)
df['host_identity_verified'] = df['host_identity_verified'].map(lambda x: 0 if x == "f" else 1)
df['instant_bookable'] = df['instant_bookable'].map(lambda x: 0 if x == "f" else 1)
df['cleaning_fee'] = df['cleaning_fee'].map(lambda x: 0 if x == False else 1)

In [7]:
# clean zipcode
df['zipcode'] = df['zipcode'].astype(str)
df['zipcode']=df['zipcode'].map(lambda x: x.strip('.0'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('lm'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('Near'))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\n", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\r", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("-", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace(" ", ""))

df['zipcode']=df['zipcode'].map(lambda x: x[:5])
df['zipcode']=df['zipcode'].map(lambda x: x.zfill(5))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("00nan", "NA"))

In [8]:
# transform categorical variables to dummies
df['property_type'] = df['property_type'].astype('category')
df['room_type'] = df['room_type'].astype('category')
df['bed_type'] = df['bed_type'].astype('category')
df['cancellation_policy'] = df['cancellation_policy'].astype('category')

for i in ['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city']:
    df = pd.concat([df, pd.get_dummies(df[i])], axis=1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 852 entries, log_price to SF
dtypes: bool(131), category(4), float64(4), int64(5), object(2), uint8(706)
memory usage: 65.7+ MB


In [10]:
df.head()

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_has_profile_pic,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
0,5.010635,Apartment,Entire home/apt,3,1.0,Real Bed,strict,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
1,5.129899,Apartment,Entire home/apt,7,1.0,Real Bed,strict,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
2,4.976734,Apartment,Entire home/apt,5,1.0,Real Bed,moderate,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
3,6.620073,House,Entire home/apt,4,1.0,Real Bed,flexible,1,SF,1,...,0,0,0,0,0,0,0,0,0,1
4,4.744932,Apartment,Entire home/apt,2,1.0,Real Bed,moderate,1,DC,1,...,0,0,0,0,0,0,1,0,0,0


In [11]:
df1 = df.dropna()

In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73766 entries, 0 to 74110
Columns: 852 entries, log_price to SF
dtypes: bool(131), category(4), float64(4), int64(5), object(2), uint8(706)
memory usage: 65.9+ MB


In [13]:
df1 = df1.drop(['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city'], axis=1)

In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73766 entries, 0 to 74110
Columns: 846 entries, log_price to SF
dtypes: bool(131), float64(4), int64(5), uint8(706)
memory usage: 64.5 MB


In [15]:
df1.head()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,bedrooms,beds,Unnamed: 10,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
0,5.010635,3,1.0,1,1,1,0,1.0,1.0,False,...,0,0,0,0,0,0,0,0,1,0
1,5.129899,7,1.0,1,1,0,1,3.0,3.0,False,...,0,0,0,0,0,0,0,0,1,0
2,4.976734,5,1.0,1,1,1,1,1.0,3.0,False,...,0,0,0,0,0,0,0,0,1,0
3,6.620073,4,1.0,1,1,1,0,2.0,2.0,False,...,0,0,0,0,0,0,0,0,0,1
4,4.744932,2,1.0,1,1,1,1,0.0,1.0,False,...,0,0,0,0,0,0,1,0,0,0


In [16]:
df1.describe()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,bedrooms,beds,flexible,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
count,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,...,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0
mean,4.782761,3.159518,1.23563,0.734891,0.996963,0.674091,0.26294,1.266369,1.71232,0.304137,...,1.4e-05,1.4e-05,1.4e-05,0.012892,0.046878,0.050362,0.076797,0.302944,0.436149,0.086869
std,0.717716,2.155023,0.582462,0.441394,0.055022,0.468717,0.440233,0.853104,1.255326,0.460044,...,0.003682,0.003682,0.003682,0.11281,0.211379,0.218692,0.266271,0.459535,0.49591,0.281645
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.317488,2.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.718499,2.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.220356,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,7.600402,16.0,8.0,1.0,1.0,1.0,1.0,10.0,18.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
X = df1.values[:,1:846]
y = df1.values[:,0]
X.shape

(73766, 845)

In [30]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesRegressor()
clf = clf.fit(X, y)
clf.feature_importances_  

array([5.17894594e-02, 6.50022936e-02, 8.59809459e-03, 8.76433053e-04,
       6.46456119e-03, 5.75977435e-03, 3.94542418e-02, 1.19055656e-02,
       0.00000000e+00, 6.30230743e-05, 4.39969965e-03, 3.69264523e-05,
       4.74249757e-05, 4.83964644e-03, 2.22945250e-05, 2.41065570e-04,
       9.90322160e-05, 8.76245256e-05, 2.21767304e-04, 6.33442661e-07,
       1.04695939e-03, 1.12723354e-06, 3.72592838e-05, 5.15373137e-05,
       6.52591796e-04, 0.00000000e+00, 4.02603727e-03, 4.78477092e-03,
       6.67001389e-03, 5.21806321e-03, 1.62249034e-03, 5.48357806e-05,
       3.75912620e-04, 2.01837385e-04, 8.58588018e-05, 3.53231933e-04,
       3.02772113e-04, 1.58821906e-04, 5.54948600e-06, 2.90316697e-04,
       5.11512729e-04, 1.91028444e-03, 3.33963647e-03, 2.16693834e-04,
       3.55728649e-03, 1.71572891e-05, 8.77934178e-03, 1.26272244e-03,
       4.40891862e-03, 3.38512999e-04, 5.05401669e-04, 5.43566322e-03,
       5.12599617e-03, 1.20606269e-04, 1.51616005e-06, 2.88718523e-05,
      

In [31]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

(73766, 94)

In [32]:
#Standardizing the features
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_new)
X_std = sc.transform(X_new)
print('Shape before standardizing:', X_new.shape)
print('Shape after standardizing:', X_std.shape)

Shape before standardizing: (73766, 94)
Shape after standardizing: (73766, 94)


In [19]:
# To write a Python 2/3 compatible codebase, the first step is to add this line to the top of each module
from __future__ import division, print_function, unicode_literals

%matplotlib inline
# Import necessary libraries and specify that graphs should be plotted inline. 
from sklearn import linear_model # The sklearn.linear_model module implements generalized linear models
import numpy as np # NumPy is the package for scientific computing with Python


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()                    #display figure
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples") #y label title
    plt.ylabel("Score")             #x label title
    
    # Class learning_curve determines cross-validated training and test scores for different training set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    
    # Cross validation statistics for training and testing data (mean and standard deviation)
    train_scores_mean = np.mean(train_scores, axis=1) # Compute the arithmetic mean along the specified axis.
    train_scores_std = np.std(train_scores, axis=1) # Compute the standard deviation along the specified axis.
    test_scores_mean = np.mean(test_scores, axis=1) # Compute the arithmetic mean along the specified axis.
    test_scores_std = np.std(test_scores, axis=1)   # Compute the standard deviation along the specified axis.

    plt.grid() # Configure the grid lines

    # Fill the area around the line to indicate the size of standard deviations for the training data
    # and the test data, the smaller the area the lesser the standard deviation hence the better the model
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r") # train data performance indicated with red
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g") # test data performance indicated with green
    
    # Cross-validation means indicated by dots
    # Train data performance indicated with red
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    # Test data performance indicated with green
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best") #Show legend of the plot at the best location possible
    return plt # function that returns the plot as an output

In [None]:
# Determines cross-validated training and test scores for different training set sizes
#from sklearn.model_selection import learning_curve 
# Random permutation cross-validator
#from sklearn.model_selection import ShuffleSplit
#importing the Knn Regressor
#from sklearn.neighbors import KNeighborsRegressor
# plots some lines in a plotting area, decorates the plot with labels, etc
#import matplotlib.pyplot as plt

#title = "Learning Curve (kNN)"

# Plots the learning curve based on the previously defined function for the kNN classifier. Uses the 
# random permutation cross-validator
#cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
#estimator = KNeighborsRegressor()
#plot_learning_curve(estimator, title, X_std, y, (0.5, 1.01), cv=cv, n_jobs=4)

#plt.show() # Display the figure

In [33]:
# grid-search over a parameter grid.
from sklearn.model_selection import GridSearchCV #http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.neighbors import KNeighborsRegressor

# Choosing k for kNN AND type of distance
gs_knn = GridSearchCV(estimator=KNeighborsRegressor(p=2, 
                           metric='minkowski'),
                  param_grid=[{'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51],
                               'weights':['uniform','distance']}],
                  scoring='neg_mean_squared_error',
                  cv=10,
                  n_jobs=4)
gs_knn = gs_knn.fit(X_std, y)          
print("\nBest knn model with no. of neighbors & type of weight")
print(gs_knn.best_score_)
print(gs_knn.best_params_)
print(gs_knn.best_estimator_)

KeyboardInterrupt: 

In [None]:
#importing the Knn Regressor
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#from sklearn import neighbors

#knn_regressor = 