# Machine Learning 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Data Cleaning 

In [2]:
def price_to_float(price):
    price = float(price.replace('$','').replace(',',''))
    return price

In [3]:
def outlier(df, column):
    df[column] = df[column]._get_numeric_data()
    q1 = df[column].quantile(0.25)
    print(q1)
    q3 = df[column].quantile(0.75)
    print(q3)
    iqr = q3 - q1

    lower_bound = q1 -(1.5 * iqr) 
    upper_bound = q3 +(1.5 * iqr)

    df = df.drop(df[df[column] < lower_bound].index)
    df = df.drop(df[df[column] > upper_bound].index)
            
    
    return(df)


In [4]:
listings = pd.read_csv('/Users/victorkausch/Desktop/Data Analytics/listings.csv')
original = listings.copy()

In [5]:
listings['price'] = listings['price'].map(price_to_float)

In [6]:
ATTRS = ['beds', 'bedrooms', 'room_type', 'amenities', 'price']

In [7]:
df = listings[ATTRS]

In [8]:
# check number of missing values per column
for col in df.columns:
    print(col)
    print(pd.isnull(df[col]).sum())

beds
29
bedrooms
23
room_type
0
amenities
0
price
0


In [9]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Feature Engineering

In [10]:
df.loc[11,"amenities"]
# .loc function calls on the row number and a specific column 

'{TV,Wifi,Kitchen,Elevator,"Indoor fireplace",Heating,"Family/kid friendly",Washer,"Smoke detector",Essentials,Hangers,Iron,"High chair","Children’s books and toys","Pack ’n Play/travel crib","Hot water",Refrigerator,"Dishes and silverware","Long term stays allowed"}'

In [11]:
am = df.loc[11,"amenities"]

In [12]:
am.replace("{","").replace("}","").split(",")

['TV',
 'Wifi',
 'Kitchen',
 'Elevator',
 '"Indoor fireplace"',
 'Heating',
 '"Family/kid friendly"',
 'Washer',
 '"Smoke detector"',
 'Essentials',
 'Hangers',
 'Iron',
 '"High chair"',
 '"Children’s books and toys"',
 '"Pack ’n Play/travel crib"',
 '"Hot water"',
 'Refrigerator',
 '"Dishes and silverware"',
 '"Long term stays allowed"']

In [13]:
def clean_amenities(a):
    return a.replace("{","").replace("}","").split(",")

# function for cleaning the list within the amenities column 

In [14]:
df["amenities_list"] = df["amenities"].map(clean_amenities)

#map function copies the previous above mentioned function onto the whole column 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df["amenities_list"].values

array([list(['"Cable TV"', 'Internet', 'Wifi', '"Free street parking"', 'Heating', '"Family/kid friendly"', 'Washer', '"Smoke detector"', 'Essentials', '"Lock on bedroom door"', '"Hair dryer"', '"Hot water"', '"Host greets you"']),
       list(['TV', '"Cable TV"', 'Wifi', 'Kitchen', 'Gym', 'Heating', '"Family/kid friendly"', '"Smoke detector"', 'Essentials', 'Shampoo', '"Lock on bedroom door"', 'Hangers', '"Hair dryer"', 'Iron', '"Laptop friendly workspace"', '"Self check-in"', 'Lockbox', '"Private living room"', 'Bathtub', '"Hot water"', '"Bed linens"', '"Extra pillows and blankets"', 'Microwave', '"Coffee maker"', 'Refrigerator', 'Dishwasher', '"Dishes and silverware"', '"Cooking basics"', 'Stove', '"Luggage dropoff allowed"', '"Long term stays allowed"']),
       list(['Internet', 'Wifi', 'Kitchen', '"Buzzer/wireless intercom"', 'Heating', '"Family/kid friendly"', 'Washer', '"Smoke detector"', '"Carbon monoxide detector"', '"First aid kit"', '"Safety card"', '"Fire extinguisher"', '

In [16]:
def flatten_nested_list(nested_list):
    return [item for sub_list in nested_list for item in sub_list]

In [17]:
amenities_flat = flatten_nested_list(df["amenities_list"].values)

In [18]:
amenities_flat

['"Cable TV"',
 'Internet',
 'Wifi',
 '"Free street parking"',
 'Heating',
 '"Family/kid friendly"',
 'Washer',
 '"Smoke detector"',
 'Essentials',
 '"Lock on bedroom door"',
 '"Hair dryer"',
 '"Hot water"',
 '"Host greets you"',
 'TV',
 '"Cable TV"',
 'Wifi',
 'Kitchen',
 'Gym',
 'Heating',
 '"Family/kid friendly"',
 '"Smoke detector"',
 'Essentials',
 'Shampoo',
 '"Lock on bedroom door"',
 'Hangers',
 '"Hair dryer"',
 'Iron',
 '"Laptop friendly workspace"',
 '"Self check-in"',
 'Lockbox',
 '"Private living room"',
 'Bathtub',
 '"Hot water"',
 '"Bed linens"',
 '"Extra pillows and blankets"',
 'Microwave',
 '"Coffee maker"',
 'Refrigerator',
 'Dishwasher',
 '"Dishes and silverware"',
 '"Cooking basics"',
 'Stove',
 '"Luggage dropoff allowed"',
 '"Long term stays allowed"',
 'Internet',
 'Wifi',
 'Kitchen',
 '"Buzzer/wireless intercom"',
 'Heating',
 '"Family/kid friendly"',
 'Washer',
 '"Smoke detector"',
 '"Carbon monoxide detector"',
 '"First aid kit"',
 '"Safety card"',
 '"Fire exti

In [19]:
from collections import Counter

In [20]:
Counter(["a", "b", "c", "a", "d"])

Counter({'a': 2, 'b': 1, 'c': 1, 'd': 1})

In [21]:
amenities_count = Counter(amenities_flat)

In [22]:
amenities_count

Counter({'': 61,
         ' toilet"': 304,
         '"24-hour check-in"': 1218,
         '"Accessible-height bed"': 608,
         '"Accessible-height toilet"': 218,
         '"Air conditioning"': 490,
         '"Air purifier"': 2,
         '"BBQ grill"': 510,
         '"Baby bath"': 368,
         '"Baby monitor"': 147,
         '"Babysitter recommendations"': 305,
         '"Bath towel"': 43,
         '"Bathroom essentials"': 48,
         '"Bathtub with bath chair"': 30,
         '"Beach essentials"': 93,
         '"Beach view"': 1,
         '"Bed linens"': 6746,
         '"Bedroom comforts"': 48,
         '"Body soap"': 43,
         '"Breakfast table"': 11,
         '"Building staff"': 368,
         '"Buzzer/wireless intercom"': 5487,
         '"Cable TV"': 3658,
         '"Carbon monoxide detector"': 2573,
         '"Ceiling fan"': 1,
         '"Ceiling hoist"': 1,
         '"Changing table"': 364,
         '"Children’s books and toys"': 1365,
         '"Children’s dinnerware"': 654,

In [23]:
# number of unique words in original amenities
len(amenities_count)

176

In [24]:
# sort by value (i.e. number of occurences) in reverse order, i.e. largest  numbers first
sorted(amenities_count.items(), key= lambda x: x[1], reverse=True)

[('Wifi', 23302),
 ('Heating', 22953),
 ('Kitchen', 22871),
 ('Essentials', 22201),
 ('Washer', 19808),
 ('"Hair dryer"', 16656),
 ('"Laptop friendly workspace"', 16052),
 ('Hangers', 15695),
 ('Iron', 12663),
 ('Shampoo', 11795),
 ('"Hot water"', 11366),
 ('TV', 11042),
 ('"Smoke detector"', 8620),
 ('Refrigerator', 7666),
 ('"Dishes and silverware"', 7403),
 ('"Family/kid friendly"', 7180),
 ('Internet', 7113),
 ('"Cooking basics"', 6931),
 ('Stove', 6930),
 ('"Bed linens"', 6746),
 ('"Lock on bedroom door"', 6357),
 ('"Host greets you"', 6294),
 ('Oven', 6229),
 ('"Free street parking"', 5558),
 ('Elevator', 5539),
 ('"Buzzer/wireless intercom"', 5487),
 ('"Coffee maker"', 5383),
 ('"Smoking allowed"', 4606),
 ('"First aid kit"', 4576),
 ('Dishwasher', 4573),
 ('Dryer', 3857),
 ('"Cable TV"', 3658),
 ('"Long term stays allowed"', 3619),
 ('"Private entrance"', 3597),
 ('"Fire extinguisher"', 3585),
 ('"translation missing: en.hosting_amenity_50"', 3497),
 ('"Patio or balcony"', 3451

In [25]:
main_amenities = ['Wifi', 'Heating', 'Kitchen']

In [26]:
def is_in_amenities(key, a):
    if key in a:
        return 1
    else:
        return 0

In [27]:
for a in main_amenities:
    df["has_" + a] = df["amenities_list"].map(lambda x: is_in_amenities(a, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
# put it all together

def clean_amenities(a):
        return a.replace("{","").replace("}","").split(",")
    
def is_in_amenities(a, amenities):
    """Return 1 if a single amenity, e.g. 'Wifi', is present in a list of amenities. 
    Apply this function using map to a pandas column"""
    if a in amenities:
        return 1
    else:
        return 0
    
def extract_amenities(df, amenities="all"):
    """Add to your dataframe df columns indicating whether a certain amenity is represented in the apartment (row).
    Specify the amenities of interest using pthe parameter amenities, e.g. amenities = ['Wifi', 'Heating', 'Kitchen'].
    >>>extract_amenities(df, amenities = ['Wifi', 'Heating', 'Kitchen'])
    """
    df["amenities_list"] = df["amenities"].map(clean_amenities)
    
    if amenities == "all":
        all_amenities_flat = flatten_nested_list(df["amenities_list"].values)
        amenities = list(set(all_amenities_flat))
        
    for a in amenities:
        df["has_" + a] = df["amenities_list"].map(lambda x: is_in_amenities(a, x))
        
    return df.drop(["amenities_list", "amenities"])

def count_amenities(df):
    """Compute for each amenity the number of occurences in the entire dataset"""
    df["amenities_list"] = df["amenities"].map(clean_amenities)
    all_amenities_flat = flatten_nested_list(df["amenities_list"].values)
    all_amenities_count = Counter(all_amenities_flat)
    
    return sorted(amenities_count.items(), key= lambda x: x[1], reverse=True)
    

In [28]:
df.sample(10)

Unnamed: 0,beds,bedrooms,room_type,amenities,price,amenities_list,has_Wifi,has_Heating,has_Kitchen
19760,4.0,4.0,Entire home/apt,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",1350.0,"[TV, Wifi, ""Air conditioning"", Kitchen, ""Free ...",1,1,1
11096,1.0,1.0,Private room,"{Wifi,Kitchen,""Pets allowed"",Heating,Washer,""S...",15.0,"[Wifi, Kitchen, ""Pets allowed"", Heating, Washe...",1,1,1
10810,1.0,1.0,Entire home/apt,"{TV,Wifi,Kitchen,Washer,Dryer,Essentials,""Hair...",100.0,"[TV, Wifi, Kitchen, Washer, Dryer, Essentials,...",1,0,1
1648,1.0,1.0,Entire home/apt,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",95.0,"[TV, ""Cable TV"", Internet, Wifi, Kitchen, Elev...",1,1,1
23883,1.0,1.0,Private room,"{Internet,Wifi,Kitchen,""Pets live on this prop...",95.0,"[Internet, Wifi, Kitchen, ""Pets live on this p...",1,1,1
11229,1.0,1.0,Private room,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",25.0,"[TV, Wifi, ""Air conditioning"", Kitchen, ""Free ...",1,1,1
7686,1.0,1.0,Entire home/apt,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",30.0,"[Internet, Wifi, Kitchen, ""Buzzer/wireless int...",1,1,1
15843,1.0,1.0,Entire home/apt,"{Wifi,Kitchen,""Smoking allowed"",Heating,Washer...",35.0,"[Wifi, Kitchen, ""Smoking allowed"", Heating, Wa...",1,1,1
3652,5.0,1.0,Entire home/apt,"{TV,Internet,Wifi,Kitchen,Heating,""Family/kid ...",70.0,"[TV, Internet, Wifi, Kitchen, Heating, ""Family...",1,1,1
3358,1.0,1.0,Private room,"{Wifi,Kitchen,Heating,Washer,Essentials,Hanger...",39.0,"[Wifi, Kitchen, Heating, Washer, Essentials, H...",1,1,1


In [29]:
# remove old amenities columns containing strings or lists of strings
df.drop(["amenities", "amenities_list"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
df.sample(5)

Unnamed: 0,beds,bedrooms,room_type,price,has_Wifi,has_Heating,has_Kitchen
22569,2.0,1.0,Entire home/apt,85.0,1,1,1
3148,1.0,1.0,Private room,45.0,1,1,1
13667,1.0,1.0,Private room,25.0,1,1,0
8134,1.0,1.0,Private room,40.0,1,1,1
19310,1.0,0.0,Entire home/apt,60.0,1,1,1


In [31]:
df = pd.get_dummies(data=df, columns=['room_type'], drop_first=True)

In [32]:
df = outlier(df, 'price')
df = outlier(df, 'beds')
df = outlier(df, 'bedrooms')
# we don't need to handle Boolean columns here

33.0
75.0
1.0
2.0
1.0
1.0


## Split in `X` (features) and `y` (target)

In [33]:
X = df.drop('price', axis = 1)
y = df['price'].values

# Train-Test-Split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [35]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.fit_transform(X_test)

In [36]:
y_train_scaled = min_max_scaler.fit_transform(y_train.reshape(-1,1))
y_test_scaled = min_max_scaler.fit_transform(y_test.reshape(-1,1))

In [37]:
print (X_train.shape, y_train.shape)

(13100, 8) (13100,)


In [38]:
print (X_test.shape, y_test.shape)

(4367, 8) (4367,)


# K-Nearest-Neighbor

In [39]:
knn = KNeighborsRegressor(n_neighbors=3)

In [40]:
knn.fit(X_train_scaled, y_train_scaled)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform')

In [41]:
y_pred = knn.predict(X_test_scaled)

In [42]:
mean_squared_error(y_test_scaled, y_pred)

0.027157116568228064

In [43]:
y_test_scaled

array([[0.12307692],
       [0.13076923],
       [0.15384615],
       ...,
       [0.46923077],
       [0.13076923],
       [0.28461538]])

In [44]:
pd.DataFrame([y_test_scaled[:,0], y_pred[:,0]]).transpose().sample(10)

Unnamed: 0,0,1
343,0.161538,0.223844
1064,0.323077,0.484185
2701,0.415385,0.377129
1231,0.476923,0.484185
3713,0.461538,0.389294
572,0.323077,0.389294
3354,0.084615,0.255474
4309,0.684615,0.23601
3838,0.146154,0.23601
1028,0.861538,0.377129


# Linear Regression

In [45]:
reg = LinearRegression().fit(X_train_scaled, y_train_scaled)

In [46]:
reg.coef_

array([[ 0.08740903,  0.        ,  0.04194557,  0.014859  , -0.02341873,
         0.05825603, -0.188203  , -0.225258  ]])

In [47]:
reg.intercept_

array([0.39751228])

In [48]:
y_pred_lin = reg.predict(X_test_scaled)

In [49]:
mean_squared_error(y_test_scaled, y_pred_lin)

0.025763208067694547

## Cross Validation

In [56]:
from sklearn.model_selection import cross_validate, cross_val_score

In [57]:
knn = KNeighborsRegressor(n_neighbors=5)

In [58]:
scores = cross_val_score(knn, X, y, cv=10)

In [59]:
scores

array([ 0.03587591, -0.16187952, -0.14419934, -0.14959881,  0.0640713 ,
        0.04767405,  0.00758319,  0.16788745,  0.16095842, -0.05125026])

In [60]:
scores = cross_validate(knn, X, y, cv=10,return_train_score=True)

In [61]:
scores

{'fit_time': array([0.23000979, 0.17658281, 0.1830349 , 0.17628479, 0.17119789,
        0.17444992, 0.17035198, 0.17194986, 0.17064118, 0.17143679]),
 'score_time': array([0.17886424, 0.18379498, 0.19076824, 0.20095325, 0.18097711,
        0.17920709, 0.1826458 , 0.1721971 , 0.17640901, 0.16705823]),
 'test_score': array([ 0.03587591, -0.16187952, -0.14419934, -0.14959881,  0.0640713 ,
         0.04767405,  0.00758319,  0.16788745,  0.16095842, -0.05125026]),
 'train_score': array([ 0.15903414, -0.00811358,  0.07091511, -0.05353474,  0.14449745,
         0.12798282,  0.08557728,  0.07444454,  0.07257293, -0.21943668])}

In [62]:
lr = LinearRegression()


In [63]:
scores = cross_validate(lr, X, y, cv=10,return_train_score=True)

In [64]:
scores

{'fit_time': array([0.01040888, 0.0071969 , 0.00385714, 0.0053277 , 0.00626612,
        0.00432825, 0.00296497, 0.00298023, 0.00306582, 0.00274587]),
 'score_time': array([0.00111318, 0.00074601, 0.00090098, 0.00061917, 0.00103092,
        0.00053787, 0.00069594, 0.00048685, 0.00158215, 0.00046515]),
 'test_score': array([0.25533579, 0.27824243, 0.2836433 , 0.31173482, 0.28069107,
        0.29517117, 0.27294569, 0.30223836, 0.31284163, 0.24053055]),
 'train_score': array([0.30696281, 0.30760132, 0.30498642, 0.3043179 , 0.30727015,
        0.30456   , 0.30841998, 0.30323026, 0.30270381, 0.30885756])}

In [65]:
train_scores = scores['train_score']
test_score = scores['test_score']

In [66]:
train_scores.mean()

0.3058910214955769

In [67]:
test_score.mean()

0.2833374802577488

# Decision Trees

In [5]:
import pydotplus, graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image