In [66]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

In [2]:
# Restrict the load to the columns this analysis works with.
use_cols = [
    "neighbourhood_group",
    "room_type",
    "latitude",
    "longitude",
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
]
data = pd.read_csv("AB_NYC_2019.csv", usecols=use_cols)

In [3]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [4]:
# Count missing values per column.
data.isna().sum()

neighbourhood_group                   0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [5]:
# Replace every missing value with 0. In the null counts shown earlier,
# `reviews_per_month` is the only column that has any.
data = data.fillna(0)

In [6]:
# Question 1: most frequent value of `neighbourhood_group`.

data["neighbourhood_group"].mode()

0    Manhattan
dtype: object

In [26]:
# Hold out 20% of the rows as the test set; the seed is fixed for reproducibility.
full_train, test = train_test_split(data, random_state=42, test_size=0.2)

In [27]:
# Split 25% of the remaining 80% off for validation, i.e. 20% of all rows
# (60/20/20 train/val/test overall).
train, val = train_test_split(full_train, random_state=42, test_size=0.25)

In [28]:
# Keep the target (`price`) of every split as its own Series.
y_full_train = full_train["price"]
y_train = train["price"]
y_val = val["price"]
y_test = test["price"]

In [29]:
# Remove the target column from every feature frame so it cannot be used as an input.
for split_frame in (full_train, train, val, test):
    del split_frame["price"]

In [30]:
# Inspect the dtype of each column.
data.dtypes

neighbourhood_group                object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [31]:
# Question 2: correlation between the numerical features.

categorical = ["neighbourhood_group", "room_type"]
numerical = [
    "latitude",
    "longitude",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
]

# Absolute pairwise correlations, computed on the training split only.
train[numerical].corr().abs()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,0.006246,0.007159,0.019375,0.005891
longitude,0.080301,1.0,0.06066,0.055084,0.134642,0.117041,0.083666
minimum_nights,0.027441,0.06066,1.0,0.07602,0.120703,0.118647,0.138901
number_of_reviews,0.006246,0.055084,0.07602,1.0,0.590374,0.073167,0.174477
reviews_per_month,0.007159,0.134642,0.120703,0.590374,1.0,0.048767,0.165376
calculated_host_listings_count,0.019375,0.117041,0.118647,0.073167,0.048767,1.0,0.225913
availability_365,0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


The largest absolute correlation (≈0.59) is between `number_of_reviews` and `reviews_per_month`.

In [32]:
# Binary training target: 1 when the listing price is at least 152, otherwise 0.
above_average_train = y_train.ge(152).to_numpy().astype(int)
above_average_train

array([0, 0, 0, ..., 1, 0, 0])

In [33]:
# Question 3: mutual information between the binary price target and
# `neighbourhood_group` on the training split, rounded to two decimals.

round(mutual_info_score(above_average_train, train["neighbourhood_group"]), 2)

0.05

In [34]:
# Same mutual information score, this time for `room_type`.
round(mutual_info_score(above_average_train, train["room_type"]), 2)

0.14

`room_type` has the higher mutual information score with the target: 0.14, versus 0.05 for `neighbourhood_group`.

In [35]:
# Question 4

def prepare_X(X, vectorizer=None):
    """Encode a feature frame with a fitted vectorizer.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; each row is converted to a ``{column: value}`` record.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer to apply. Defaults to the notebook-level ``dv``,
        which must have been fitted before the first call.

    Returns
    -------
    Whatever ``vectorizer.transform`` returns for the list of records.
    """
    if vectorizer is None:
        # Looked up at call time, so `dv` may be created after this definition.
        vectorizer = dv
    return vectorizer.transform(X.to_dict(orient="records"))


def prepare_y(y, threshold=152):
    """Binarize prices: 1 where ``y >= threshold``, otherwise 0.

    Parameters
    ----------
    y : array-like of numbers
        Prices to binarize.
    threshold : number, optional
        Inclusive cut-off; defaults to 152, the value used throughout
        this notebook.

    Returns
    -------
    numpy.ndarray
        Array of 0/1 integers with the same length as ``y``.
    """
    return np.where(y >= threshold, 1, 0)


# Convert the training frame to records once and reuse it for both fitting
# and transforming, instead of building the list of dicts twice.
train_records = train.to_dict(orient="records")
dv = DictVectorizer().fit(train_records)

prepared_train = dv.transform(train_records)
prepared_train

<29337x15 sparse matrix of type '<class 'numpy.float64'>'
	with 264033 stored elements in Compressed Sparse Row format>

In [36]:
# Sanity check: view the encoded training matrix as a labelled DataFrame.
# `feature_names_` is used because `get_feature_names()` was removed in
# scikit-learn 1.2; `.toarray()` yields a plain ndarray rather than np.matrix.
pd.DataFrame(prepared_train.toarray(), columns=dv.feature_names_)

Unnamed: 0,availability_365,calculated_host_listings_count,latitude,longitude,minimum_nights,neighbourhood_group=Bronx,neighbourhood_group=Brooklyn,neighbourhood_group=Manhattan,neighbourhood_group=Queens,neighbourhood_group=Staten Island,number_of_reviews,reviews_per_month,room_type=Entire home/apt,room_type=Private room,room_type=Shared room
0,50.0,13.0,40.72760,-73.94495,3.0,0.0,1.0,0.0,0.0,0.0,29.0,0.70,1.0,0.0,0.0
1,7.0,1.0,40.70847,-74.00498,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.00,0.0,1.0,0.0
2,0.0,1.0,40.83149,-73.92766,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00,1.0,0.0,0.0
3,0.0,1.0,40.66448,-73.99407,2.0,0.0,1.0,0.0,0.0,0.0,3.0,0.08,1.0,0.0,0.0
4,67.0,2.0,40.74118,-74.00012,1.0,0.0,0.0,1.0,0.0,0.0,48.0,1.80,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29332,0.0,1.0,40.71748,-73.95685,6.0,0.0,1.0,0.0,0.0,0.0,5.0,0.13,0.0,1.0,0.0
29333,0.0,2.0,40.66397,-73.98538,1.0,0.0,1.0,0.0,0.0,0.0,7.0,0.17,0.0,1.0,0.0
29334,88.0,1.0,40.79994,-73.97001,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.64,0.0,1.0,0.0
29335,0.0,1.0,40.69585,-73.96344,60.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,1.0,0.0


In [18]:
# Logistic regression classifier with a fixed random_state and max_iter set to 10000.
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter = 10000)

In [37]:
# Fit on the encoded training features and the binary price target.
model.fit(prepared_train, above_average_train)

LogisticRegression(max_iter=10000, random_state=42)

In [41]:
# Encode the validation frame with the vectorizer that was fitted on the training data.
prepare_X(val)

<9779x15 sparse matrix of type '<class 'numpy.float64'>'
	with 88011 stored elements in Compressed Sparse Row format>

In [42]:
# Predicted class labels for the validation split.
y_pred_val = model.predict(prepare_X(val))
y_pred_val

array([0, 1, 0, ..., 0, 0, 1])

In [43]:
# Validation accuracy against the binarized validation prices, rounded to two decimals.
round(accuracy_score(y_pred_val, prepare_y(y_val)), 2)

0.79

In [44]:
# Question 5: feature elimination.

# All candidate features: numerical ones first, then categorical ones.
features = numerical + categorical
features

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'neighbourhood_group',
 'room_type']

In [50]:
# Leave-one-feature-out ablation: retrain without each feature in turn and
# record the resulting validation accuracy as (feature, accuracy) pairs.

# The binarized targets do not depend on the dropped feature; compute them once.
y_train_binary = prepare_y(y_train)
y_val_binary = prepare_y(y_val)

accs = []

for feature in features:
    new_features = [name for name in features if name != feature]

    # `dv` was fitted on all features, so the columns of the omitted feature
    # are simply all zeros in the encoded matrices below.
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    model.fit(prepare_X(train[new_features]), y_train_binary)

    y_pred = model.predict(prepare_X(val[new_features]))
    # scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
    accs.append((feature, accuracy_score(y_val_binary, y_pred)))

In [52]:
# Validation accuracy obtained when each feature is left out.
accs

[('latitude', 0.7867880151344718),
 ('longitude', 0.7869925350240311),
 ('minimum_nights', 0.7914919725943348),
 ('number_of_reviews', 0.7915942325391144),
 ('reviews_per_month', 0.790878412925657),
 ('calculated_host_listings_count', 0.7896512935883014),
 ('availability_365', 0.7812659781163718),
 ('neighbourhood_group', 0.7509970344616014),
 ('room_type', 0.7288066264444217)]

In [51]:
# Baseline: the same classifier trained on every feature, scored on the validation split.
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000).fit(
    prepare_X(train), prepare_y(y_train)
)
y_pred = model.predict(prepare_X(val))
original_acc = accuracy_score(y_pred, prepare_y(y_val))
original_acc

0.7910829328152162

In [54]:
# Tabulate the (feature, accuracy) pairs for easier comparison.
accs_df = pd.DataFrame(accs, columns = ["feature", "accuracy"])
accs_df

Unnamed: 0,feature,accuracy
0,latitude,0.786788
1,longitude,0.786993
2,minimum_nights,0.791492
3,number_of_reviews,0.791594
4,reviews_per_month,0.790878
5,calculated_host_listings_count,0.789651
6,availability_365,0.781266
7,neighbourhood_group,0.750997
8,room_type,0.728807


In [61]:
# Absolute change in validation accuracy relative to the all-features baseline.
accs_df["diffs"] = (accs_df["accuracy"] - original_acc).abs()
accs_df

Unnamed: 0,feature,accuracy,diffs
0,latitude,0.786788,0.004295
1,longitude,0.786993,0.00409
2,minimum_nights,0.791492,0.000409
3,number_of_reviews,0.791594,0.000511
4,reviews_per_month,0.790878,0.000205
5,calculated_host_listings_count,0.789651,0.001432
6,availability_365,0.781266,0.009817
7,neighbourhood_group,0.750997,0.040086
8,room_type,0.728807,0.062276


In [62]:
# Row(s) whose removal changed the accuracy the least.
accs_df[accs_df["diffs"].eq(accs_df["diffs"].min())]

Unnamed: 0,feature,accuracy,diffs
4,reviews_per_month,0.790878,0.000205


`reviews_per_month` has the least impact on the model: removing it changes the validation accuracy by only about 0.0002 (0.7909 vs. 0.7911).

In [73]:
# Question 6: ridge regression on log1p(price) for several regularization strengths.

alphas = [0, 0.01, 0.1, 1, 10]

# The encoded matrices and log-transformed targets do not depend on alpha; build them once.
X_train_ridge = prepare_X(train)
X_val_ridge = prepare_X(val)
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

for alpha in alphas:
    model = Ridge(alpha=alpha).fit(X_train_ridge, y_train_log)
    y_pred = model.predict(X_val_ridge)
    # RMSE as sqrt(MSE): the `squared=False` shortcut was removed in
    # scikit-learn 1.6. Arguments follow the (y_true, y_pred) convention.
    rmse = float(np.sqrt(mean_squared_error(y_val_log, y_pred)))
    print(alpha, round(rmse, 3))

0 0.506
0.01 0.506
0.1 0.506
1 0.506
10 0.506


Since the RMSE rounds to the same value (0.506) for every alpha, we go with the smallest alpha, in this case 0.