In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("AB_NYC_2019.csv")

In [2]:
columns = ["neighbourhood_group", "room_type", "latitude", "longitude", "price", "minimum_nights", "number_of_reviews",
          "reviews_per_month", "calculated_host_listings_count", "availability_365"]
data = data[columns]
data.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [3]:
data.isna().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [4]:
data.fillna(0, inplace = True)

In [5]:
data.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

### Question 1
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

**Split the data**
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value ('price') is not in your dataframe.

In [6]:
print("Mode of neighbourhood_group column:", data.neighbourhood_group.mode())

Mode of neighbourhood_group column: 0    Manhattan
dtype: object


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df_full_train, df_test = train_test_split(data, test_size = .2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size = .25, random_state = 42)

len(df_train), len(df_test), len(df_val)

(29337, 9779, 9779)

In [9]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [10]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train["price"]
del df_val["price"]
del df_test["price"]

### Question 2
- Create the correlation matrix for the numerical features of your train dataset.
    - In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [11]:
len(df_train)

29337

In [12]:
df_train.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [13]:
df_full_train.price.mean()

153.75158502914408

In [14]:
df_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [15]:
numerical = ["latitude", "longitude", "minimum_nights", "number_of_reviews",
            "reviews_per_month", "calculated_host_listings_count", "availability_365"]
categorical = ["neighbourhood_group", "room_type"]

In [16]:
df_train[numerical].corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


**Make price binary**
- We need to turn the price variable from numeric into binary.
- Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

In [17]:
import numpy as np
#data['above_average'] = np.where(data['price'] >= 152, 1, 0)

In [18]:
above_average = np.where(y_train >= 152, 1, 0)

In [19]:
above_average

array([0, 0, 0, ..., 1, 0, 0])

In [20]:
df_full_train['above_average'] = np.where(df_full_train['price'] >= 152, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_full_train

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
32645,Brooklyn,Entire home/apt,40.71577,-73.95530,295,3,11,0.87,1,1,1
23615,Manhattan,Private room,40.84917,-73.94048,70,2,2,0.16,1,0,0
31183,Brooklyn,Private room,40.68993,-73.95947,58,2,0,0.00,2,0,0
29260,Brooklyn,Entire home/apt,40.68427,-73.93118,75,3,87,4.91,1,267,0
7275,Queens,Private room,40.74705,-73.89564,38,5,13,0.25,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11284,Manhattan,Shared room,40.84650,-73.94319,60,1,0,0.00,1,0,0
44732,Manhattan,Private room,40.73957,-74.00082,85,2,4,1.90,1,76,0
38158,Manhattan,Entire home/apt,40.78318,-73.97372,130,30,1,0.34,5,261,0
860,Manhattan,Entire home/apt,40.77508,-73.97990,150,2,11,0.13,1,2,0


### Question 3
- Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. - - Use the training set only.
- Which of these two variables has bigger score?
- Round it to 2 decimal digits using round(score, 2)

**Mutual Information-** the mutual information (MI) of two random variables is a measure of the mutual dependence between the two variables. More specifically, it quantifies the "amount of information" obtained about one random variable by observing the other random variable. The concept of mutual information is intimately linked to that of entropy of a random variable, a fundamental notion in information theory that quantifies the expected "amount of information" held in a random variable.

In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
price_bin = pd.DataFrame(above_average, columns = ["price"])

In [24]:
round(mutual_info_score(price_bin.price, df_train.neighbourhood_group), 2)

0.05

In [25]:
round(mutual_info_score(price_bin.price, df_train.room_type), 2)

0.14

In [26]:
def mutual_info_price_score(series):
    return mutual_info_score(series, price_bin.price)

In [27]:
mi = df_train[categorical].apply(mutual_info_price_score)
mi.sort_values()

neighbourhood_group    0.046506
room_type              0.143226
dtype: float64

### Question 4
- Now let's train a logistic regression
- Remember that we have two categorical variables in the data. Include them using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
- Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

In [28]:
df_full_train[numerical].corrwith(df_full_train["price"]).abs()

latitude                          0.035015
longitude                         0.149080
minimum_nights                    0.042740
number_of_reviews                 0.048926
reviews_per_month                 0.051978
calculated_host_listings_count    0.055336
availability_365                  0.080562
dtype: float64

In [29]:
from sklearn.feature_extraction import DictVectorizer

In [30]:
#numerical = ["latitude", "longitude", "minimum_nights", "number_of_reviews",
 #           "reviews_per_month", "calculated_host_listings_count", "availability_365"]
#categorical = ["neighbourhood_group", "room_type"]

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
model = LogisticRegression(solver='lbfgs')
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, above_average)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
(y_val == price_decision).mean()

0.00010225994477962981

In [34]:
y_val

array([ 65,  89, 200, ...,  39,  70,  77])

In [35]:
y_pred

array([0.03272889, 0.5692811 , 0.39651399, ..., 0.09910689, 0.03407817,
       0.63982237])

In [36]:
y_val_ = np.where(y_val >= 152, 1, 0)

In [37]:
y_pred_ = np.where(y_pred >= 0.5, 1, 0)

In [38]:
from sklearn.metrics import accuracy_score
org_score = round(accuracy_score(y_val_, y_pred_), 2)
org_score

0.79

### Question 5
- We have 9 features: 7 numerical features and 2 categorical.
Let's find the least useful one using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?
- neighbourhood_group
- room_type
- number_of_reviews
- reviews_per_month
note: the difference doesn't have to be positive

In [39]:
acc_diff = []
feats = categorical+numerical
for feat in feats:
    dv = DictVectorizer(sparse=False)
    cop = feats.copy()
    cop.remove(feat)
    train_dict = df_train[cop].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[cop].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='lbfgs')
    model.fit(X_train, above_average)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    y_pred_ = np.where(y_pred >= 0.5, 1, 0)
    
    score = round(accuracy_score(y_val_, y_pred_), 2)
    diff = round(org_score - score, 2)
    acc_diff.append(diff)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [40]:
acc_diff

[0.04, 0.07, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01]

In [41]:
feats

['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

### Question 6
For this question, we'll see how to use a linear regression model from Scikit-Learn
We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
Fit the Ridge regression model on the training data.
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
If there are multiple options, select the smallest alpha.

In [47]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [65]:
y_train_log = np.log(1 + y_train)

In [70]:
y_val_log = np.log(1 + y_val)

In [66]:
y_train_log

array([4.60517019, 4.06044301, 4.26267988, ..., 5.70378247, 4.18965474,
       4.53259949])

In [71]:
rmse = []
for i in [0, 0.01, 0.1, 1, 10]:
    clf = Ridge(alpha=i)
    clf.fit(X_train, y_train_log)
    
    y_pred = model.predict(X_val)
    
    rms = round(mean_squared_error(y_val_log, y_pred, squared=False), 3)
    rmse.append(rms)

In [72]:
rmse

[4.511, 4.511, 4.511, 4.511, 4.511]