In [3]:
%config Completer.use_jedi = False

In [13]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

plt.rcParams["figure.figsize"] = [10,8]

## 3.15 Homework

### Dataset

In this homework, we will continue the New York City Airbnb Open Data. You can take it from
[Kaggle](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv)
or download from [here](https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv)
if you don't want to sign up to Kaggle.

We'll keep working with the `'price'` variable, and we'll transform it to a classification task.


### Features

For the rest of the homework, you'll need to use the features from the previous homework with additional two `'neighbourhood_group'` and `'room_type'`. So the whole feature set will be set as follows:

* `'neighbourhood_group'`,
* `'room_type'`,
* `'latitude'`,
* `'longitude'`,
* `'price'`,
* `'minimum_nights'`,
* `'number_of_reviews'`,
* `'reviews_per_month'`,
* `'calculated_host_listings_count'`,
* `'availability_365'`

Select only them.


In [5]:
df = pd.read_csv('../data/raw/AB_NYC_2019-hw2.csv', index_col=['id']).reset_index('id', drop=True)

In [54]:
df = df[[ 'neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365' ]]
df

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.94190,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...
48890,Brooklyn,Private room,40.67853,-73.94995,70,2,0,,2,9
48891,Brooklyn,Private room,40.70184,-73.93317,40,4,0,,2,36
48892,Manhattan,Entire home/apt,40.81475,-73.94867,115,10,0,,1,27
48893,Manhattan,Shared room,40.75751,-73.99112,55,1,0,,6,2


### Question 1

What is the most frequent observation (mode) for the column `'neighbourhood_group'`?

In [11]:
df['neighbourhood_group'].value_counts() # answer is Manhattan

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

### Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to 42.
* Make sure that the target value ('price') is not in your dataframe.

In [16]:
np.random.seed(42)

X = df.loc[:, ~df.columns.isin(['price'])].copy()
y = df.loc[:, ["price"]]

X_full_train, X_test, y_full_train, y_test = train_test_split(X,y, test_size=0.20)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.25)

assert 'price' not in X_full_train.columns



### Question 2

* Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your train dataset.
   * In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?
#### Answer 2:  
reviews_per_month  <==>  number_of_reviews  <==> 0.548304

<!-- 
Example of a correlation matrix for the car price dataset:

<img src="images/correlation-matrix.png" /> -->

In [37]:
X_train.select_dtypes(include=["number"]).corr().stack()[lambda r: (-1<r) & (r<1)].sort_values(ascending=False)

number_of_reviews               reviews_per_month                 0.548304
reviews_per_month               number_of_reviews                 0.548304
calculated_host_listings_count  availability_365                  0.224450
availability_365                calculated_host_listings_count    0.224450
                                reviews_per_month                 0.187967
reviews_per_month               availability_365                  0.187967
number_of_reviews               availability_365                  0.173581
availability_365                number_of_reviews                 0.173581
longitude                       reviews_per_month                 0.145313
reviews_per_month               longitude                         0.145313
availability_365                minimum_nights                    0.144307
minimum_nights                  availability_365                  0.144307
                                calculated_host_listings_count    0.127504
calculated_host_listings_

### Question 3

* Calculate the mutual information score for the two categorical variables that we have. Use the training set only.
* Which of these two variables has bigger score?
* Round it to 2 decimal digits using `round(score, 2)`

In [41]:
from sklearn.metrics import mutual_info_score

In [50]:
cat_features = X_train.select_dtypes(include='object')
cat_features

Unnamed: 0,neighbourhood_group,room_type
9974,Queens,Private room
2685,Manhattan,Entire home/apt
22102,Brooklyn,Private room
39168,Brooklyn,Private room
36467,Manhattan,Private room
...,...,...
26618,Manhattan,Private room
10752,Manhattan,Private room
17506,Queens,Private room
9290,Brooklyn,Entire home/apt


In [53]:
mutual_info_score(cat_features.neighbourhood_group, cat_features.room_type), mutual_info_score(cat_features.room_type, cat_features.neighbourhood_group) # ?

# I don't know what is the answer in this question...

(0.016558962473902455, 0.016558962473902455)

### Question 4

* Now let's train a logistic regression
* For that, we need to turn our price prediction problem into a binary classification task.
* Let's create a variable `above_average` which is `1` if the price is above (or equal to) `152`.
* Remember that we have two categorical variables in the data. Include them using one-hot encoding.
* Fit the model on the training dataset.
   * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
   * `model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)`
* Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

In [94]:
def set_above_average(y):
    return y.assign(above_average = (y.price >= 152).astype('int'))

y_full_train = set_above_average(y_full_train)
y_train = set_above_average(y_train)
y_val = set_above_average(y_val)
y_test = set_above_average(y_test)

In [95]:
X_train.select_dtypes(['object']).nunique()

neighbourhood_group    5
room_type              3
dtype: int64

In [128]:
from sklearn.feature_extraction import DictVectorizer
# from sklearn.preprocessing import OneHotEncoder

class PrepareInput():
    def __init__(self):
        self.vect = DictVectorizer(sparse=False)
        
    def fit(self, X):
        train_dict = X.to_dict(orient='records')
        self.vect.fit(train_dict)
    
    def transform(self, X):
        X = X.copy()
        # fillna with 0...
        X = X.fillna(0)
        return self.vect.transform(X.to_dict(orient='records'))

In [129]:
fi = PrepareInput()
fi.fit(X_train)

In [130]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, verbose=True)

In [131]:
model.fit(fi.transform(X_train), y_train['above_average'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


LogisticRegression(random_state=42, verbose=True)

In [132]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val['above_average'], model.predict(fi.transform(X_val))))

print((model.predict(fi.transform(X_val)) == y_val['above_average']).mean())

0.7868128272251309
0.7868128272251309


In [134]:
acc_val_all_features = accuracy_score(y_val['above_average'], model.predict(fi.transform(X_val)))
acc_train_all_features = accuracy_score(y_train['above_average'], model.predict(fi.transform(X_train)))

In [134]:
# answer of Q4 is 0.78

### Question 5

* We have 10 features: 8 numerical features and 2 categorical.
* Let's find the least useful one using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 
* Which of following feature has the smallest difference? 
   * `neighbourhood_group`
   * `room_type` 
   * `number_of_reviews`
   * `reviews_per_month`

> **note**: the difference doesn't have to be positive

In [138]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [144]:
print(f"""
With all features: 
- training: {acc_train_all_features}
- validation: {acc_val_all_features}
{50*"*"}
""")

differences = {}

for column in X_train.columns:
    # remove column from train
    model_5 = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, verbose=False)
    
    model_5.fit(fi.transform(X_train), y_train['above_average'])
    train_selected = X_train.loc[:, lambda df: ~df.columns.isin([column])]
    score_train = accuracy_score(y_train['above_average'], model_5.predict(fi.transform(train_selected)))
    
    val_selected = X_val.loc[:, lambda df: ~df.columns.isin([column])]
    score_val = accuracy_score(y_val['above_average'], model_5.predict(fi.transform(val_selected)))
    
    print(f"""
    Excluding Feature {column} - Training: {score_train} - Validation: {score_val}
    Difference: training {acc_train_all_features - score_train : .6f} - Validation: {acc_val_all_features - score_val:.6f} 
    {50*"*"}
    """)
    
    differences[column] = {"score_train": score_train, "score_val": score_val, "diff_train":acc_train_all_features - score_train, "diff_val":acc_val_all_features - score_val}

    


With all features: 
- training: 0.7897521202039759
- validation: 0.7868128272251309
**************************************************


    Excluding Feature neighbourhood_group - Training: 0.735758501268032 - Validation: 0.7285667539267016
    Difference: training  0.053994 - Validation: 0.058246 
    **************************************************
    

    Excluding Feature room_type - Training: 0.7097161244580186 - Validation: 0.7049247382198953
    Difference: training  0.080036 - Validation: 0.081888 
    **************************************************
    

    Excluding Feature latitude - Training: 0.3041640533391508 - Validation: 0.3067735602094241
    Difference: training  0.485588 - Validation: 0.480039 
    **************************************************
    

    Excluding Feature longitude - Training: 0.6958632161653623 - Validation: 0.693226439790576
    Difference: training  0.093889 - Validation: 0.093586 
    ************************************************

In [149]:
{key: value for key, value in differences.items() if key in ['neighbourhood_group', 'room_type', 'number_of_reviews', 'reviews_per_month', ] }

{'neighbourhood_group': {'score_train': 0.735758501268032,
  'score_val': 0.7285667539267016,
  'diff_train': 0.0539936189359439,
  'diff_val': 0.058246073298429346},
 'room_type': {'score_train': 0.7097161244580186,
  'score_val': 0.7049247382198953,
  'diff_train': 0.08003599574595732,
  'diff_val': 0.08188808900523559},
 'number_of_reviews': {'score_train': 0.7880886804286766,
  'score_val': 0.784604057591623,
  'diff_train': 0.001663439775299258,
  'diff_val': 0.002208769633507912},
 'reviews_per_month': {'score_train': 0.7894521556543318,
  'score_val': 0.7860765706806283,
  'diff_train': 0.0002999645496440939,
  'diff_val': 0.0007362565445026004}}

In [None]:
# answer is reviews_per_month

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column `'price'`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data.
* This model has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest `alpha`.

In [150]:
def set_price_log(y):
    return y.assign(price_log = np.log1p(y.price))

y_full_train = set_price_log(y_full_train)
y_train = set_price_log(y_train)
y_val = set_price_log(y_val)
y_test = set_price_log(y_test)

In [151]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_squared_log_error

alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas: 
    model = Ridge(alpha = alpha, random_state=42)
    model.fit(fi.transform(X_train), y_train['price_log'])
    score_train = round(mean_squared_log_error(y_train.price_log, model.predict(fi.transform(X_train))), 4)
    score_val = round(mean_squared_log_error(y_val.price_log, model.predict(fi.transform(X_val))), 4)
    print(f"Model with alpha: {alpha} - Train {score_train} - Val {score_val}")
    


  return linalg.solve(A, Xy, sym_pos=True,


Model with alpha: 0 - Train 0.0077 - Val 0.0069
Model with alpha: 0.01 - Train 0.0077 - Val 0.0069
Model with alpha: 0.1 - Train 0.0077 - Val 0.0069
Model with alpha: 1 - Train 0.0077 - Val 0.0069
Model with alpha: 10 - Train 0.0077 - Val 0.0069


In [None]:
# answer same values for all alphas (?) 