Dataset: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Columns kept for the analysis; everything else in the CSV is discarded.
columns = [
    'neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price',
    'minimum_nights', 'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count', 'availability_365',
]

# Read the CSV, keep only the selected columns, and replace missing values with 0.
data = pd.read_csv('AB_NYC_2019.csv').loc[:, columns].fillna(0)

## Q1) Most frequent value for neighbourhood_group

In [6]:
# `Series.mode` has no axis argument; its only parameter is `dropna`, so a
# positional 0 would silently mean dropna=False. Call it with the defaults.
data['neighbourhood_group'].mode()

0    Manhattan
dtype: object

In [9]:
# Split the data: 20% test first, then 25% of the remainder as validation
# (i.e. 60% / 20% / 20% of the full dataset).
from sklearn.model_selection import train_test_split

data_full_train, data_test = train_test_split(data, test_size=0.2, random_state=42)
data_train, data_val = train_test_split(data_full_train, test_size=0.25, random_state=42)
print(data_train.shape, data_val.shape, data_test.shape)

# Drop the shuffled original index so each split is numbered from 0.
data_train, data_val, data_test = (
    part.reset_index(drop=True) for part in (data_train, data_val, data_test)
)

# Pull the target out of each feature frame; `pop` returns the column and
# removes it from the frame in one step.
y_train = data_train.pop('price').values
y_val = data_val.pop('price').values
y_test = data_test.pop('price').values

# log(1 + price) targets for the regression part.
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
y_test_log = np.log1p(y_test)

(29337, 10) (9779, 10) (9779, 10)


## Q2) Features with biggest correlation

In [17]:
# Feature groups: string-valued columns vs. numeric columns.
categorical = ['neighbourhood_group', 'room_type']
numerical = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Pairwise Pearson correlation between the numerical features of the training split.
data_train[numerical].corr(method="pearson")

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


## Q3) Which variable has the highest mutual information?

In [20]:
from sklearn.metrics import mutual_info_score

# Binary target: 1 when the price is at least 152, otherwise 0.
y_train_binary = (y_train >= 152).astype("int")
y_val_binary = (y_val >= 152).astype("int")

# The same flag is also stored on the training frame for the mutual information step.
data_train['above_avg'] = (y_train >= 152).astype('int')


def mutual_info_price_score(series):
    """Mutual information between `series` and `data_train.above_avg`, rounded to 2 decimals."""
    mi = mutual_info_score(series, data_train.above_avg)
    return mi.round(2)


# Score every categorical feature and list them from most to least informative.
MI = data_train[categorical].apply(mutual_info_price_score)
MI.sort_values(ascending=False)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

## Q4) Accuracy of the model 

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# Learn the feature mapping (one-hot columns for the categorical values,
# pass-through for numeric ones) from the training split only.
train_dict = data_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Encode validation rows with the ALREADY FITTED vectorizer. Refitting here
# would build a separate mapping from validation data, so the columns of X_val
# would not be guaranteed to line up with the columns the model was trained on.
val_dict = data_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# NOTE(review): lbfgs reports reaching its iteration limit on these unscaled
# features; consider scaling the inputs or raising max_iter.
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42).fit(X_train, y_train_binary)
y_pred = model.predict_proba(X_val)[:, 1]
acc = (y_pred >= 0.5)  # hard decision at the 0.5 probability threshold
score = (y_val_binary == acc).mean()  # fraction of validation rows classified correctly
print(score)

0.79


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Q5) Feature selection - the smallest difference in accuracy

In [28]:
# Leave-one-feature-out: retrain without each feature in turn and report how
# much validation accuracy changes relative to the full model (`score`).
for col in (categorical + numerical):
    # Every feature except the one being left out.
    newlst = [name for name in categorical + numerical if name != col]

    # Use a fresh vectorizer per feature subset so the vectorizer fitted for
    # the full model (`dv`) is not overwritten as a side effect of this loop.
    dv_col = DictVectorizer(sparse=False)

    train_dict_col = data_train[newlst].to_dict(orient='records')
    X_train_col = dv_col.fit_transform(train_dict_col)

    # transform(), not fit_transform(): validation rows must be encoded with
    # the mapping learned from the training split.
    val_dict_col = data_val[newlst].to_dict(orient='records')
    X_val_col = dv_col.transform(val_dict_col)

    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42).fit(X_train_col, y_train_binary)
    y_pred = model.predict_proba(X_val_col)[:, 1]
    acc = (y_pred >= 0.5)
    score_wo_col = (y_val_binary == acc).mean()
    print("Model difference accuracy based on val data without",col,":", score-score_wo_col)

Model difference accuracy based on val data without neighbourhood_group : 0.03900296553839866


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without room_type : 0.07356682687391347


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without latitude : 0.003621024644646753


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without longitude : 0.0032119848655282057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without minimum_nights : 0.004234584313324463


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without number_of_reviews : 0.003109724920748569


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without reviews_per_month : 0.004745884037222647


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model difference accuracy based on val data without calculated_host_listings_count : 0.0033142448103078426
Model difference accuracy based on val data without availability_365 : 0.008427242049289352


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Q6) Regression with Scikit-Learn. What's the best alpha?

In [32]:
from sklearn.linear_model import Ridge

def rmse(y, y_pred):
    """Root mean squared error between the targets `y` and the predictions `y_pred`."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())


# Regularisation strengths to compare.
alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas:
    # Fit on the log1p-transformed price and evaluate on the validation split.
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train_log)
    y_log_pred = model.predict(X_val)
    val_rmse = round(rmse(y_val_log, y_log_pred), 3)
    print("RMSE on linear regression model with alpha:",alpha,"is:", val_rmse)

RMSE on linear regression model with alpha: 0 is: 0.497
RMSE on linear regression model with alpha: 0.01 is: 0.497
RMSE on linear regression model with alpha: 0.1 is: 0.497
RMSE on linear regression model with alpha: 1 is: 0.497
RMSE on linear regression model with alpha: 10 is: 0.498


  return linalg.solve(A, Xy, sym_pos=True,
