In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('AB_NYC_2019.csv')

In [3]:
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [4]:
df.columns=df.columns.str.lower().str.replace(" ","_").str.replace("-","_")

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_').str.replace("-","_")

In [5]:
categorical=['neighbourhood_group','room_type']
numerical=['latitude', 'longitude','minimum_nights', 'number_of_reviews','reviews_per_month', 'calculated_host_listings_count',
       'availability_365']
target=['price']

In [6]:
df['reviews_per_month']=df['reviews_per_month'].fillna(0)

In [7]:
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [8]:
df['neighbourhood_group'].value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

# Q1. manhattan

In [9]:
df_hw3=df[categorical+numerical+target]

In [10]:
df_hw3.head(5).T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,brooklyn,manhattan,manhattan,brooklyn,manhattan
room_type,private_room,entire_home/apt,private_room,entire_home/apt,entire_home/apt
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
minimum_nights,1,1,3,1,10
number_of_reviews,9,45,0,270,9
reviews_per_month,0.21,0.38,0.0,4.64,0.1
calculated_host_listings_count,6,2,1,1,1
availability_365,365,355,365,194,0
price,149,225,150,89,80


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
df_full_train, df_test = train_test_split(df_hw3, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [13]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [14]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [15]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [16]:
df_train.head(5).T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,brooklyn,manhattan,bronx,brooklyn,manhattan
room_type,entire_home/apt,private_room,entire_home/apt,entire_home/apt,private_room
latitude,40.7276,40.70847,40.83149,40.66448,40.74118
longitude,-73.94495,-74.00498,-73.92766,-73.99407,-74.00012
minimum_nights,3,1,40,2,1
number_of_reviews,29,0,0,3,48
reviews_per_month,0.7,0.0,0.0,0.08,1.8
calculated_host_listings_count,13,1,1,1,2
availability_365,50,7,0,0,67


In [17]:
df_full_train = df_full_train.reset_index(drop=True)

In [18]:
n=len(numerical)
n

7

In [19]:
corr_mat=pd.DataFrame()

for i in range(n):
    corr_mat[numerical[i]]=df_full_train[numerical].corrwith(df_full_train[numerical[i]]).abs().round(3)

In [20]:
corr_mat

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.081,0.025,0.012,0.014,0.02,0.008
longitude,0.081,1.0,0.063,0.058,0.134,0.115,0.083
minimum_nights,0.025,0.063,1.0,0.078,0.122,0.122,0.141
number_of_reviews,0.012,0.058,0.078,1.0,0.585,0.073,0.175
reviews_per_month,0.014,0.134,0.122,0.585,1.0,0.047,0.166
calculated_host_listings_count,0.02,0.115,0.122,0.073,0.047,1.0,0.223
availability_365,0.008,0.083,0.141,0.175,0.166,0.223,1.0


# Q2.number_of_reviews and reviews_per_month   0.585

In [21]:
above_average=(y_train>=152).astype(int)
above_average, len(above_average)

(array([0, 0, 0, ..., 1, 0, 0]), 29337)

In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, above_average)

In [24]:
mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False).round(2)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

# Q3.room_type

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
dv=DictVectorizer(sparse=False)

train_dicts=df_train[categorical+numerical].to_dict(orient='records')
X_train=dv.fit_transform(train_dicts)

val_dicts=df_val[categorical+numerical].to_dict(orient='records')
X_val=dv.transform(val_dicts)

In [27]:
len(X_train),len(y_train)

(29337, 29337)

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, above_average)

LogisticRegression(random_state=42)

In [31]:
y_pred=model.predict_proba(X_val)[:,1]
y_pred

array([0.03142876, 0.57800103, 0.41595876, ..., 0.09246818, 0.03238082,
       0.63867756])

In [32]:
price_decision=(y_pred>=0.5)
price_decision

array([False,  True, False, ..., False, False,  True])

In [33]:
above_average_val=(y_val>=152).astype(int)

In [39]:
accuracy=(above_average_val==price_decision).mean()
accuracy.round(2)

0.79

# Q4.  0.79

In [55]:
base=numerical+categorical
base

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'neighbourhood_group',
 'room_type']

In [63]:
for i in range(len(base)):
    base=numerical+categorical
    del base[i]
    train_dicts=df_train[base].to_dict(orient='records')
    X_train_diff=dv.fit_transform(train_dicts)

    val_dicts=df_val[base].to_dict(orient='records')
    X_val_diff=dv.transform(val_dicts)
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(X_train_diff, above_average)
    y_pred=model.predict_proba(X_val_diff)[:,1]
    price_decision=(y_pred>=0.5)
    new_accuracy=(above_average_val==price_decision).mean()
    new_accuracy.round(2)
    accuracy_diff=(new_accuracy-accuracy).round(4)
    print(base[i],accuracy_diff)

longitude 0.0001
minimum_nights 0.0005
number_of_reviews -0.0009
reviews_per_month 0.0006
calculated_host_listings_count -0.0012
availability_365 0.0004
neighbourhood_group -0.0047
room_type -0.0353


# Q5. review_per_month

In [41]:
from sklearn.linear_model import Ridge

In [43]:
y_train_logs=np.log1p(y_train)
y_val_logs=np.log1p(y_val)

In [46]:
def rmse(y,y_pred):
    se=(y-y_pred)**2
    mse=se.mean()
    return np.sqrt(mse)

In [50]:
i=0
scores=np.zeros(5)
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train_logs)
    y_pred=model.predict(X_val)
    scores[i]=rmse(y_val_logs, y_pred).round(3)
    print(i," ",a," ",scores[i])
    i=i+1

0   0   0.497
1   0.01   0.497
2   0.1   0.497
3   1   0.497
4   10   0.498


# Q6. 0