In [276]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction import DictVectorizer

In [277]:
mydata = pd.read_csv("AB_NYC_2019.csv")
mydata.head()
mydata.describe()

columns = ["neighbourhood_group", "room_type", "latitude", "longitude", "price", "minimum_nights", "number_of_reviews", 
           "reviews_per_month", "calculated_host_listings_count", "availability_365"]

mydata2 = mydata.copy()
mydata2 = mydata2[columns]
print(mydata2.head())


  neighbourhood_group        room_type  latitude  longitude  price  \
0            Brooklyn     Private room  40.64749  -73.97237    149   
1           Manhattan  Entire home/apt  40.75362  -73.98377    225   
2           Manhattan     Private room  40.80902  -73.94190    150   
3            Brooklyn  Entire home/apt  40.68514  -73.95976     89   
4           Manhattan  Entire home/apt  40.79851  -73.94399     80   

   minimum_nights  number_of_reviews  reviews_per_month  \
0               1                  9               0.21   
1               1                 45               0.38   
2               3                  0                NaN   
3               1                270               4.64   
4              10                  9               0.10   

   calculated_host_listings_count  availability_365  
0                               6               365  
1                               2               355  
2                               1               365  
3       

In [278]:
print(mydata2.isna().sum())

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [279]:
mydata2 = mydata2.fillna(0.0)
print(mydata2.isna().sum())

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


# Question 1


In [280]:
mydata2["neighbourhood_group"].mode()

0    Manhattan
dtype: object

In [281]:
X_full, X_test = train_test_split(mydata2, test_size = 0.2, random_state = 42)
X_train, X_val = train_test_split(X_full, test_size = 0.25, random_state = 42)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(29337, 10)
(9779, 10)
(9779, 10)


In [282]:
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [283]:
y_train = X_train.price
y_val = X_val.price
y_test = X_test.price

print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(29337,)
(9779,)
(9779,)


In [284]:
del X_train["price"]
del X_val["price"]
del X_test["price"]


In [285]:
X_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50
1,Manhattan,Private room,40.70847,-74.00498,1,0,0.0,1,7
2,Bronx,Entire home/apt,40.83149,-73.92766,40,0,0.0,1,0
3,Brooklyn,Entire home/apt,40.66448,-73.99407,2,3,0.08,1,0
4,Manhattan,Private room,40.74118,-74.00012,1,48,1.8,2,67


# Question 2

In [286]:
mydata2.dtypes
numericals = ["latitude", "longitude", "reviews_per_month", "minimum_nights", "number_of_reviews", "calculated_host_listings_count", "availability_365"]
categorical = ["neighbourhood_group", "room_type"]

In [287]:
corrM = mydata2[numericals].corr()
corrM

Unnamed: 0,latitude,longitude,reviews_per_month,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
latitude,1.0,0.084788,-0.018758,0.024869,-0.015389,0.019517,-0.010983
longitude,0.084788,1.0,0.138516,-0.062747,0.059094,-0.114713,0.082731
reviews_per_month,-0.018758,0.138516,1.0,-0.124905,0.589407,-0.047312,0.163732
minimum_nights,0.024869,-0.062747,-0.124905,1.0,-0.080116,0.12796,0.144303
number_of_reviews,-0.015389,0.059094,0.589407,-0.080116,1.0,-0.072376,0.172028
calculated_host_listings_count,0.019517,-0.114713,-0.047312,0.12796,-0.072376,1.0,0.225701
availability_365,-0.010983,0.082731,0.163732,0.144303,0.172028,0.225701,1.0


In [288]:
above_average_train = np.where(y_train >= 152, 1, 0)
above_average_train

array([0, 0, 0, ..., 1, 0, 0])

# Question 3

In [289]:
round(sklearn.metrics.mutual_info_score(X_train[categorical[0]], above_average_train), 2)

0.05

In [290]:
round(sklearn.metrics.mutual_info_score(X_train[categorical[1]], above_average_train), 2)

0.14

# Question 4

In [291]:
dv = DictVectorizer(sparse = False)

X_train_dict = X_train[categorical + numericals].to_dict(orient = 'records')
print(X_train_dict[0])

X_train2 = dv.fit_transform(X_train_dict)

print(X_train2.shape)
print(y_train.shape)

{'neighbourhood_group': 'Brooklyn', 'room_type': 'Entire home/apt', 'latitude': 40.7276, 'longitude': -73.94495, 'reviews_per_month': 0.7, 'minimum_nights': 3, 'number_of_reviews': 29, 'calculated_host_listings_count': 13, 'availability_365': 50}
(29337, 15)
(29337,)


In [292]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=1000)

model.fit(X_train2, above_average_train)

LogisticRegression(max_iter=1000, random_state=42)

In [293]:
X_val_dict = X_val[categorical + numericals].to_dict(orient = 'records')
X_val2 = dv.fit_transform(X_val_dict)

above_average_val = np.where(y_val >= 152, 1, 0)

y_pred = model.predict(X_val2)

In [294]:
df_pred = pd.DataFrame()
df_pred["target"] = above_average
df_pred["Pred"] = y_pred.astype(int)
df_pred["Match"] = df_pred["target"] == df_pred["Pred"]

df_pred.head()

Overall_Accuracy = df_pred["Match"].mean()
df_pred["Match"].mean()

0.790878412925657

# Question 5


In [295]:
features = categorical + numericals
features

['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'reviews_per_month',
 'minimum_nights',
 'number_of_reviews',
 'calculated_host_listings_count',
 'availability_365']

In [296]:
def all_models(features, i):
    
    feats = features.copy()
    feats.pop(i)
    
    X_train_dict = X_train[feats].to_dict(orient = 'records')
    X_train2 = dv.fit_transform(X_train_dict)
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=1000)
    model.fit(X_train2, above_average_train)
    
    X_val_dict = X_val[feats].to_dict(orient = 'records')
    X_val2 = dv.fit_transform(X_val_dict)

    y_pred = model.predict(X_val2)
    
    return (y_pred == above_average_val).mean()

In [297]:


for i in range(0, len(features)):
    accuracy = all_models(features, i)
    Difference = Overall_Accuracy - accuracy
    print(f"Without {features[i]}, difference with overall accuracy is: {Difference}")

Without neighbourhood_group, difference with overall accuracy is: 0.039881378464055595
Without room_type, difference with overall accuracy is: 0.06237856631557415
Without latitude, difference with overall accuracy is: 0.004090397791185141
Without longitude, difference with overall accuracy is: 0.003988137846405504
Without reviews_per_month, difference with overall accuracy is: 0.00010225994477952582
Without minimum_nights, difference with overall accuracy is: 0.0
Without number_of_reviews, difference with overall accuracy is: -0.0006135596686778211
Without calculated_host_listings_count, difference with overall accuracy is: 0.0016361591164740785
Without availability_365, difference with overall accuracy is: 0.009714694754064834


# Question 6

In [298]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [299]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

a = [0, 0.01, 0.1, 1, 10]

for i in range(0, len(a)):
    
    reg_model = Ridge(alpha = i)

    reg_model.fit(X_train2, y_train_log)

    y_pred = reg_model.predict(X_val2)

    print(f"For alpha: {a[i]}, RMSE= {round(mean_squared_error(y_pred, y_val_log), 3)}")

For alpha: 0, RMSE= 0.247
For alpha: 0.01, RMSE= 0.247
For alpha: 0.1, RMSE= 0.247
For alpha: 1, RMSE= 0.247
For alpha: 10, RMSE= 0.247


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
