In [2]:
import pandas as pd
import numpy as np
import math
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA 

In [3]:
#Question 8:
# Columns used in this analysis: ten predictors plus the regression target ('price').
selected_columns = ['host_response_rate', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication','positivity_mean','negativity_mean','positivity_simple_mean','negativity_simple_mean', 'price']

# Load the listings and keep only the desired columns. The explicit .copy()
# makes listings_df an independent frame, so the column assignments below do
# not write into a slice of the frame returned by read_csv (which is what
# triggers pandas' SettingWithCopyWarning).
listings_df = pd.read_csv('listings_new.csv')[selected_columns].copy()

# host_response_rate is handled as text: drop the trailing '%' and convert to float.
listings_df["host_response_rate"] = listings_df["host_response_rate"].str.rstrip('%').astype('float')

# price is handled as text: strip '$' and spaces from both ends, remove the
# ',' separators, then convert to float.
listings_df["price"] = listings_df["price"].str.strip('$ ').str.replace(',','').astype('float')

#Drop NA from these columns
# Reassign instead of inplace=True so the cell reads as one explicit pipeline.
listings_df = listings_df.dropna(subset=selected_columns)

In [6]:
# Separate the target column ('price') from the predictor columns.
y = listings_df['price']
X = listings_df.drop(columns=['price'])

# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the predictors. The scaler is fitted on the training rows only
# and the same fitted transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reduce the standardised predictors to three principal components, again
# fitting on the training rows only.
pca = PCA(n_components=3)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Share of the variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_

# Fit an ordinary least squares model on the three components.
lm = LinearRegression()
reg = lm.fit(X_train, y_train)
coef = reg.coef_
print("The coefficients are", coef)

The coefficients are [-3.14241923 -2.880723   -3.04429126]


In [5]:
# Build functions to calculate five measurements

def squared_error(y_observed,y_predicted):
    """Return the sum of squared differences between predictions and observations.

    Parameters
    ----------
    y_observed : array-like of numbers
        Observed values; converted to a float64 NumPy array.
    y_predicted : array-like of numbers
        Predicted values; converted to a float64 NumPy array.

    Returns
    -------
    numpy.float64
        Sum over the first axis of ``(y_predicted - y_observed) ** 2``.
        For 1-D inputs this is a single scalar; empty input gives ``0.0``.
    """
    y_observed = np.array(y_observed, dtype = np.float64)
    y_predicted = np.array(y_predicted, dtype = np.float64)
    residuals = y_predicted - y_observed
    # np.sum is vectorised (the builtin sum loops over the array element by
    # element in Python). axis=0 reduces over the same axis the builtin sum
    # iterates over.
    return np.sum(residuals * residuals, axis=0)

def r_squared(y_observed,y_predicted):
    """Return R-squared of the predictions relative to a mean-only baseline.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the squared error
    of ``y_predicted`` and ``SS_tot`` is the squared error of a baseline that
    predicts the mean of ``y_observed`` for every observation.

    Parameters
    ----------
    y_observed : array-like of numbers
        Observed values; converted to a float64 NumPy array.
    y_predicted : array-like of numbers
        Predicted values; converted to a float64 NumPy array.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. The value is negative when the predictions
        have a larger squared error than the mean-only baseline.
    """
    y_observed = np.array(y_observed, dtype = np.float64)
    y_predicted = np.array(y_predicted, dtype = np.float64)
    # Compute the mean once and repeat it for every observation; calling
    # y_observed.mean() inside a per-element comprehension recomputes the same
    # value n times (quadratic work for a constant).
    y_mean = np.full(len(y_observed), y_observed.mean())
    squared_error_regr = squared_error(y_observed, y_predicted)
    squared_error_mean = squared_error(y_observed, y_mean)
    return 1 - (squared_error_regr/squared_error_mean)

def mean_squared_error(y_observed,y_predicted):
    """Return the mean of the squared differences between predictions and observations.

    Both arguments are array-likes of numbers and are converted to float64
    NumPy arrays. The result is the value of ``squared_error`` for the two
    arrays scaled by one over the number of observations.
    """
    observed = np.array(y_observed, dtype=np.float64)
    predicted = np.array(y_predicted, dtype=np.float64)
    total = squared_error(observed, predicted)
    # Scale by the reciprocal of the sample size.
    return total * (1 / len(observed))

def root_mean_squared_error(y_observed, y_predicted):
    """Return the square root of the mean squared error.

    Both arguments are array-likes of numbers and are converted to float64
    NumPy arrays before being handed to ``mean_squared_error``.
    """
    observed = np.array(y_observed, dtype=np.float64)
    predicted = np.array(y_predicted, dtype=np.float64)
    mse = mean_squared_error(observed, predicted)
    return math.sqrt(mse)

def MAPE(y_observed, y_predicted):
    """Return the mean absolute percentage error of the predictions.

    Both arguments are array-likes of numbers and are converted to float64
    NumPy arrays. Each error is taken relative to its observed value, so an
    observed value of zero yields a non-finite term.
    """
    observed = np.array(y_observed, dtype=np.float64)
    predicted = np.array(y_predicted, dtype=np.float64)
    # Error of each prediction as a fraction of the observed value.
    relative_error = (observed - predicted) / observed
    percent_error = np.abs(relative_error) * 100
    return percent_error.mean()

In [9]:
#Create a table that shows error rates corresponding to PCA
# Predict on the test set once and reuse the result for every metric instead
# of calling reg.predict(X_test) separately for each column.
y_test_pred = reg.predict(X_test)

d = {
    'CV': ['PCA'],
    'R-squared': [r_squared(y_test, y_test_pred)],
    'MSE': [mean_squared_error(y_test, y_test_pred)],
    'Root MSE': [root_mean_squared_error(y_test, y_test_pred)],
    'MAPE': [MAPE(y_test, y_test_pred)],
}
table = pd.DataFrame(d)

In [10]:
table

Unnamed: 0,CV,R-squared,MSE,Root MSE,MAPE
0,PCA,-0.000458,14319.729914,119.665074,70.287971


In [11]:
#Load error rate table from Part III and append PCA error rate table
error_table = pd.read_csv('error_table.csv')
error_table.append(table)

Unnamed: 0,CV,R-squared,MSE,Root MSE,MAPE
0,Train/Test CV,0.036,11762.56,108.46,67.75
1,K-Fold CV,0.035,11779.53,108.53,71.36
2,LOOCV,0.038,11737.48,108.34,67.94
0,PCA,-0.000458,14319.729914,119.665074,70.287971


In [12]:
#Convert X to dataframe then to CSV for later use
# Stack the PCA-transformed training rows on top of the test rows, so the
# saved rows are in train-then-test order rather than the original row order.
pca_features = np.concatenate((X_train, X_test), axis=0)
pd.DataFrame(pca_features).to_csv('pca.csv', index=False)

In [None]:
"""The cross-validated linear regressions from Part III explain more of the variation in price than the linear regression fitted on the PCA components, because:
    1. The R-squared for PCA is negative and smaller than the R-squared values of the cross-validation models.
    2. A lower Mean Squared Error (MSE) means the predicted values are closer to the actual values. The MSE for PCA is higher than for the cross-validation models.
    3. The Root Mean Squared Error (RMSE) for PCA is higher than for the cross-validation models.
    4. The Mean Absolute Percent Error (MAPE) for PCA is bigger than the MAPE for train/test CV and LOOCV, although it is slightly smaller than the MAPE for K-Fold CV.
"""