In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

df = pd.read_csv('./listings.csv')
df.head()
# my test
df['host_response_rate'] = df['host_response_rate'].str.replace(r'%', r'.0').astype('float') 
df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace(r'%', r'.0').astype('float') 
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,id,scrape_id,host_id,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,square_feet,guests_included,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,reviews_per_month
id,1.0,,0.546514,-0.025325,0.006042,-0.023312,-0.023312,-0.016143,0.01336,-0.074279,-0.061821,-0.064113,-0.050568,-0.170288,-0.103058,-0.021601,0.101529,-0.068067,-0.075651,-0.086914,-0.159447,-0.506105,0.039246,0.045355,0.068112,-0.00471,0.003389,0.105444,0.087858,,-0.051348,0.091716
scrape_id,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
host_id,0.546514,,1.0,-0.024539,-0.010372,-0.069613,-0.069613,-0.024217,0.014749,-0.09962,-0.057076,-0.075722,-0.065197,-0.18473,-0.083187,-0.024572,0.057532,-0.029677,-0.037683,-0.042542,-0.083078,-0.261822,0.027348,0.026768,0.047576,0.022116,0.025023,0.073277,0.078829,,-0.17604,0.106389
host_response_rate,-0.025325,,-0.024539,1.0,-0.007987,0.022147,0.022147,-0.021388,-0.012355,0.005433,0.008252,-0.004668,0.015823,0.02444,0.065056,0.006604,-0.082118,-0.04715,-0.035688,-0.031227,-0.087838,0.104779,0.156567,0.130925,0.170897,0.148516,0.155346,0.070827,0.149482,,-0.087481,0.17114
host_acceptance_rate,0.006042,,-0.010372,-0.007987,1.0,0.004365,0.004365,0.00452,-0.003054,0.012539,0.007817,0.005976,0.011714,,0.009642,-0.000645,-0.013509,-0.019501,-0.017578,-0.016466,-0.016732,0.008944,-0.016218,0.017815,-0.010815,-0.006956,-0.006511,-0.01196,0.012112,,0.006705,0.018555
host_listings_count,-0.023312,,-0.069613,0.022147,0.004365,1.0,1.0,-0.012511,-5.5e-05,0.11121,0.068226,0.065931,0.08549,-0.020224,-0.059289,0.001894,0.022684,0.119792,0.124743,0.124052,0.086038,-0.06222,-0.109357,-0.122957,-0.044087,-0.19073,-0.137222,0.024161,-0.114171,,0.224222,-0.117272
host_total_listings_count,-0.023312,,-0.069613,0.022147,0.004365,1.0,1.0,-0.012511,-5.5e-05,0.11121,0.068226,0.065931,0.08549,-0.020224,-0.059289,0.001894,0.022684,0.119792,0.124743,0.124052,0.086038,-0.06222,-0.109357,-0.122957,-0.044087,-0.19073,-0.137222,0.024161,-0.114171,,0.224222,-0.117272
latitude,-0.016143,,-0.024217,-0.021388,0.00452,-0.012511,-0.012511,1.0,-0.155092,0.000335,-0.015003,0.0391,0.023,-0.048056,0.034452,-0.001222,-0.004705,-0.019751,-0.037074,-0.036991,0.000565,-0.032761,-0.038086,-0.015072,-0.038183,-0.018381,-0.025117,0.096746,-0.019488,,0.02675,-0.084988
longitude,0.01336,,0.014749,-0.012355,-0.003054,-5.5e-05,-5.5e-05,-0.155092,1.0,-0.071584,-0.017041,-0.055045,-0.067682,-0.107369,-0.023828,0.003406,-0.010435,-0.007231,0.002575,0.008444,-0.007926,-0.00826,-0.047121,-0.037005,-0.062576,-0.04699,-0.061539,-0.190567,-0.043979,,0.086343,0.002583
accommodates,-0.074279,,-0.09962,0.005433,0.012539,0.11121,0.11121,0.000335,-0.071584,1.0,0.538439,0.770974,0.861119,0.439057,0.532796,0.017097,0.003291,-0.043169,-0.048761,-0.060468,-0.031535,-0.072978,-0.013101,-0.049665,0.011646,-0.019664,-0.013208,-0.03752,-0.062041,,-0.029525,-0.14415


In [65]:
def percentage(x):
	return x/100.0


In [121]:
def clean_data(df):
    '''
    INPUT
    df - pandas dataframe 
    
    OUTPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector
    
    This function cleans df using the following steps to produce X and y:
    1. Drop all the rows with no salaries
    2. Create X as all the columns that are not the Salary column
    3. Create y as the Salary column
    4. Drop the Salary, Respondent, and the ExpectedSalary columns from X
    5. For each numeric variable in X, fill the column with the mean value of the column.
    6. Create dummy columns for all the categorical variables in X, drop the original columns
    '''
    # Drop rows with missing salary values
    df = df.dropna(subset=['review_scores_rating'], axis=0)
    y = df['review_scores_rating']
    y = y.apply(percentage)
    #Drop respondent and expected salary columns  
    df = df[['number_of_reviews','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','calculated_host_listings_count','reviews_per_month','host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost','host_has_profile_pic']]
    
    # Fill numeric columns with the mean
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        df[col].fillna((df[col].mean()), inplace=True)
    
    # Dummy the categorical variables
    cat_vars = df.select_dtypes(include=['object']).copy().columns
    for var in  cat_vars:
        # for each cat add dummy var, drop original column
        df = pd.concat([df.drop(var, axis=1), pd.get_dummies(df[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
        
    X = df
    return X, y

#Use the function to create X and y
X, y = clean_data(df) 


In [122]:
# Split data into training and test data, and fit a linear model
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=.15, random_state=42)
# pd.set_option("max_rows", None)

lm_2_model = LinearRegression(normalize=True)
# If our model works, it should just fit our model to the data. Otherwise, it will let us know.
try:
    lm_2_model.fit(X_train, y_train)
except Exception as e:
    print(e)
# print(X_train.head())
y_test_preds = lm_2_model.predict(X_test)# Predictions here
r2_test = r2_score(y_test, y_test_preds)
print(r2_test)

1.0


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


