In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

df = pd.read_csv('./listings.csv')
df.head()
# my test
df['host_response_rate'] = df['host_response_rate'].str.replace(r'%', r'.0').astype('float') 
df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace(r'%', r'.0').astype('float') 
# corr = df.corr()
# corr.style.background_gradient(cmap='coolwarm')

In [2]:
def percentage(x):
	return x/100.0


In [5]:
def clean_data(df):
    '''
    INPUT
    df - pandas dataframe 
    
    OUTPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector
    
    This function cleans df using the following steps to produce X and y:
    1. Drop all the rows with no salaries
    2. Create X as all the columns that are not the Salary column
    3. Create y as the Salary column
    4. Drop the Salary, Respondent, and the ExpectedSalary columns from X
    5. For each numeric variable in X, fill the column with the mean value of the column.
    6. Create dummy columns for all the categorical variables in X, drop the original columns
    '''
    # Drop rows with missing salary values
    df = df.dropna(subset=['review_scores_rating'], axis=0)
    y = df['review_scores_rating']
    y = y.apply(percentage)
    #Drop respondent and expected salary columns  
    df = df[['host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost','host_has_profile_pic']]
    
    # Fill numeric columns with the mean
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        df[col].fillna((df[col].mean()), inplace=True)
    
    # Dummy the categorical variables
    cat_vars = df.select_dtypes(include=['object']).copy().columns
    for var in  cat_vars:
        # for each cat add dummy var, drop original column
        df = pd.concat([df.drop(var, axis=1), pd.get_dummies(df[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
        
    X = df
    return X, y

#Use the function to create X and y
X, y = clean_data(df) 


In [9]:
# Split data into training and test data, and fit a linear model
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=.15, random_state=42)
# pd.set_option("max_rows", None)

lm_2_model = LinearRegression(normalize=True)
# If our model works, it should just fit our model to the data. Otherwise, it will let us know.
try:
    lm_2_model.fit(X_train, y_train)
except Exception as e:
    print(e)
# print(X_train.head())
y_test_preds = lm_2_model.predict(X_test)# Predictions here
r2_test = r2_score(y_test, y_test_preds)
print("R2 score =",r2_test)
mse = mean_squared_error(y_test, y_test_preds)
print("Mean Square error score =",mse)

R2 score = 0.07938697268574035
Mean Square error score = 0.004668375856770024


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [66]:
X,y = clean_data(df) 
# print(X)
# print(y)
merged = X.merge(y.to_frame(), left_index=True, right_index=True)
# print("Merged = ",merged)

# newDf = pd.concat([X,y],ignore_index=True)
# toBeSortedDF = newDf[['review_scores_rating','host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost','host_has_profile_pic']]
sortedDF = merged.sort_values(by=['review_scores_rating'],ascending=False)
sortedDF = sortedDF.rename(columns={"host_response_rate": "hrr", "host_acceptance_rate": "har","host_response_time_within a day": "hrtwad", "host_response_time_within a few hours": "hrtwafh",
                        "host_response_time_within an hour": "hrtwah", "host_is_superhost_t" : "superhost", "host_has_profile_pic_t":"prof_pic", "review_scores_rating":"rating"})
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
# print(sortedDF.head(1000))
# print(y.sort_values(ascending=True))
# print(sortedDF[sortedDF['rating'].between(0, 0.8)])
print(sortedDF[sortedDF['rating'].between(0, 0.75)])


             hrr        har  hrtwad  hrtwafh  hrtwah  superhost  prof_pic  rating
33     71.000000  100.00000       0        0       1          0         1    0.75
940    70.000000  100.00000       0        1       0          0         1    0.75
1133  100.000000  100.00000       0        0       1          0         1    0.75
732    80.000000  100.00000       0        1       0          0         1    0.74
2859  100.000000  100.00000       0        0       1          0         1    0.74
718    80.000000  100.00000       0        1       0          0         1    0.73
1190  100.000000  100.00000       1        0       0          0         1    0.73
1126  100.000000   99.96288       1        0       0          0         1    0.73
2777   43.000000  100.00000       0        0       0          0         1    0.73
1512   88.000000  100.00000       1        0       0          0         1    0.73
1153  100.000000   99.96288       0        1       0          0         1    0.73
352    40.000000