## References


https://www.kaggle.com/airbnb/seattle/data

## Business Questions - Brainstorming

 - Can we use this dataset to predict the price with a R^2 > 0.8?
  - - am I able to understand the output of the LM or LASSO?
 - Does distance from the center of Seattle significantly influence the pricing model?
  - - 
 - Does our dataset contain characteristics that are internally correlated with other characteristics?
 - Identify pricing outliers
 
 occupancy rate Vs pricing effect?
 

## Importing Relevant Libraries


In [None]:
import pandas as pd
import numpy as np


#scikit learn packages
import sklearn.metrics
from sklearn.metrics import r2_score, mean_squared_error
#from sklearn.metrics import confusion_matrix
#from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression,LassoCV,RidgeCV
#from sklearn.metrics import r2_score, mean_squared_error

#Graphics
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.tree import export_graphviz
#import pydot

#Time functions
#import time


In [None]:
import pandas as pd
import numpy as np

## Importing Relevant Data

In [None]:
df_listing=pd.read_csv("listings.csv")
#f_calendar=pd.read_csv("calendar")
#f_reviews=

## Exploratory Data Analysis

### Display settings

In [None]:
#https://realpython.com/pandas-python-explore-dataset/
pd.set_option("display.max.columns", None)
pd.set_option("display.precision", 2)

### Inspection

In [None]:
df_listing.shape

In [None]:
df_listing.head(3)

In [None]:
df_listing.info()

In [None]:
# Print the value distribution of every column, each under a banner naming the column.
for column in df_listing.columns:
    banner = "\n" + "\n" + "\n" + "#######################   " + column + "   #######################"
    print(banner)
    counts = df_listing[column].value_counts()
    print(counts)

### Useful column categorizations

In [None]:
df_listing.columns

In [None]:
# Columns used for the baseline model (includes the target, 'price')
baseline_cols = [
    'neighbourhood_group_cleansed',
    'property_type',
    'accommodates',
    'price',
    'number_of_reviews',
    'review_scores_rating',
]

# Columns filled with null data or uncleaned data
nonvalue_cols = [
    "experiences_offered", "neighbourhood", "neighbourhood_cleansed",
    "city", "state", "market",
    "smart_location", "country_code", "country",
    "has_availability", "calendar_last_scraped",
    "requires_license", "jurisdiction_names", "license",
    "host_acceptance_rate", "last_review",
    "first_review", "calendar_updated", "host_neighbourhood",
    "zipcode", "bed_type",
]

# Identifiers, URLs, host details and coordinates
metadata_cols = [
    "id", "listing_url", "scrape_id", "last_scraped",
    "thumbnail_url", "medium_url", "picture_url",
    "xl_picture_url", "host_id", "host_url", "host_name",
    "host_since", "host_location", "host_thumbnail_url",
    "host_picture_url", "host_total_listings_count",
    "latitude", "longitude",
]

# Columns holding several values concatenated into one string
concat_cols = ["host_verifications", "amenities"]

# Free-text columns (the two concatenated columns are listed here as well)
textblock_cols = [
    "name", "summary", "space", "description",
    "neighborhood_overview", "notes", "transit", "host_about",
    "street", "host_verifications", "amenities",
]

# Currency amounts stored as strings
dollar_cols = [
    "cleaning_fee", "extra_people", "monthly_price",
    "price", "security_deposit", "weekly_price",
]

# 't' / 'f' flags stored as strings
bool_cols = [
    "host_is_superhost", "host_has_profile_pic", "host_identity_verified",
    "instant_bookable", "is_location_exact",
    "require_guest_phone_verification", "require_guest_profile_picture",
]

# Percentages stored as strings
percentage_cols = ["host_response_rate"]

# Columns dropped to reduce multicollinearity
multicolinearity_cols = ["availability_60", "availability_90", "beds"]

### Which columns are missing data?

In [None]:
def display_nans(df,cut_off):
    '''
    Chart and return the share of missing values per column.

    INPUT : df      - dataframe to inspect
            cut_off - fraction; only columns whose share of NaN values is
                      strictly greater than this are kept

    OUTPUT: dataframe with the columns 'column' and 'na_per', sorted by
            descending NaN share. A bar chart of the same data is drawn and
            the result is also stored in the global df_nan.
    '''
    global df_nan

    # share of NaN per column, highest first
    na_table = (
        (df.isna().sum() / df.shape[0])
        .to_frame()
        .reset_index()
        .rename(columns={'index': 'column', 0: 'na_per'})
        .sort_values('na_per', ascending=False)
        .reset_index(drop=True)
    )

    # keep only the columns above the cut-off
    df_nan = na_table.loc[na_table["na_per"] > cut_off]

    bar_color = sns.color_palette()[0]
    plt.figure(figsize=(16, 16))
    plt.title('Percentage of missing values by column')
    sns.barplot(data=df_nan, y='column', x='na_per', color=bar_color)

    return df_nan
display_nans(df_listing,0.5)

## Data Cleaning

In [None]:
def clean_df(df,target,exclusion_list=None,dollar_cols=None,bool_cols=None,percentage_cols=None):
    '''
    Clean a listings dataframe and build the two modelling datasets.

    INPUT :
        df              - raw dataframe; every step works on a derived copy
        target          - name of the response column; rows where it is NaN are dropped
        exclusion_list  - columns to drop up front (default: none)
        dollar_cols     - columns holding currency strings such as "$1,250.00"
        bool_cols       - columns holding 't' / 'f' flags
        percentage_cols - columns holding percentage strings such as "96%"

    OUTPUT: tuple (df_listing_num, df_listing_cat)
        df_listing_num - numeric columns only, NaN filled with the column mean
        df_listing_cat - all remaining columns with object columns one-hot
                         encoded, NaN filled with the column mean

    Columns with more than 30% NaN are dropped before either dataset is built
    (display_nans also plots them). Columns named in dollar_cols, bool_cols or
    percentage_cols that are absent are skipped.
    '''
    # None defaults avoid sharing one mutable list between calls
    exclusion_list = [] if exclusion_list is None else exclusion_list
    dollar_cols = [] if dollar_cols is None else dollar_cols
    bool_cols = [] if bool_cols is None else bool_cols
    percentage_cols = [] if percentage_cols is None else percentage_cols

    def _fill_mean(col):
        return col.fillna(col.mean())

    # Drop rows where the target has missing values
    df = df.dropna(subset=[target], axis=0)

    # Drop the exclusion list
    df = df.drop(exclusion_list, axis=1)

    # Currency strings -> float. regex=True is explicit because pandas >= 2.0
    # treats the pattern as a literal string by default.
    for col in dollar_cols:
        try:
            stripped = df[col].str.replace("[$, ]", "", regex=True)
        except (KeyError, AttributeError):
            # column missing, or not a string column
            continue
        df[col] = stripped.astype("float")

    # Text bool to bool type
    dic = {'t': True, 'f': False}
    for col in bool_cols:
        if col in df.columns:
            df[col] = df[col].replace(dic).astype("bool")

    # Percentage strings -> fraction
    for col in percentage_cols:
        try:
            stripped = df[col].str.replace("[%, ]", "", regex=True)
        except (KeyError, AttributeError):
            continue
        df[col] = stripped.astype("float").div(100)

    # Drop the columns that are more than 30% NaN
    nan_cols = display_nans(df, 0.3)["column"]
    df = df.drop(nan_cols, axis=1)

    # Numeric-only dataset, mean-imputed
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_listing_num = df.select_dtypes(include=numerics).apply(_fill_mean, axis=0)

    # Full dataset: one-hot encode every object column, then mean-impute
    cat_cols = df.select_dtypes(include=['object']).columns
    df_listing_cat = df.copy()
    for col in cat_cols:
        dummies = pd.get_dummies(df_listing_cat[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=True)
        df_listing_cat = pd.concat([df_listing_cat.drop(col, axis=1), dummies], axis=1)
    df_listing_cat = df_listing_cat.apply(_fill_mean, axis=0)

    return df_listing_num, df_listing_cat


### Remove non value added columns

In [None]:
df_listing=df_listing.drop(nonvalue_cols, axis=1)
df_listing.head()

### Drop metadata columns

In [None]:
df_listing=df_listing.drop(metadata_cols, axis=1)
df_listing.head()

### Drop text blocks

In [None]:
df_listing=df_listing.drop(textblock_cols, axis=1)
df_listing.head()

### Expand concatenated columns

https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
    
https://stackoverflow.com/questions/28121682/quickest-way-to-make-a-get-dummies-type-dataframe-from-a-column-with-a-multiple


I would separate it into different elements,
then have all the unique values as columns,
and then apply an IF formula
that puts a 1 if the row contains the word from the column.


def clean_to_text_dummies(df,col_list,characters,):
    
    # clean characters
    # https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column
    try :df[col]=df[col].str.replace("[$, ]", "")
    
    https://www.rexegg.com/regex-quickstart.html
    for col in col_list:
    df_listing[col].str.get_dummies(sep=',')

df['col2'].str.get_dummies(sep=',')
pd.concat([df, df['col2'].str.get_dummies(sep=',')], axis=1)

### Convert dollars to floats

Inspection of the data shows that there are many currency/numeric columns that are captured as objects as they contain "$"

Let's clean this up

In [None]:
df_listing.head()

In [None]:
def dollar_to_float(df,ls):
    '''
    Convert currency-formatted string columns to floats, in place.

    INPUT :
        df - dataframe (modified in place)
        ls - list of column names holding values such as "$1,250.00"

    OUTPUT: the same dataframe, with "$", "," and spaces stripped from the
            listed columns and the values cast to float. Columns that are
            missing or do not hold strings are left untouched.
    '''
    for col in ls:
        try:
            # regex=True is explicit: pandas >= 2.0 treats the pattern as a
            # literal string by default, which would leave "$" in place and
            # make the float cast fail.
            stripped = df[col].str.replace("[$, ]", "", regex=True)
        except (KeyError, AttributeError):
            # column missing, or not a string column (e.g. already converted)
            continue
        df[col] = stripped.astype("float")
    return df

In [None]:
list_of_obs=[]
df_listing=dollar_to_float(df_listing,dollar_cols)

In [None]:
df_listing.head()

### Convert t/f to boolean

In [None]:
def str_to_bool(df,col_list,dic):
    '''
    Recode text flags as booleans, in place.

    Input
        df       - dataframe, modified in place
        col_list - names of the columns to convert
        dic      - mapping from raw value to boolean, e.g. {'t': True, 'f': False}
    Output
        None; each listed column is replaced by its mapped values cast to bool
    '''
    for name in col_list:
        mapped = df[name].replace(dic)
        df[name] = mapped.astype("bool")

In [None]:
dic={'t': True, 'f': False}

str_to_bool(df_listing,bool_cols,dic)

In [None]:
df_listing.info()

### Convert percentage strings to float

In [None]:
def perc_to_float(df,ls):
    '''
    Convert percentage strings to fractional floats, in place.

    INPUT :
        df - dataframe (modified in place)
        ls - list of column names holding strings such as "96%"

    OUTPUT: None; each listed column has its trailing '%' removed, is cast to
            float and divided by 100 (so "96%" becomes 0.96)
    '''
    # This cell used to define perc_to_float twice; only the second definition
    # ever took effect, so that is the one kept here.
    for col in ls:
        df[col] = df[col].str.rstrip('%').astype('float') / 100

In [None]:
perc_to_float(df_listing,percentage_cols)
df_listing.head(20)


### Cleaning high % Nan Columns

In [None]:
display_nans(df_listing,0.3)


Here we can see that almost 100% of the listings' area values are missing. Although this information would be useful (based on experience), it makes sense to drop this column, as well as the other columns shown above.

I will remove these columns and can re-add them if I'm not meeting my R^2 target.

In [None]:
#nan_cols=["license","square_feet","monthly_price","security_deposit","weekly_price","cleaning_fee"]
nan_cols=df_nan["column"]
df_listing=df_listing.drop(nan_cols,axis=1)
df_listing.head()

### Remove observations where the target column is Nan

In [None]:
#Dropping where the price has missing values
df_listing  = df_listing.dropna(subset=['price'], axis=0)

df_listing

### Fill mean to Numeric Columns

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df_listing_num = df_listing.select_dtypes(include=numerics)

In [None]:
# Mean function
fill_mean = lambda col: col.fillna(col.mean())
# Fill the mean
df_listing_num = df_listing_num.apply(fill_mean, axis=0)

In [None]:
df_listing_num.info()

### All Categorical objects to dummies

In [None]:
    #Pull a list of the column names of the categorical variables
    cat_df = df_listing.select_dtypes(include=['object'])
    cat_cols = cat_df.columns
    cat_cols

In [None]:
#dummy all the cat_cols
df_listing_cat=df_listing.copy()

for col in  cat_cols:
    df_listing_cat = pd.concat([df_listing_cat.drop(col, axis=1), pd.get_dummies(df_listing_cat[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=True)], axis=1)

df_listing_cat



In [None]:
    # Mean function
    fill_mean = lambda col: col.fillna(col.mean())
    # Fill the mean
    df_listing_cat = df_listing_cat.apply(fill_mean, axis=0)

    df_listing_cat

### Baseline dataset


In [None]:
# Baseline dataset: keep only the baseline columns. The categorical baseline
# columns were one-hot encoded in df_listing_cat, so their "<column>_<level>"
# dummy columns are kept as well. (Dropping baseline_cols, as before, removed
# the wanted columns and raised KeyError for the encoded ones.)
baseline_features = [
    col for col in df_listing_cat.columns
    if any(col == base or col.startswith(base + "_") for base in baseline_cols)
]
df_listing_base = df_listing_cat[baseline_features]

df_listing_base

## Model 1: Linear Regression - Model based on Numeric values only

In [None]:
# Split into explanatory and response variables
X = df_listing_num.drop("price", axis=1)
y = df_listing_num["price"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2.
# For ordinary least squares it does not change the fitted predictions (up to
# numerical precision), so it is dropped rather than replaced.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Predict using the model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# Score using the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

#sklearn.metrics.mean_absolute_percentage_error(y_test, y_test_preds, sample_weight=None, multioutput='uniform_average')

In [None]:
test_score

In [None]:
train_score

## Model 2: Linear Regression -  Model based on Select Numeric and categorical values only

In [None]:
# Split into explanatory and response variables
X = df_listing_cat.drop("price", axis=1)
y = df_listing_cat["price"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2.
# For ordinary least squares it does not change the fitted predictions (up to
# numerical precision), so it is dropped rather than replaced.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Predict using the model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# Score using the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

#sklearn.metrics.mean_absolute_percentage_error(y_test, y_test_preds, sample_weight=None, multioutput='uniform_average')

In [None]:
test_score

In [None]:
train_score

## Model 3: Linear Regression - Model based on Select Numeric and categorical values only with multicolinearity reduced

### Multicollinearity Checks

In [None]:
# generating pairwise correlation
# https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas
corr = df_listing_num.corr()

def get_redundant_pairs(df):
    '''Collect the (column, column) label pairs on and below the diagonal of the correlation matrix'''
    names = df.columns
    return {
        (row_name, col_name)
        for pos, row_name in enumerate(names)
        for col_name in names[:pos + 1]
    }

def get_top_abs_correlations(df, n=5):
    '''Return the n largest absolute pairwise correlations, with the diagonal and lower-triangle pairs removed'''
    abs_corr = df.corr().abs().unstack()
    upper_only = abs_corr.drop(labels=get_redundant_pairs(df))
    ranked = upper_only.sort_values(ascending=False)
    return ranked[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df_listing_num, 10))

In [None]:
# Displaying dataframe as an heatmap 
corr.style.background_gradient()

In [None]:
df_listing_cat_multico=df_listing_cat.drop(multicolinearity_cols,axis=1)
df_listing_cat_multico.head()

In [None]:
## Modelling

In [None]:
# Split into explanatory and response variables
X = df_listing_cat_multico.drop("price", axis=1)
y = df_listing_cat_multico["price"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2.
# For ordinary least squares it does not change the fitted predictions (up to
# numerical precision), so it is dropped rather than replaced.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Predict using the model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# Score using the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

#sklearn.metrics.mean_absolute_percentage_error(y_test, y_test_preds, sample_weight=None, multioutput='uniform_average')

In [None]:
test_score

In [None]:
train_score

In [None]:
def _fit_lm_for_cutoff(X, y, cutoff, test_size, random_state):
    '''Fit a linear model on the columns of X whose column sum exceeds cutoff.'''
    reduce_X = X.iloc[:, np.where(X.sum() > cutoff)[0]]
    X_train, X_test, y_train, y_test = train_test_split(
        reduce_X, y, test_size=test_size, random_state=random_state)
    # `normalize` was removed from LinearRegression in scikit-learn 1.2; for
    # ordinary least squares it does not change the fitted predictions.
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)
    return lm_model, X_train, X_test, y_train, y_test


def find_optimal_lm_mod(X, y, cutoffs, test_size = .30, random_state=42, plot=True):
    '''
    INPUT
    X - pandas dataframe, X matrix
    y - pandas dataframe, response variable
    cutoffs - list of numbers; a column of X is kept only when its column sum
              exceeds the cutoff
    test_size - float between 0 and 1, default 0.3, determines the proportion of data as test data
    random_state - int, default 42, controls random state for train_test_split
    plot - boolean, default True, True to plot r2 against the number of features

    OUTPUT
    r2_scores_test - list of floats of r2 scores on the test data, one per cutoff
    r2_scores_train - list of floats of r2 scores on the train data, one per cutoff
    lm_model - model object from sklearn, fitted with the cutoff that gave the best test r2
    X_train, X_test, y_train, y_test - output from sklearn train test split used for optimal model

    RAISES
    ValueError - if cutoffs is empty
    '''
    r2_scores_test, r2_scores_train, num_feats = [], [], []
    best_fit, best_r2 = None, None

    for cutoff in cutoffs:
        fit = _fit_lm_for_cutoff(X, y, cutoff, test_size, random_state)
        lm_model, X_train, X_test, y_train, y_test = fit
        num_feats.append(X_train.shape[1])

        test_r2 = r2_score(y_test, lm_model.predict(X_test))
        r2_scores_test.append(test_r2)
        r2_scores_train.append(r2_score(y_train, lm_model.predict(X_train)))

        # Keep the fit with the best test r2 (first one wins ties). Reusing it
        # avoids refitting and avoids round-tripping the cutoff through
        # str()/int(), which broke non-integer cutoffs.
        if best_fit is None or test_r2 > best_r2:
            best_fit, best_r2 = fit, test_r2

    if best_fit is None:
        raise ValueError("cutoffs must contain at least one value")

    if plot:
        plt.plot(num_feats, r2_scores_test, label="Test", alpha=.5)
        plt.plot(num_feats, r2_scores_train, label="Train", alpha=.5)
        plt.xlabel('Number of Features')
        plt.ylabel('Rsquared')
        plt.title('Rsquared by Number of Features')
        plt.legend(loc=1)
        plt.show()

    lm_model, X_train, X_test, y_train, y_test = best_fit
    return r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test

In [None]:
r2_scores_test

In [None]:
r2_scores_train

In [None]:
cutoffs = [10000,5000, 3500, 2500, 1000, 100, 50, 30, 25,10,5]
# Capture the results: the function returns them and sets no globals, so without
# this unpacking r2_scores_test / r2_scores_train stay undefined and X_train
# would still hold the split from the previous model.
(r2_scores_test, r2_scores_train, lm_model,
 X_train, X_test, y_train, y_test) = find_optimal_lm_mod(
    X, y, cutoffs, test_size = .30, random_state=42, plot=True)

In [None]:
X_train

In [None]:
r2_scores_train

# start again

In [None]:
df_listing=pd.read_csv("listings.csv")
#df_listing.columns

In [None]:
def display_nans(df,cut_off,display=True):
    '''
    Return (and optionally chart) the share of missing values per column.

    INPUT : df      - dataframe to inspect
            cut_off - fraction; only columns whose share of NaN values is
                      strictly greater than this are kept
            display - when equal to True, draw a bar chart of the result

    OUTPUT: dataframe with the columns 'column' and 'na_per', sorted by
            descending NaN share. The result is also stored in the global df_nan.
    '''
    global df_nan

    # share of NaN per column, highest first
    na_table = (
        (df.isna().sum() / df.shape[0])
        .to_frame()
        .reset_index()
        .rename(columns={'index': 'column', 0: 'na_per'})
        .sort_values('na_per', ascending=False)
        .reset_index(drop=True)
    )

    # keep only the columns above the cut-off
    df_nan = na_table.loc[na_table["na_per"] > cut_off]

    if not (display == True):
        return df_nan

    bar_color = sns.color_palette()[0]
    plt.figure(figsize=(16, 16))
    plt.title('Percentage of missing values by column')
    sns.barplot(data=df_nan, y='column', x='na_per', color=bar_color)
    return df_nan

In [None]:
def clean_df(df,target,exclusion_list=None,dollar_cols=None,bool_cols=None,percentage_cols=None):
    '''
    Clean a listings dataframe and build the two modelling datasets.

    INPUT :
        df              - raw dataframe; every step works on a derived copy
        target          - name of the response column; rows where it is NaN are dropped
        exclusion_list  - columns to drop up front; names not present are ignored
        dollar_cols     - columns holding currency strings such as "$1,250.00"
        bool_cols       - columns holding 't' / 'f' flags
        percentage_cols - columns holding percentage strings such as "96%"

    OUTPUT: tuple (df_listing_num, df_listing_cat); both are also published as
            globals of the same names, which the modelling cells read.
        df_listing_num - numeric columns only, NaN filled with the column mean
        df_listing_cat - all remaining columns with object columns one-hot
                         encoded, NaN filled with the column mean

    Columns with more than 30% NaN are dropped before either dataset is built.
    Columns named in dollar_cols, bool_cols or percentage_cols that are absent
    (for example because they were excluded) are skipped.
    '''
    global df_listing_num, df_listing_cat

    # None defaults avoid sharing one mutable list between calls
    exclusion_list = [] if exclusion_list is None else exclusion_list
    dollar_cols = [] if dollar_cols is None else dollar_cols
    bool_cols = [] if bool_cols is None else bool_cols
    percentage_cols = [] if percentage_cols is None else percentage_cols

    def _fill_mean(col):
        return col.fillna(col.mean())

    # Drop rows where the target has missing values
    df = df.dropna(subset=[target], axis=0)

    # Drop the exclusion list
    df = df.drop(exclusion_list, axis=1, errors="ignore")

    # Currency strings -> float. regex=True is explicit because pandas >= 2.0
    # treats the pattern as a literal string by default.
    for col in dollar_cols:
        try:
            stripped = df[col].str.replace("[$, ]", "", regex=True)
        except (KeyError, AttributeError):
            # column excluded, or not a string column
            continue
        df[col] = stripped.astype("float")

    # Text bool to bool type
    dic = {'t': True, 'f': False}
    for col in bool_cols:
        if col in df.columns:
            df[col] = df[col].replace(dic).astype("bool")

    # Percentage strings -> fraction
    for col in percentage_cols:
        try:
            stripped = df[col].str.replace("[%, ]", "", regex=True)
        except (KeyError, AttributeError):
            continue
        df[col] = stripped.astype("float").div(100)

    # Drop the columns that are more than 30% NaN (no plot)
    nan_cols = display_nans(df, 0.3, False)["column"]
    df = df.drop(nan_cols, axis=1, errors="ignore")

    # Numeric-only dataset, mean-imputed
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_listing_num = df.select_dtypes(include=numerics).apply(_fill_mean, axis=0)

    # Full dataset: one-hot encode every object column, then mean-impute
    cat_cols = df.select_dtypes(include=['object']).columns
    df_listing_cat = df.copy()
    for col in cat_cols:
        dummies = pd.get_dummies(df_listing_cat[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=True)
        df_listing_cat = pd.concat([df_listing_cat.drop(col, axis=1), dummies], axis=1)
    df_listing_cat = df_listing_cat.apply(_fill_mean, axis=0)

    return df_listing_num, df_listing_cat


### Base line Linear Regression Model

In [None]:
baseline_cols=['neighbourhood_group_cleansed','property_type','accommodates',
       'price','number_of_reviews','review_scores_rating']

exclusion_list=df_listing.columns.drop(baseline_cols)

dollar_cols=["cleaning_fee","extra_people","monthly_price","price","security_deposit","weekly_price"]

bool_cols=["host_is_superhost","host_has_profile_pic","host_identity_verified","instant_bookable",
"is_location_exact","require_guest_phone_verification","require_guest_profile_picture"]

percentage_cols=["host_response_rate"]

In [None]:
df_listing_base=df_listing.copy()
clean_df(df_listing_base,"price",exclusion_list,dollar_cols,bool_cols,percentage_cols)

In [None]:
# Split into explanatory and response variables
X = df_listing_cat.drop("price", axis=1)
y = df_listing_cat["price"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2.
# For ordinary least squares it does not change the fitted predictions (up to
# numerical precision), so it is dropped rather than replaced.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Predict using the model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# Score using the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print("test score:",test_score," train score:",train_score)
#sklearn.metrics.mean_absolute_percentage_error(y_test, y_test_preds, sample_weight=None, multioutput='uniform_average')

In [None]:
exclusion_list=["experiences_offered","neighbourhood","neighbourhood_cleansed","city","state","market",
                "smart_location","country_code","country","has_availability","calendar_last_scraped",
                "requires_license","jurisdiction_names","license","host_acceptance_rate","last_review",
                "first_review","calendar_updated","host_neighbourhood","zipcode","bed_type","id",
                "listing_url","scrape_id","last_scraped","thumbnail_url","medium_url","picture_url",
                "xl_picture_url","host_id","host_url","host_name","host_since","host_location",
                "host_thumbnail_url","host_picture_url","host_total_listings_count","latitude",
                "longitude","name","summary","space","description","neighborhood_overview","notes",
                "transit","host_about","street","host_verifications","amenities","host_response_rate",
                "host_is_superhost","host_listings_count","host_has_profile_pic","host_identity_verified",
                "is_location_exact","minimum_nights","maximum_nights"]

dollar_cols=["cleaning_fee","extra_people","monthly_price","price","security_deposit","weekly_price"]

bool_cols=["host_is_superhost","host_has_profile_pic","host_identity_verified","instant_bookable",
"is_location_exact","require_guest_phone_verification","require_guest_profile_picture"]

percentage_cols=["host_response_rate"]

#df_listing,"price",exclusion_list=[],dollar_cols=[],bool_cols=[],percentage_cols=[]

In [None]:
df_listing_original=df_listing.copy()
clean_df(df_listing_original,"price",exclusion_list,dollar_cols,bool_cols,percentage_cols)

In [None]:
df_listing_cat.head()

In [None]:
df_listing_num.head()

In [None]:
#orr = df_listing_original.corr()

def get_redundant_pairs(df):
    '''Collect the (column, column) label pairs on and below the diagonal of the correlation matrix'''
    labels = list(df.columns)
    redundant = set()
    for row_pos, row_label in enumerate(labels):
        # pair this column with itself and with every column before it
        redundant.update((row_label, col_label) for col_label in labels[:row_pos + 1])
    return redundant

def get_top_abs_correlations(df, n=5):
    '''
    Return the n largest absolute pairwise correlations, with the diagonal and
    lower-triangle pairs removed. The full sorted series is also stored in the
    global au_corr.
    '''
    global au_corr
    pairwise = df.corr().abs().unstack()
    au_corr = pairwise.drop(labels=get_redundant_pairs(df)).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df_listing_num, 10))

In [None]:
corr_features=["availability_90","availability_60","beds"]

df_listing_cat=df_listing_cat.drop(corr_features,axis=1)

In [None]:
df_listing_cat.head()

In [None]:
# Split into explanatory and response variables
X = df_listing_cat.drop("price", axis=1)
y = df_listing_cat["price"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2.
# For ordinary least squares it does not change the fitted predictions (up to
# numerical precision), so it is dropped rather than replaced.
lm_model = LinearRegression()  # Instantiate
lm_model.fit(X_train, y_train)  # Fit

# Predict using the model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# Score using the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print("test score:",test_score," train score:",train_score)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Train the random forest regressor.
# criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2. Squared error is the default criterion, so the
# argument is omitted, which works on both old and new releases.
forest = RandomForestRegressor(n_estimators=900,
                               random_state=42,
                               n_jobs=-1,verbose=1)
forest.fit(X_train, y_train.squeeze())

In [None]:
y_train_preds = forest.predict(X_train)
y_test_preds = forest.predict(X_test)

In [None]:
print('Random Forest R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_preds),
        r2_score(y_test, y_test_preds)))

In [None]:
LassoCV

In [None]:
# SVR is not imported in the notebook's import cell, so the bare name raised NameError
from sklearn.svm import SVR
SVR