In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import date

import string

from nltk.probability import FreqDist
from nltk.util import ngrams
from collections import Counter

import statsmodels.api as sm

In [None]:
#SET OUT THE QUESTIONS OF INTEREST
"""
1) Is there a difference in the average price per night of Airbnb's in Seattle and Boston?
2) What are people saying about their stays in Boston and Seattle?
3) When it comes to price, is it really all about location, location, location? Are downtown properties that much more expensive than those further out?
4) What are the most important factors in deciding the price per night of an Airbnb and is city one of them?

"""

In [None]:
# INTRODUCE THE DATA

In [None]:
# Read in the Seattle data. Edit SEATTLE_DATA_DIR (and nothing else in this cell)
# so that it points at the folder holding the three Seattle CSV files on your machine.
SEATTLE_DATA_DIR = 'D:\\lori_python\\UDACITY\\Lesson 1 Intro to DS\\Project 1 blog post\\Data\\Seattle data\\'

Sea_calendar = pd.read_csv(SEATTLE_DATA_DIR + 'calendar.csv')
Sea_listings = pd.read_csv(SEATTLE_DATA_DIR + 'listings.csv')
Sea_reviews = pd.read_csv(SEATTLE_DATA_DIR + 'reviews.csv')

In [None]:
# Read in the Boston data. Edit BOSTON_DATA_DIR (and nothing else in this cell)
# so that it points at the folder holding the three Boston CSV files on your machine.
BOSTON_DATA_DIR = 'D:\\lori_python\\UDACITY\\Lesson 1 Intro to DS\\Project 1 blog post\\Data\\Boston data\\'

Bos_calendar = pd.read_csv(BOSTON_DATA_DIR + 'calendar.csv')
Bos_listings = pd.read_csv(BOSTON_DATA_DIR + 'listings.csv')
Bos_reviews = pd.read_csv(BOSTON_DATA_DIR + 'reviews.csv')

In [None]:
pd.set_option('display.max_columns', 200)
Sea_listings.head()

In [None]:
Sea_listings.columns

In [None]:
Bos_listings.head()

In [None]:
# Let's get some idea of when the data is from and whether the different data sets all refer to the same pool of properties.
# A notebook cell only renders its final expression, so both counts are passed to display()
# explicitly; otherwise the Seattle result is computed and then thrown away.
display(
    Sea_listings['last_scraped'].value_counts(),
    Bos_listings['last_scraped'].value_counts(),
)

In [None]:

# Earliest and latest Seattle review dates in one result, so both values are shown
# (a cell only renders its last expression, so a separate .min() line would be hidden).
Sea_reviews['date'].agg(['min', 'max'])

In [None]:
# Earliest and latest Boston review dates in one result, so both values are shown
# (a cell only renders its last expression, so a separate .min() line would be hidden).
Bos_reviews['date'].agg(['min', 'max'])

In [None]:
#Let's check that the properties in the reviews data sets are the same as in the listings data
Sea_ser1 = Sea_reviews['listing_id']
Sea_ser2 = Sea_listings['id']
Intersect = set(Sea_ser1) & set(Sea_ser2)
print(len(Intersect))

Sea_reviews['listing_id'].nunique() # From this we can conclude that all Seattle reviews relate to properties in the listings data set


In [None]:
Bos_ser1 = Bos_reviews['listing_id']
Bos_ser2 = Bos_listings['id']
BIntersect = set(Bos_ser1) & set(Bos_ser2)
print(len(BIntersect))

Bos_reviews['listing_id'].nunique() # Again can conclude properties are the same in the two data sets

In [None]:
# SECTION 1
# In this part of the notebook I work with the text data in the 'reviews' datasets. The aim here is to understand what people are saying about their stays in each city by finding common words and phrases used in reviews

In [None]:
len(Sea_reviews)

In [None]:
len(Bos_reviews)

In [None]:
np.sum(Sea_reviews.isnull())

In [None]:
np.sum(Bos_reviews.isnull())

In [None]:
Sea_reviews = Sea_reviews.dropna()
Bos_reviews= Bos_reviews.dropna()

In [None]:
# Reviews often include automated postings that are generated when a reservation is cancelled. Let's remove these.
substring = ["automated posting"]
Sea_reviews = Sea_reviews[~Sea_reviews.comments.str.contains('|'.join(substring))]
print(len(Sea_reviews))

In [None]:
Bos_reviews = Bos_reviews[~Bos_reviews.comments.str.contains('|'.join(substring))]
print(len(Bos_reviews))

In [None]:
Sea_reviews.head()

In [None]:
Bos_reviews.head()

In [None]:
Sea_reviews['comments'] = Sea_reviews['comments'].str.lower()
Bos_reviews['comments'] = Bos_reviews['comments'].str.lower()

In [None]:
""" This function takes a text input and prepares it for analysis

Args - text (str): This is a string or collection of strings

Returns - text (str): This is the original text with stopwords removed and word lemmatized


 """

# Cache for the objects clean_text needs on every call. Loading the stop-word list and
# creating a lemmatizer per review is wasted work when the function is applied to a whole column.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Build the punctuation table, stop-word set and lemmatizer once and reuse them."""
    if not _CLEAN_TEXT_RESOURCES:
        punctuation_table = str.maketrans('', '', string.punctuation)
        stop_words = set(stopwords.words('english'))
        # Punctuation is stripped from the text *before* stop words are filtered, so a
        # stop word such as "don't" arrives as "dont". Store the stripped forms as well,
        # otherwise those stop words are never removed.
        stop_words |= {word.translate(punctuation_table) for word in stop_words}
        _CLEAN_TEXT_RESOURCES['punctuation_table'] = punctuation_table
        _CLEAN_TEXT_RESOURCES['stop_words'] = stop_words
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _CLEAN_TEXT_RESOURCES['punctuation_table'],
        _CLEAN_TEXT_RESOURCES['stop_words'],
        _CLEAN_TEXT_RESOURCES['lemmatizer'],
    )


def clean_text(text):
    """Prepare one piece of review text for word and phrase counting.

    Args:
        text (str): The raw text of a single review.

    Returns:
        str: The text with ASCII punctuation removed, lower-cased, tokenized,
        English stop words dropped and the remaining tokens lemmatized,
        joined back together with single spaces.
    """
    punctuation_table, stop_words, lemmatizer = _clean_text_resources()

    # Remove punctuation, then split the lower-cased text into tokens
    tokens = word_tokenize(text.translate(punctuation_table).lower())

    # Drop stop words and lemmatize what is left
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

In [None]:
#Preprocessing

In [None]:
Cleaned_SText = Sea_reviews['comments'].apply(clean_text)
S_reviews = pd.DataFrame({'cleaned text': Cleaned_SText})

S_reviews.head()

In [None]:
# Here I tokenize each review and collect all of the tokens into one list, before calculating the most common words in that list
all_tokens = []
for review in S_reviews['cleaned text']:
    tokens = nltk.word_tokenize(review)
    all_tokens.extend(tokens)

S_fd = FreqDist(all_tokens)
S_fd.most_common(10)

In [None]:
Seattle_Words = S_fd.most_common(10)

In [None]:
# Here I find the most common two-word phrases in the 'all_tokens' list. In theory the phrases could be of any length; however, two-word phrases (bigrams) are generally considered to be the most informative
S_phrases = Counter(list(ngrams(all_tokens, 2)))
S_phrases.most_common(5)

In [None]:
# Repeat the process for Boston properties

In [None]:
Cleaned_BText = Bos_reviews['comments'].apply(clean_text)
B_reviews = pd.DataFrame({'cleaned text': Cleaned_BText})

In [None]:
all_Bostokens = []
for review in B_reviews['cleaned text']:
    tokens = nltk.word_tokenize(review)
    all_Bostokens.extend(tokens)

B_fd = FreqDist(all_Bostokens)
B_fd.most_common(10)

In [None]:
Boston_Words = B_fd.most_common(10)

In [None]:
B_Phrases = Counter(list(ngrams(all_Bostokens,2)))
B_Phrases.most_common(5)

In [None]:
#Bring it all together

In [None]:
Common_Words = pd.DataFrame({'Seattle': Seattle_Words})

In [None]:
Common_Words['Boston'] = Boston_Words

In [None]:
Common_Words = Common_Words.applymap(lambda x: str(x).replace("(", "").replace(")", ""))

In [None]:
Common_Words.style.hide(axis='index')

In [None]:
import plotly.figure_factory as ff
fig =  ff.create_table(Common_Words)
fig.update_layout(
    title={
        'text': "Most common words in reviews, by city",
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20}
    },
    margin=dict(l=50, r=50, t=50, b=50),
    width=500,
    height=550,)

fig.write_image("FinalCommonWords.png", scale=2)
fig.show()

In [None]:
Phrases = pd.DataFrame({'Seattle': S_phrases.most_common(5)})

In [None]:
Phrases['Boston'] = B_Phrases.most_common(5)

In [None]:
"""As it stands a figure made with the Phrases data will put quote marks around every word, whereas we want them around the phrase. 
This function removes quote marks after the first word and before the second word as well as the comma inbetween them.

Args - text (str): this can string data

Returns = text (str): This is the original text with the unwanted quote marks removed


"""


def clean_text(text):
    """Format one ((word, word), count) cell as "'word word', count" for the phrases table.

    NOTE(review): this definition re-uses the name `clean_text`, so from this cell
    onwards it replaces the review-cleaning function defined earlier in the notebook.
    Re-running the earlier preprocessing cells after this one will call this version.

    Args:
        text: Either a ((first, second), count) tuple, or its string
            representation such as "(('great', 'location'), 12)".

    Returns:
        str: The two words inside one pair of quote marks, then a comma and the count.
    """
    # Tuples are formatted directly. Going through str() and stripping characters
    # would mangle any word that itself contains a quote, a parenthesis or ", ".
    if (
        isinstance(text, tuple)
        and len(text) == 2
        and isinstance(text[0], tuple)
        and len(text[0]) == 2
    ):
        (first, second), count = text
        return f"'{first} {second}', {count}"

    # Fallback for anything else: strip quote marks and parentheses from the string form,
    # split on ", " and rebuild with quotes around the phrase only.
    parts = str(text).replace("'", "").replace("(", "").replace(")", "").split(", ")
    return f"'{parts[0]} {parts[1]}'" + ", " + parts[2]

In [None]:
Phrases = Phrases.applymap(clean_text)

In [None]:

fig = ff.create_table(Phrases)
fig.update_layout(
    title={
        'text': "Most common phrases in reviews, by city",
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20}
    },
    margin=dict(l=50, r=50, t=50, b=50),
    width=550,
    height=350,
)
fig.write_image("FinalReviewPhrases.png", scale=2)
fig.show()

In [None]:
#SECTION 2

# This section of the notebook plots the properties on a map of their respective cities, with colour coding to show price band.  
#Before doing this the price data needs to be tidied up, as there are non-numeric characters.

In [None]:

# Strip thousands separators, dollar signs and spaces so the price can be cast to a number.
# regex=False makes every pattern a literal character. As a regular expression '$' means
# "end of string" and would leave the dollar sign in place, and the default for `regex`
# differs between pandas versions.
for symbol in (',', '$', ' '):
    Sea_listings['price'] = Sea_listings['price'].str.replace(symbol, '', regex=False)

Sea_listings['price'].head()


In [None]:

Sea_listings['price'] = Sea_listings['price'].astype('float')


In [None]:

(np.sum(Sea_listings['price'].isnull()) / len(Sea_listings['price'])) * 100 # % missing values


In [None]:

Sea_listings['price'].min()

In [None]:

Sea_listings['price'].max()

In [None]:

Sea_listings['price'].mean()

In [None]:

Sea_listings['price_range($)'] = Sea_listings['price'].apply(lambda x:'<50' if x < 50 else '50-100' if x <100 else '100-150' if x <150 
                                                 else '150-200' if x < 200 else '200-250' if x < 250 else '250-300' if x <300
                                                     else '>300')

In [None]:
                                                    
Sea_listings['price_range($)'].value_counts()


In [None]:
plt.figure(figsize=(16,12))
sns.scatterplot(x='longitude', y='latitude', data=Sea_listings,
                hue='price_range($)', hue_order = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '>300'], 
                palette='viridis').set(title="Map of Seattle Airbnb listings")
#plt.savefig('Seattle_Map2.png')


In [None]:
# Let's take the count for each neighbourhood to help with interpreting the map

Sea_listings['neighbourhood'].value_counts()


In [None]:
# Boston map: first tidy the price column in the same way as for Seattle.
# regex=False makes every pattern a literal character. As a regular expression '$' means
# "end of string" and would leave the dollar sign in place, and the default for `regex`
# differs between pandas versions.
for symbol in (',', '$', ' '):
    Bos_listings['price'] = Bos_listings['price'].str.replace(symbol, '', regex=False)

Bos_listings['price'] = Bos_listings['price'].astype('float')


In [None]:
(np.sum(Bos_listings['price'].isnull()) / len(Bos_listings['price'])) * 100 # % missing values

In [None]:
Bos_listings['price'].min()

In [None]:
Bos_listings['price'].max()

In [None]:
Bos_listings['price'].mean()

In [None]:
Bos_listings['price_range($)'] = Bos_listings['price'].apply(lambda x:'<50' if x < 50 else '50-100' if x <100 else '100-150' if x <150 
                                                 else '150-200' if x < 200 else '200-250' if x < 250 else '250-300' if x <300
                                                     else '>300')

In [None]:
# Scatter the Boston listings by longitude/latitude, coloured by price band.
plt.figure(figsize=(16,12))
sns.scatterplot(x='longitude', y='latitude', data=Bos_listings,
                hue='price_range($)', hue_order = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '>300'], 
                palette='viridis').set(title="Map of Boston Airbnb listings")  # title previously said "Seattle"
# plt.savefig('Boston_Map6.png')

In [None]:
Bos_listings['neighbourhood'].value_counts()
#In Seattle the downtown doesn't command the same premium as in Boston as the lighter dots are spread out, whereas in Boston anything that isn't central is a darker colour

In [None]:
# SECTION 3
# What factors determine the price of stays, is there a difference between the two cities? This section of code will provide a regression model to explore this question

In [None]:
Sea_listings.shape

In [None]:
Bos_listings.shape

In [None]:
a = np.setdiff1d(Sea_listings.columns, Bos_listings.columns)
print (a) # Columns that are in Seattle data but not Boston

In [None]:
b = np.setdiff1d(Bos_listings.columns, Sea_listings.columns)
print (b) # Columns that are in Boston data but not Seattle

In [None]:
Bos_listings = Bos_listings.drop(['access', 'house_rules', 'interaction'], axis=1)

In [None]:
Bos_listings.shape

In [None]:
Sea_listings['Boston_City'] = 0

In [None]:
Sea_listings['Boston_City'].value_counts()

In [None]:
Bos_listings['Boston_City'] = 1

In [None]:
Bos_listings['Boston_City'].value_counts()

In [None]:
# To run a regression we want to combine these into one data set.
# Before doing that, let's check the distributions of price and remove outliers

In [None]:
plt.hist(Sea_listings['price'])

In [None]:
Sea_listings['price'].mean()

In [None]:
Sea_listings['price'].max() # Even with fairly large bins we have some massive outliers

In [None]:
len(Sea_listings['price'])

In [None]:
#Let's remove the highest 10% of values, given the strong rightward skew
3818 * 0.9 # Find out where the 90 percentile would be

In [None]:
# Keep the cheapest 90% of listings. The row count is derived from the data rather than
# typed in by hand (3436 for the 3818-row extract), so the cell stays correct if the data changes.
# The assignment aligns on the index, so the rows that were cut off get NaN in 'clean_price'.
sea_keep_count = int(len(Sea_listings) * 0.9 + 0.5)  # round half up
Sea_listings['clean_price'] = Sea_listings['price'].sort_values().iloc[:sea_keep_count]

In [None]:
plt.hist(Sea_listings['clean_price'])# there is a little bit of skew, but this looks a lot more like a normal distribution

In [None]:
#Now for boston data

In [None]:
plt.hist(Bos_listings['price'])

In [None]:
Bos_listings['price'].mean()

In [None]:
Bos_listings['price'].max() # Max value is nowhere near where the data is clustered!

In [None]:
len(Bos_listings)

In [None]:
3585 * 0.9

In [None]:
# Keep the cheapest 90% of listings. The row count is derived from the data rather than
# typed in by hand (3227 for the 3585-row extract), so the cell stays correct if the data changes.
# The assignment aligns on the index, so the rows that were cut off get NaN in 'clean_price'.
bos_keep_count = int(len(Bos_listings) * 0.9 + 0.5)  # round half up
Bos_listings['clean_price'] = Bos_listings['price'].sort_values().iloc[:bos_keep_count]

In [None]:
plt.hist(Bos_listings['clean_price']) #Still skewed but nowhere near as extreme

In [None]:
#Let's compare average prices per night now outliers have been removed

In [None]:
Bos_listings['clean_price'].mean()

In [None]:
Sea_listings['clean_price'].mean()

In [None]:
#Let's also take a look at average review scores

In [None]:
Bos_listings['review_scores_rating'].mean()

In [None]:
Sea_listings['review_scores_rating'].mean()

In [None]:
#From all this we can see that with outliers removed Boston has a higher average price per night than Seattle, despite lower average review scores

In [None]:
#Combine the two data sets into one

In [None]:
# Stack the two cities into one frame with a fresh 0..n-1 index.
# Both inputs are indexed from 0, so keeping the original labels would give duplicate
# index values. The dummy-variable frames built later are re-indexed 0..n-1 and joined
# on the index, which would then attach Seattle's dummies to the Boston rows.
# pd.concat also replaces DataFrame.append, which is deprecated and removed in pandas 2.
listings = pd.concat([Sea_listings, Bos_listings], ignore_index=True)

In [None]:
listings.shape

In [None]:
listings.info() #Why is licence in here? Most variable have decent observations.
#Let's pick variables of interest out from the below for modelling, as not all of these are relevant to price

In [None]:
#Before dropping id data, let's see how many unique id's there are
listings['id'].nunique()
#Looks like all rows are unique properties

In [None]:
# Let's only keep variables of interest
listings = listings[['clean_price', 'host_is_superhost', 'room_type', 'bed_type', 'experiences_offered', 'host_has_profile_pic', 'Boston_City', 'is_location_exact', 'property_type','bathrooms', 'bedrooms', 'beds', 'security_deposit', 'cleaning_fee', 'minimum_nights', 'maximum_nights', 'availability_30', 'number_of_reviews', 'review_scores_rating', 'instant_bookable', 'cancellation_policy', 'amenities' ]]

In [None]:
listings.info()

In [None]:
listings.head()

In [None]:
listings['experiences_offered'].value_counts() # looks like a dudd

In [None]:
listings = listings.drop(['experiences_offered'], axis=1)

In [None]:
listings.shape

In [None]:
#Let's start tidying up the data.
# Have binary variables that need to be 1/0, text that needs to be encoded as a scale, punctuation to remove and missing values to impute.

In [None]:
# identify the unique elements in the 'amenities' column and create dummy variables for each 

In [None]:
# Split each listing's amenities string on commas (rows with no amenities value are skipped).
ammen = listings['amenities'].dropna().str.split(",")

# Flatten to a single list of amenity names, removing the braces and quote marks
# that surround entries in the raw string.
flat_list = [
    item.replace('{', '').replace('}', '').replace('"', '').replace("'", '')
    for sublist in ammen
    for item in sublist
]

# The distinct amenity names across both cities
unique_amens = set(flat_list)

In [None]:
unique_amens

In [None]:

# Drop placeholder entries that are not real amenities. Set difference (unlike .remove)
# does not raise if an entry is already absent, so the cell can be re-run safely.
unique_amens -= {
    'translation missing: en.hosting_amenity_49',
    'translation missing: en.hosting_amenity_50',
    '',
}

In [None]:

unique_amens

In [None]:

# Remove the braces and quote marks from the raw amenities string BEFORE building dummies.
# Cleaning only the quote marks in the column names afterwards leaves the first and last
# amenity of every listing as '{Name' / 'Name}'; those columns do not match any cleaned
# amenity name and are dropped, so that amenity is lost for the listing.
amenities_clean = listings['amenities'].str.replace(r'[{}"\']', '', regex=True)

dumms = amenities_clean.str.get_dummies(sep = ",")

dumms


In [None]:

amen_dums = dumms.drop(columns=[col for col in dumms.columns if col not in unique_amens])

amen_dums

In [None]:
amen_dums = amen_dums.reset_index(drop=True)

In [None]:
listings = listings.join(amen_dums)

In [None]:
# Let's get value counts for all of the dummy variables
amen_dums.apply(pd.Series.value_counts)

In [None]:
#Some variables have almost no variation in the data

In [None]:
listings = listings.replace(',','', regex=True)

In [None]:
listings = listings.replace('&','', regex=True)

In [None]:
# With regex=True an unescaped '$' is the end-of-string anchor, so it matches nothing
# visible and no dollar sign is removed. Escape it to target the literal character.
listings = listings.replace(r'\$', '', regex=True)

In [None]:
listings.shape

In [None]:
""" This variable will take a variable encoded as 't' or 'f' and return it as '1' or '0'

Args - col(str): A variable encoded as 't' or 'f' to denote 'true' or 'false'
       
Returns - col(Int): Returns col encoded as '1' in place of 't' and '0' instead of 'f'
"""

def binary(col):
    """Recode a 't'/'f' flag column as 1/0.

    Args:
        col: A column (pandas Series) whose values are 't' or 'f'.

    Returns:
        The result of mapping 't' to 1 and 'f' to 0 over `col`; values that are
        neither 't' nor 'f' come back as missing.
    """
    flag_codes = {'t': 1, 'f': 0}
    return col.map(flag_codes)

In [None]:
listings[['instant_bookable', 'host_is_superhost', 'host_has_profile_pic', 'is_location_exact']]=listings[['instant_bookable', 'host_is_superhost', 'host_has_profile_pic',  'is_location_exact']].apply(binary)

In [None]:
listings.head()

In [None]:
listings['property_type'].value_counts()

In [None]:
# Encode property type on a rough size scale: houses/townhouses = 2,
# apartments/condominiums = 1, everything else = 0.
# The previous comparison was against ' Apartment' (leading space), which never matched,
# so every apartment was coded 0.
property_type_scale = {'House': 2, 'Townhouse': 2, 'Apartment': 1, 'Condominium': 1}
listings['property_type'] = listings['property_type'].apply(lambda x: property_type_scale.get(x, 0))

In [None]:
listings['cancellation_policy'].value_counts()

In [None]:
listings['cancellation_policy'] = listings['cancellation_policy'].apply(lambda x: 3 if x =='super_strict_30' else 2 if x =='strict'
                                                        else 1 if x=='moderate' else 0)

In [None]:
listings.head()

In [None]:
listings['room_type'].value_counts()

In [None]:
room = pd.get_dummies(listings['room_type'], prefix='room', drop_first=True)

In [None]:
room = room.reset_index(drop=True)

In [None]:
listings = listings.join(room)

In [None]:
listings['bed_type'].value_counts()

In [None]:
bed = pd.get_dummies(listings['bed_type'], prefix ='bed', drop_first=True)

In [None]:
bed = bed.reset_index(drop=True)

In [None]:
listings = listings.join(bed)

In [None]:
listings.head()

In [None]:
listings.info()

In [None]:
listings['cleaning_fee'].value_counts()

In [None]:
# Security deposit and cleaning fee still have dollar signs. They need to be converted to a numeric type before missing data can be filled

In [None]:
# '$' must be matched as a literal character; as a regular expression it is the
# end-of-string anchor and would leave the dollar sign in place.
listings['security_deposit']= listings['security_deposit'].str.replace('$','', regex=False)

In [None]:
# '$' must be matched as a literal character; as a regular expression it is the
# end-of-string anchor and would leave the dollar sign in place.
listings['cleaning_fee']= listings['cleaning_fee'].str.replace('$','', regex=False)

In [None]:
listings['security_deposit'] = listings['security_deposit'].astype('float')

In [None]:
listings['cleaning_fee'] = listings['cleaning_fee'].astype('float')

In [None]:
# Take the average 'security_deposit' and 'cleaning_fee' for each number of bedrooms.
# These lookup tables feed the functions below that fill in missing values.
# The column is selected before .mean() so that only that column is averaged; averaging
# the whole frame first also tries to average the non-numeric columns.
# dropna=False keeps a group for listings whose 'bedrooms' value is missing.
impute_mean_deposit = listings.groupby('bedrooms', dropna=False)['security_deposit'].mean()
impute_mean_cleaning = listings.groupby('bedrooms', dropna=False)['cleaning_fee'].mean()

In [None]:
""" This function is used to impute missing values for the 'security_deposit' variable, using the average value of 'security_deposit' for each value of the 'bedrooms' variable

Args - bedrooms (float): This is the 'bedrooms' varibale
       deposit (float): This is the security deposit variable

Returns - fee (float): This is the security deposit variable with missing values imputed

Example - if the average security deposit for a three bedroom property is $20, then for a 3 bedroom property with missing data in the security deposit variable the function will impute a value of $20
"""

def impute_deposit (bedrooms, deposit):
    """Return `deposit`, or the bedroom-group average when `deposit` is missing.

    Args:
        bedrooms: The listing's 'bedrooms' value, used as the key into impute_mean_deposit.
        deposit (float): The listing's security deposit; NaN means missing.

    Returns:
        `deposit` unchanged when it is not NaN, otherwise impute_mean_deposit[bedrooms].
    """
    # A value that is present is passed straight through
    if not np.isnan(deposit):
        return deposit
    return impute_mean_deposit[bedrooms]

In [None]:
listings['security_deposit'] = listings.apply(lambda x: impute_deposit(x['bedrooms'], x['security_deposit']), axis=1)

In [None]:
""" This function is used to impute missing values for the 'cleaning_fee' variable, using the average value of 'cleaning_fee' for each value of the 'bedrooms' variable

Args - bedrooms (float): This is the 'bedrooms' varibale
       fee (float): This is the cleaning fee variable

Returns - fee (float): This is the cleaning fee variable with missing values imputed

Example - if the average cleaning fee for a three bedroom property is $20, then for a 3 bedroom property with missing data in the cleaning fee variable the function will impute a value of $20
"""

def impute_cleaning (bedrooms,fee):
    """Return `fee`, or the bedroom-group average when `fee` is missing.

    Args:
        bedrooms: The listing's 'bedrooms' value, used as the key into impute_mean_cleaning.
        fee (float): The listing's cleaning fee; NaN means missing.

    Returns:
        `fee` unchanged when it is not NaN, otherwise impute_mean_cleaning[bedrooms].
    """
    # A value that is present is passed straight through
    if not np.isnan(fee):
        return fee
    return impute_mean_cleaning[bedrooms]

In [None]:
listings['cleaning_fee'] = listings.apply(lambda x: impute_cleaning(x['bedrooms'], x['cleaning_fee']), axis=1)

In [None]:
listings.head()

In [None]:
listings.info()

In [None]:
listings = listings.drop(['room_type', 'bed_type', 'amenities'], axis=1)

In [None]:
listings = listings.dropna()

In [None]:
listings.shape

In [None]:
listings.info() # consistent observations across all variables

In [None]:
Y = listings['clean_price']
X = listings.drop(['clean_price'], axis=1)
X = sm.add_constant(X)

model = sm.OLS(Y.astype('float'), X.astype('float')).fit()

In [None]:
print(model.summary())

In [None]:
# Note possible multicollinearity - let's try some other iterations of model and look out for improvements / consistency

In [None]:
#Firstly let's check out Variance Inflation Factors

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

In [None]:
pd.set_option('display.max_rows', None)
vif_data

In [None]:
# Based on the Variance Inflation Factors, we can remove some of the variables with the highest multicollinearity
X2 = X.drop(['beds', 'Hair Dryer', '24-Hour Check-in', 'Dryer', 'Laptop Friendly Workspace',  'Pets live on this property', 'Dog(s)', 'Washer', 'Iron', 'Hangers', 'Shampoo', 'bed_Futon'], axis=1)
X2 = sm.add_constant(X2)

model2 = sm.OLS(Y.astype('float'), X2.astype('float')).fit()

In [None]:
print(model2.summary())

In [None]:
second_vif = pd.DataFrame()
second_vif["feature"] = X2.columns
second_vif["VIF"] = [variance_inflation_factor(X2.values, i) for i in range(len(X2.columns))]

In [None]:
second_vif

In [None]:
# Lets try removing variables that are insignificant or could be apriori problematic
# Also, given the high VIF for the constant, let's try removing it
X2 = X.drop(['const','beds', 'Hair Dryer', '24-Hour Check-in', 'Dryer', 'Laptop Friendly Workspace',  'Pets live on this property', 'Dog(s)', 'Washer', 'Iron', 'Hangers', 'Shampoo', 'bed_Futon'], axis=1)
no_cons = sm.OLS(Y.astype('float'), X2.astype('float')).fit()

In [None]:
print(no_cons.summary())

In [None]:
nocons_vif = pd.DataFrame()
nocons_vif["feature"] = X2.columns
nocons_vif['VIF'] = [variance_inflation_factor(X2.values, i) for i in range (len(X2.columns))]

In [None]:
nocons_vif

In [None]:
#Let's try dropping the insignificant variables
X5 = X2.drop(['bathrooms', 'security_deposit', 'maximum_nights', 'Buzzer/Wireless Intercom', 'Carbon Monoxide Detector',
             'Cable TV', 'First Aid Kit', 'Fire Extinguisher', 'Free Parking on Premises', 'Hot Tub', 'Indoor Fireplace',
             'Lock on Bedroom Door', 'Other pet(s)', 'Safety Card', 'Smoke Detector', 'Smoking Allowed', 'Suitable for Events',
             'Wheelchair Accessible', 'Breakfast', 'Cat(s)', 'Gym', 'Heating', 'Internet', 'Kitchen', 'Pool', 'room_Private room',
             'room_Shared room', 'bed_Couch', 'bed_Pull-out Sofa', 'bed_Real Bed'], axis=1)

In [None]:
no_cons2 = sm.OLS(Y.astype('float'), X5.astype('float')).fit()

In [None]:
print(no_cons2.summary())