In [7]:
import pandas as pd
import numpy as np
from decimal import Decimal
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
 
data = pd.read_csv("/Users/ksomes/Downloads/listings.csv") 

#data cleaning from LAB1
#drop redundant info and fields not useful for analysis
sub=data.drop(['id','listing_url','scrape_id','last_scraped','summary','space','description','experiences_offered'
              , 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',
              'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_url', 'host_thumbnail_url',
              'host_picture_url', 'country_code', 'country','amenities', 'minimum_minimum_nights',
              'maximum_minimum_nights','minimum_maximum_nights', 'maximum_maximum_nights','minimum_nights_avg_ntm',
              'maximum_nights_avg_ntm', 'availability_30', 'availability_365','availability_90','has_availability',
               'calculated_host_listings_count','calculated_host_listings_count_shared_rooms',
               'is_business_travel_ready','host_about', 'host_acceptance_rate', 'host_total_listings_count',
              'jurisdiction_names','license','monthly_price','square_feet','weekly_price', 'requires_license'], axis=1)
def money_to_decimal(x):
    x = x.replace("$", "").replace(",", "").replace(" ", "")
    return float(x)
def rem_percent(x):
    x=x.replace("%","")
    return float(x)/100
def truncate(n):
    return int(n * 1000) / 1000
#converts objects with money values into decimal values to become continous attribute
sub.cleaning_fee = sub.cleaning_fee.astype(str)
sub.extra_people = sub.extra_people.astype(str)
sub.security_deposit = sub.security_deposit.astype(str)
sub.price = sub.price.astype(str)
sub.loc[:,'price'] = sub.loc[:,'price'].apply(money_to_decimal)
sub.loc[:,'cleaning_fee'] = sub.loc[:,'cleaning_fee'].apply(money_to_decimal)
sub.loc[:,'extra_people'] = sub.loc[:,'extra_people'].apply(money_to_decimal)
sub.loc[:,'security_deposit'] = sub.loc[:,'security_deposit'].apply(money_to_decimal)

#imputations
sub['price']=sub.price.mask(sub.price == 0,sub.price.median())
sub.cleaning_fee=sub.cleaning_fee.fillna(sub.cleaning_fee.median())
sub.first_review=sub.first_review.fillna('2019-08-01')
sub['first_review'] =  pd.to_datetime(sub['first_review'],
                              format='%Y-%m-%d')
sub.host_response_rate = sub.host_response_rate.astype(str)
sub.loc[:,'host_response_rate'] = sub.loc[:, 'host_response_rate'].apply(rem_percent)
sub.host_response_rate=sub.host_response_rate.fillna(sub.host_response_rate.median())
sub['host_since'] =  pd.to_datetime(sub['host_since'],
                              format='%Y-%m-%d')
sub.last_review=sub.last_review.fillna('2019-08-01')
sub['last_review'] =  pd.to_datetime(sub['last_review'],
                              format='%Y-%m-%d')
sub.review_scores_accuracy=sub.review_scores_accuracy.fillna(truncate(sub.review_scores_accuracy.median()))
sub.review_scores_checkin=sub.review_scores_checkin.fillna(truncate(sub.review_scores_checkin.median()))
sub.review_scores_cleanliness=sub.review_scores_cleanliness.fillna(truncate(sub.review_scores_cleanliness.median()))
sub.review_scores_communication=sub.review_scores_communication.fillna(truncate(sub.review_scores_communication.median()))
sub.review_scores_location=sub.review_scores_location.fillna(truncate(sub.review_scores_location.median()))
#sub.review_scores_rating=sub.review_scores_rating.fillna(truncate(sub.review_scores_rating.median()))
sub.review_scores_value=sub.review_scores_value.fillna(truncate(sub.review_scores_value.median()))
sub.reviews_per_month=sub.reviews_per_month.fillna(sub.reviews_per_month.median())
sub.security_deposit=sub.security_deposit.fillna(sub.security_deposit.median())


In [54]:
df = sub[~sub['review_scores_rating'].isnull()]
df['perf_score'] = np.where(df['review_scores_rating']==100, 1, 0)
df.perf_score.sum()
df_data=df.drop('perf_score', axis=1)
df['zipcode'] = df['zipcode'].str.extract('(\d+)', expand=False)
df['zipcode'] = df['zipcode'].astype(float)
df_y=df['perf_score']

#create dummy vars
host_loc = pd.get_dummies(df_data['host_location'],drop_first=True)
host_response = pd.get_dummies(df_data['host_response_time'],drop_first=True)
host_neigh = pd.get_dummies(df_data['host_neighbourhood'],drop_first=True)
host_verif = pd.get_dummies(df_data['host_verifications'],drop_first=True)
host_ident = pd.get_dummies(df_data['host_identity_verified'],drop_first=True)
street = pd.get_dummies(df_data['street'],drop_first=True)
neighborhood = pd.get_dummies(df_data['neighbourhood'],drop_first=True)
city = pd.get_dummies(df_data['city'],drop_first=True)
# make into continuous zipcode = pd.get_dummies(x_train['zipcode'],drop_first=True)
market = pd.get_dummies(df_data['market'],drop_first=True)
loc_exact = pd.get_dummies(df_data['is_location_exact'],drop_first=True)
prop_type = pd.get_dummies(df_data['property_type'],drop_first=True)
room_type = pd.get_dummies(df_data['room_type'],drop_first=True)
bed_type = pd.get_dummies(df_data['bed_type'],drop_first=True)
instant = pd.get_dummies(df_data['instant_bookable'],drop_first=True)
cancel = pd.get_dummies(df_data['cancellation_policy'],drop_first=True)

df_data.drop(['host_location','host_response_time','host_neighbourhood','host_verifications',
             'host_identity_verified', 'street', 'neighbourhood', 'city', 'market', 
             'is_location_exact', 'property_type', 'room_type', 'bed_type', 'instant_bookable',
             'cancellation_policy', 'name', 'host_name', 'host_has_profile_pic', 'neighbourhood_cleansed',
             'neighbourhood_group_cleansed', 'smart_location', 'calendar_updated',
             'calendar_last_scraped','require_guest_profile_picture', 'require_guest_phone_verification',
             'host_since', 'first_review', 'last_review', 'state', 'smart_location'],axis=1,inplace=True)

df_data = pd.concat([df_data,host_loc, host_response, host_neigh, host_verif, host_ident,
                    street, neighborhood, city, market, loc_exact, prop_type, room_type,
                    bed_type, instant, cancel],axis=1)

x_train, x_test, y_train, y_test = train_test_split(df_data, df_y, test_size=0.2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [48]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 37760 entries, 0 to 48745
Data columns (total 61 columns):
name                                            37754 non-null object
host_id                                         37760 non-null int64
host_name                                       37746 non-null object
host_since                                      37746 non-null datetime64[ns]
host_location                                   37660 non-null object
host_response_time                              27160 non-null object
host_response_rate                              37760 non-null float64
host_is_superhost                               37746 non-null object
host_neighbourhood                              32903 non-null object
host_listings_count                             37746 non-null float64
host_verifications                              37760 non-null object
host_has_profile_pic                            37746 non-null object
host_identity_verified                    

In [55]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
print(lr.coef_)
print(lr.intercept_)



ValueError: could not convert string to float: 'f'

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30208 entries, 31393 to 476
Columns: 2594 entries, host_id to super_strict_60
dtypes: datetime64[ns](3), float64(19), int64(10), object(3), uint8(2559)
memory usage: 82.0+ MB


0    10018
2    11238
3    10029
4    10016
5    11216
Name: zipcode, dtype: object