In [1]:
import pandas as pd

## Clean the Dataset
1. Cleaned the dataset using excel already.
    - Deleted columns with information of no use
    - Used city as "Melbourne" only
    - Deleted special characters from "amenities" column and replaced spaces with "_"; translation missing: word also deleted; replaced "," with spaces.
    
2. Using python: 
    - Read the clean file.
    - Created column to count amenities.
    - Filled NA values with "0" in beds, bathrooms.
    - Filled NA values with "None" in columns Summary, Description, house_rules.
    

In [2]:
def clean():
    df=pd.read_csv("testing.csv")
# Cleaned data
    df = df[df['amenities'].notna()]
    df["count_amenities"]=0
    for i in range(0,len(df)):
        try:
            df["count_amenities"][i]=len(df["amenities"][i].split())
        except:
            df["count_amenities"][i]=0
    df.drop(["review_scores_rating","review_scores_accuracy"],axis=1,inplace=True)
    df["beds"].fillna(value=0,inplace=True)
    df["bathrooms"].fillna(value=0,inplace=True)
    df["summary"].fillna(value="None",inplace=True)
    df["description"].fillna(value="None",inplace=True)
    df["house_rules"].fillna(value="None",inplace=True)
    df.to_csv("data.csv")


In [3]:
clean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["count_amenities"][i]=len(df["amenities"][i].split())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["count_amenities"][i]=0


## Data split to Train, Test

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
import pickle
df=pd.read_csv("data.csv")
cols=['accommodates', 'bathrooms','bedrooms', 'beds','total_review','count_amenities','total_review']
X=df[cols]
y=df["total_price"]

def train_data(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = train_data(X,y)

## Model to predit price
1. Scale the data
2. Create the model 

In [6]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [7]:
def scale_data(X_train, X_test): 
    standard_scaler = StandardScaler()
    x_train_scaled = pd.DataFrame(
          standard_scaler.fit_transform(X_train),
          columns=X_train.columns
      )
    x_test_scaled = pd.DataFrame(
          standard_scaler.transform(X_test),
          columns = X_test.columns
      )
    return x_train_scaled, x_test_scaled
X_train, X_test= scale_data(X_train, X_test)

In [8]:
def model_create():
    model = Sequential([
        Dense(160, kernel_initializer='normal', activation='relu'),
        Dropout(0.2),
        Dense(480, kernel_initializer='normal', activation='relu'),
        Dropout(0.2),
        Dense(256, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='linear')
      ])
    model.compile(
        loss=MeanSquaredLogarithmicError(), 
        optimizer=Adam(learning_rate=0.01), 
        metrics=[MeanSquaredLogarithmicError()]
    )
    return model

In [9]:
model=model_create()
model.fit(X_train,y_train)



<keras.callbacks.History at 0x2520b099ac0>

## Predict the price for all the properties

In [10]:
df=pd.read_csv("data.csv")
predict_df=df[cols]
predict_df

standard_scaler = StandardScaler()
predict_df = pd.DataFrame(
          standard_scaler.fit_transform(predict_df),
          columns=cols
      )

In [11]:
df["predicted_price"]=model.predict(predict_df)

In [12]:
try:
    df.drop("Unnamed: 0", axis=1, inplace=True)
    df
except:
    df

## Create new Columns for implementing conditions

In [13]:
df["difference_price"]=df["total_price"]-df["predicted_price"]
cols=['accommodates', 'bathrooms','bedrooms', 'beds','total_review','count_amenities',"total_price","predicted_price","difference_price"]
df[cols]

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,total_review,count_amenities,total_price,predicted_price,difference_price
0,2,1.0,1,1.0,930,40,1130,334.004211,795.995789
1,4,1.5,1,2.0,990,20,7193,308.442596,6884.557404
2,1,2.0,1,1.0,970,13,64,260.856354,-196.856354
3,2,1.0,1,1.0,980,18,500,286.652588,213.347412
4,2,2.0,1,1.0,810,43,1244,254.842117,989.157883
...,...,...,...,...,...,...,...,...,...
7353,3,1.0,1,1.0,0,0,482,152.147293,329.852707
7354,1,1.0,1,1.0,0,0,30,79.851753,-49.851753
7355,1,1.0,1,1.0,0,0,55,79.851753,-24.851753
7356,2,1.0,1,1.0,0,0,510,107.951714,402.048286


In [14]:
df["outcome"]=""
for i in range(len(df)):
    if(df["difference_price"][i]>100):
        df["outcome"][i]="Overpriced"
    elif(df["difference_price"][i]<-100):
        df["outcome"][i]="UnderPriced"
    else:
        df["outcome"][i]="Negotiable"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["outcome"][i]="Overpriced"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["outcome"][i]="UnderPriced"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["outcome"][i]="Negotiable"


In [15]:
df.to_csv("data_predicted.csv")

In [16]:
accepted_df=pd.read_csv("data_predicted.csv")
cols

['accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'total_review',
 'count_amenities',
 'total_price',
 'predicted_price',
 'difference_price']

## Filter the data

In [17]:
temp_df = accepted_df.loc[(accepted_df['is_location_exact'] == 't') & 
                         (accepted_df['total_review'] >= 750) &
                         (accepted_df['outcome'] !="Overpriced" )&
                         (accepted_df['instant_bookable'] =="t" )]

In [18]:
try:
    temp_df.drop(["Unnamed: 0","summary","index"],inplace=True,axis=1)
except:
    print()
#temp_df.drop("summary",inplace=True,axis=1)
accepted_df=temp_df.reset_index()
try:
    temp_df.drop("index",inplace=True,axis=1)
except:
    print()
accepted_df.to_csv("accepted_df.csv")





## Use NLTK to find polarity of description and house

In [20]:
import nltk
import pandas as pd
df=pd.read_csv("accepted_df.csv")
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
 # Download if not already downloaded- nltk.download()
x=['neg','neu','pos','compound']

sent_desc=pd.DataFrame(columns = x)
for i in range(len(df)):
    x=sia.polarity_scores(df["description"][i])
    sent_desc=sent_desc.append(x , ignore_index=True)

sent_house=pd.DataFrame(columns = x)
for i in range(len(df)):
    x=sia.polarity_scores(df["house_rules"][i])
    sent_house=sent_house.append(x , ignore_index=True)
    
    
sent_desc.rename(columns = {'neu':'desc_neu','pos':'desc_pos','neg':'desc_neg','compound':'desc_aggr'}, inplace = True)
sent_house.rename(columns = {'neu':'house_neu','pos':'house_pos','neg':'house_neg','compound':'house_aggr'}, inplace = True)

result = pd.concat([sent_desc, sent_house], axis=1)

data=pd.concat([df, result], axis=1)
try:
    data.drop(["Unnamed: 0","index","Unnamed: 0.1"], inplace=True, axis=1)
    data.to_csv("output.csv")
except:
    data.to_csv("output.csv")

## Inference

1. The cleaned file contains properties with best chance to be chosen.
2. When sorted the file with most positice description, 0 negative house rules. We get id =30019162 as the best house.
3. From the file we can choose the best property according to the customer conditions.