# Data setup and cleaning

In [21]:
import pandas as pd
import numpy as np

In [4]:
# HELPER FUNCTIONS - DATA CLEANING

import re
# Matches an HTML comment, or an opening / closing / self-closing element tag.
# A tag must start with a letter directly after '<' (or '</'), so free-standing
# angle brackets in ordinary text ("3 < 5", "<3") are left alone. `[^>]*` and
# re.DOTALL let tags and comments that span several lines match as well.
HTML_TAG = re.compile(r'<!--.*?-->|</?[A-Za-z][^>]*>', re.DOTALL)
# Backwards-compatible alias for the original, misspelt constant name.
HMTL_TAG = HTML_TAG
# Matches any run of one or more whitespace characters (spaces, tabs, newlines).
WHITESPACE = re.compile(r'\s+')

def replace_html_tags(raw_html, replace_with = ''):
  """Return `raw_html` with every HTML tag/comment replaced by `replace_with`.

  The input is converted with str() first, so non-string values are accepted
  (note that a float NaN therefore becomes the text 'nan').
  """
  return HTML_TAG.sub(replace_with, str(raw_html))

def replace_whitespace(raw_text, replace_with = ' '):
  """Return `raw_text` with each run of whitespace replaced by `replace_with`.

  The input is converted with str() first, so non-string values are accepted.
  Leading and trailing whitespace is replaced too, not stripped.
  """
  return WHITESPACE.sub(replace_with, str(raw_text))

def clean_text(raw_text):
  """Strip HTML tags from `raw_text` and normalise its whitespace.

  Tags are replaced by a space rather than removed outright, so words that
  were separated only by markup (e.g. 'Clean<br/>quiet') stay separate words
  instead of fusing into one token. Whitespace runs are then collapsed to a
  single space and the ends are stripped, so the extra spaces introduced for
  tags never show up at the start or end of the result.

  Non-string input is handled by the helpers, which convert it with str().
  """
  without_tags = replace_html_tags(raw_text, ' ')
  return replace_whitespace(without_tags).strip()

# Clean text for every row in a pandas dataframe column. Returns new dataframe with cleaned text.
def clean_text_batch(df: pd.DataFrame, col: int) -> pd.DataFrame:
  """Return a copy of `df` whose column at position `col` has been cleaned.

  Args:
    df: source frame; it is left untouched.
    col: zero-based positional index of the column to clean.

  Returns:
    A new DataFrame in which every value of that column has been passed
    through clean_text(). Because clean_text() stringifies its input, missing
    values in that column come back as text rather than NaN.
  """
  # Work on a copy so the caller's frame is not modified behind their back.
  cleaned = df.copy()
  cleaned.iloc[:, col] = cleaned.iloc[:, col].apply(clean_text)
  return cleaned

In [54]:
# HELPER FUNCTIONS - DATA ANALYSIS / INTERPRETATION

# Column names of the listings data, in the order this notebook assumes they
# appear in listings.csv. index_in_listings() turns a name into its position in
# this list so columns can be selected positionally with DataFrame.iloc.
# NOTE(review): hard-coded — presumably copied from the CSV header; verify it
# still matches the file whenever the data is refreshed.
LISTING_FIELDS = ['id','listing_url','scrape_id','last_scraped','source','name','description','neighborhood_overview','picture_url','host_id','host_url','host_name','host_since','host_location','host_about','host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost','host_thumbnail_url','host_picture_url','host_neighbourhood','host_listings_count','host_total_listings_count','host_verifications','host_has_profile_pic','host_identity_verified','neighbourhood','neighbourhood_cleansed','neighbourhood_group_cleansed','latitude','longitude','property_type','room_type','accommodates','bathrooms','bathrooms_text','bedrooms','beds','amenities','price','minimum_nights','maximum_nights','minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights','minimum_nights_avg_ntm','maximum_nights_avg_ntm','calendar_updated','has_availability','availability_30','availability_60','availability_90','availability_365','calendar_last_scraped','number_of_reviews','number_of_reviews_ltm','number_of_reviews_l30d','first_review','last_review','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','license','instant_bookable','calculated_host_listings_count','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms','reviews_per_month']
# Column names of the reviews data, in the order assumed for reviews.csv;
# index_in_reviews() looks positions up in this list.
# NOTE(review): hard-coded as well — verify against the file header.
REVIEW_FIELDS = ['listing_id','id','date','reviewer_id','reviewer_name','comments']

# Look up where a listings column sits in LISTING_FIELDS.
def index_in_listings(field: str) -> int:
    """Return the zero-based position of `field` within LISTING_FIELDS.

    Raises:
        ValueError: if `field` is not one of the names in LISTING_FIELDS
            (raised by list.index).
    """
    position = LISTING_FIELDS.index(field)
    return position

# Look up where a reviews column sits in REVIEW_FIELDS.
def index_in_reviews(field: str) -> int:
    """Return the zero-based position of `field` within REVIEW_FIELDS.

    Raises:
        ValueError: if `field` is not one of the names in REVIEW_FIELDS
            (raised by list.index).
    """
    position = REVIEW_FIELDS.index(field)
    return position

# function to map a string reading 't' or 'f' to a boolean
def as_bool(t_f: str) -> bool:
    """Convert the flag 't' / 'f' to True / False.

    Args:
        t_f: the flag value; only the exact strings 't' and 'f' are accepted.

    Returns:
        True for 't', False for 'f'.

    Raises:
        ValueError: for any other value, including non-strings such as None
            or a float NaN.
    """
    if t_f == 't':
        return True
    if t_f == 'f':
        return False
    # str() keeps the message unchanged for string input while making sure a
    # non-string value produces this ValueError rather than a TypeError from
    # concatenating it to a str.
    raise ValueError('Invalid value for boolean: ' + str(t_f))

# function to normalise a numpy array so that its entries sum to 1
def normalise(arr: np.ndarray) -> np.ndarray:
    """Scale `arr` by the sum of all its entries.

    For non-negative input the result lies between 0 and 1 and sums to 1.

    Args:
        arr: numeric array of any shape.

    Returns:
        `arr / arr.sum()`. If the sum is exactly zero (for example an all-zero
        vector) a float array of zeros with the same shape is returned instead
        of the NaN/inf values a division by zero would produce.
    """
    total = arr.sum()
    if total == 0:
        # Nothing to scale by; avoid 0/0 -> NaN.
        return np.zeros_like(arr, dtype=float)
    return arr / total



In [6]:
# CLEAN LISTINGS TEXT AND SAVE TO NEW CSV
df_listings = pd.read_csv("../data/listings.csv")

# Diagnostic: shows, per column, whether every value is a string.
# print(df_listings.applymap(lambda x: isinstance(x, str)).all(0))

# Names and indices of columns to clean.
# Positions are derived from LISTING_FIELDS via index_in_listings() instead of
# being typed in by hand, so this mapping cannot drift from the field list.
COLS_TO_CLEAN = {
    name: index_in_listings(name)
    for name in ('description', 'neighborhood_overview', 'host_about')
}

for col_position in COLS_TO_CLEAN.values():
    df_listings = clean_text_batch(df_listings, col_position)

# NOTE: the save step is currently disabled, so the cleaned frame only lives in
# memory until df_listings is reassigned.
# df_listings.to_csv('../data/listings_clean.csv', index=False)

In [52]:
# SELECT ONLY TARGET COLUMNS FROM LISTINGS -> SAVE TO NEW CSV

# The listing id plus the seven review-score columns, each mapped to its
# position in LISTING_FIELDS (insertion order is the output column order).
LISTING_COLS_TO_RETAIN = {
    field: index_in_listings(field)
    for field in (
        'id',
        'review_scores_rating',
        'review_scores_accuracy',
        'review_scores_cleanliness',
        'review_scores_checkin',
        'review_scores_communication',
        'review_scores_location',
        'review_scores_value',
    )
}

# Re-read the raw listings file and keep only the columns above, by position.
df_listings = pd.read_csv("../data/listings.csv").iloc[
    :, list(LISTING_COLS_TO_RETAIN.values())
]
df_listings.to_csv('../data/listings_targets.csv', index=False)

# NOTE: Some listings do not have some or all of the review scores. We will need to account for this when we train our model.

# Diagnostic: shows, per column, whether every value is a string.
# print(df_listings.applymap(lambda x: isinstance(x, str)).all(0))


In [56]:
# CLEAN REVIEWS TEXT AND SELECT ONLY KEY COLUMNS -> SAVE TO NEW CSV

df_reviews = pd.read_csv("../data/reviews.csv")

# Columns kept in the output, mapped to their position in REVIEW_FIELDS.
REVIEW_COLS_TO_RETAIN = {
    'listing_id': index_in_reviews('listing_id'),
    'comments': index_in_reviews('comments')
}

# Drop reviews that have no comment text before cleaning: clean_text()
# stringifies its input, so a missing comment would otherwise be turned into
# the literal text "nan" and treated as a real review downstream.
df_reviews = df_reviews.dropna(subset=['comments'])

# Clean the comments column (looked up by name rather than a hard-coded 5).
df_reviews = clean_text_batch(df_reviews, REVIEW_COLS_TO_RETAIN['comments'])

df_reviews = df_reviews.iloc[:, list(REVIEW_COLS_TO_RETAIN.values())]

# Sort by listing_id, so reviews for a given listing are grouped together
df_reviews = df_reviews.sort_values(by=['listing_id'])
df_reviews.to_csv('../data/reviews_skeleton.csv', index=False)


In [24]:
# Extract the review text as a NumPy array of documents for vectorisation.
# Select by column name: by this point df_reviews has been reduced to the
# listing_id and comments columns, so the old positional index 5 is out of
# range when the notebook is run top to bottom.
comments = df_reviews['comments'].values
print(comments.shape)
print(comments[0])

(243183,)
We enjoyed our stay very much. The room was comfortable, neat and clean. There were no problems at all and the host family was very helpful and caring. They helped us planning trips or recommended sights. The house is situated in a calm neighbourhood close the the Luas and different bus lines. There are no negative aspects to mention, it was a very satisfying stay. I would recommend it and stay there again whenever I am in Dublin. 


# Basic Text Pre-Processing


In [47]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK resources; only the commented-out tokenising/stemming experiments at the
# bottom of this cell refer to NLTK.
nltk.download('punkt')
nltk.download('stopwords')

# max_df - exclude words which appear in too many documents
# min_df - exclude words which appear in too few documents
# TODO: use cross validation to determine the best values for these parameters.
# min_df HAS to be set, otherwise the feature vectors are:
# a) too large for my computer to handle.
# b) too full of zeros to be useful.
# NOTE(review): an earlier comment here said "93,193 features", but the
# recorded output of this cell reports 2214 — presumably the larger figure was
# measured without min_df; confirm.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.2,
    min_df=0.001,
)

# Each document passed to the vectorizer is one review string from `comments`.
# NOTE(review): an earlier comment described documents as lists of stemmed
# tokens; no stemming is applied in the active code — the stemming experiments
# below are commented out.
vectors = vectorizer.fit_transform(comments)

# Vocabulary size, matrix shape, then the types of the matrix, of one row and
# of a row indexed again — one value per output line.
print(
    len(vectorizer.get_feature_names_out()),
    vectors.shape,
    type(vectors),
    type(vectors[0]),
    type(vectors[0][0]),
    sep='\n',
)
# print(len(vectors.toarray()))

# Disabled experiment: normalise every row vector with normalise().
# norm_vectors = []
# for vector in vectors.toarray():
#     norm_vectors.append(normalise(vector))
# norm_vectors = np.array(norm_vectors)

# Disabled experiment: NLTK tokenising + Porter stemming.
# from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer
# stemmer = PorterStemmer()
# tokens = word_tokenize("Here's example text, isn't it?")
# stems = [stemmer.stem(token) for token in tokens]
# print(stems)

# tokens = word_tokenize("likes liking liked")
# stems = [stemmer.stem(token) for token in tokens]
# print(stems)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\james\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\james\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2214
(243183, 2214)
<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


In [48]:
# Show the first row of the TF-IDF matrix, i.e. the vector built from
# comments[0]; the printed form lists its entries as "(row, column)  weight".
print(vectors[0])

  (0, 1611)	0.10034119226853999
  (0, 1252)	0.2606826520004324
  (0, 1326)	0.2716879035414134
  (0, 1148)	0.2735265929660197
  (0, 292)	0.12506550921867488
  (0, 569)	0.2325298183156135
  (0, 1197)	0.19759874300173133
  (0, 399)	0.11856310154980176
  (0, 1329)	0.2040305194885017
  (0, 310)	0.2364094149157616
  (0, 1779)	0.2093844068652601
  (0, 986)	0.11722260700044745
  (0, 1762)	0.21706501481789367
  (0, 1615)	0.16504067658028237
  (0, 2031)	0.2510637012729492
  (0, 1490)	0.2518556702440787
  (0, 946)	0.1919402492295867
  (0, 322)	0.24929458730293394
  (0, 947)	0.11926036963913877
  (0, 757)	0.1610923727814727
  (0, 1539)	0.23053478573094208
  (0, 1318)	0.23720957484615215
  (0, 416)	0.10996789961582039
  (0, 1675)	0.10958327466086534
  (0, 681)	0.14809332862871688


We are trying to predict the ratings of a listing based on reviews for that listing.

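So, in order to train the model, we need to group all the reviews for each listing under its listing id: the vectors above have one row per review, whereas each target score belongs to a listing, so every listing needs a single combined representation of its reviews to pair with its scores.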

In [None]:
# REFERENCE WORKFLOW ONLY
#
# Sketch of the intended modelling loop, wrapped in a function so that running
# the notebook top to bottom does not fail on names that do not exist yet.
#
# TODO: split the data into training and test sets first
#       (maybe use k-fold to do all of this in a loop?).
# TODO: the evaluation step is a placeholder — the author was unsure how to
#       evaluate (the original lines were Copilot-suggested); this might be
#       where k-fold cross validation belongs.

# One model is trained per review-score target.
REVIEW_SCORE_TARGETS = (
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
)


def run_reference_workflow(make_model, vectors, listings_targets, train, test,
                           target_columns=REVIEW_SCORE_TARGETS):
    """Fit, predict with and score one model per target column.

    Args:
        make_model: zero-argument callable returning a fresh, unfitted model
            exposing fit(X, y), predict(X) and score(X, y).
        vectors: feature matrix, indexed as vectors[train] / vectors[test].
        listings_targets: target container, indexed as
            listings_targets[rows][column].
        train: row selector for the training portion.
        test: row selector for the held-out portion.
        target_columns: keys passed to listings_targets[rows][key]; defaults to
            the seven review-score column names.

    Returns:
        dict mapping each target column to a (model, predictions, score) tuple,
        in the order of `target_columns`.

    NOTE(review): the TF-IDF matrix built earlier in this notebook has one row
    per review, while the targets are per listing; the two must be aligned
    (reviews grouped per listing) before this can be used as-is.
    """
    # The feature slices are the same for every target, so take them once.
    train_features = vectors[train]
    test_features = vectors[test]

    results = {}
    for target in target_columns:
        model = make_model()
        # 1) Train the model for this target column
        model.fit(train_features, listings_targets[train][target])
        # 2) Predict for this target column
        predictions = model.predict(test_features)
        # 3) Evaluate the model
        score = model.score(test_features, listings_targets[test][target])
        results[target] = (model, predictions, score)
    return results
