In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from IPython.display import clear_output
# Import necessary library
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

path = "/content/drive/My Drive/CA683 Data Mining/Continuous Assignment/InsideAirbnb/data/"
calendar = pd.read_csv(path+"calendar.csv")
listings = pd.read_csv(path+"listings.csv")
reviews = pd.read_csv(path+"reviews.csv")
clear_output()

# Column lists

In [0]:
col_list = list(listings.columns)
# basicly the text information about the host
text_columns=['summary','space','description',"neighborhood_overview","notes","transit","access","interaction","house_rules","host_about"]
# 
host_string_columns = ["host_name","host_since","amenities"]

host_location_columns = ["host_neighbourhood","street","neighbourhood","neighbourhood_cleansed","city","market","host_location",
                         "latitude","longitude"]

host_numeric_columns = ["host_id","host_acceptance_rate","host_response_time","host_response_rate","host_listings_count",
                        "host_total_listings_count","square_feet","price","weekly_price","monthly_price",
                        "security_deposit","cleaning_fee","guests_included","extra_people","number_of_reviews",
                        "number_of_reviews_ltm","calculated_host_listings_count","calculated_host_listings_count_entire_homes",
                        "calculated_host_listings_count_private_rooms","calculated_host_listings_count_shared_rooms",
                        "reviews_per_month"]

host_date_columns = ["first_review"]

host_nights_columns = col_list[67:75]

host_category_columns = ["host_is_superhost","host_has_profile_pic","host_identity_verified","property_type","room_type",
                         "instant_bookable","is_business_travel_ready","require_guest_profile_picture","require_guest_phone_verification"]

host_score_columns = ["review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin",
                      "review_scores_communication","review_scores_location","review_scores_value"
                      ]
host_other_columns = ["host_verifications",]

# Basic Features
+ Number of words
+ Number of characters
+ Average word length
+ Number of stopwords
+ Number of special characters
+ Number of numerics
+ Number of uppercase words

In [0]:
target = text_columns[0]
df_text = listings[['id']+text_columns]
df = df_text[['id',target]]

In [0]:
df = df.fillna('')

In [0]:
df[target]

0       Our house was built in 1937 when there was ple...
1                                                        
2       Our Cottage is a charming light filled cottage...
3       Welcoming victorian house. Breakfast included....
4       washing can be done at a cost of five euro per...
                              ...                        
9138    Tastefully decorated 3 bedroom apartment in th...
9139    Fully equipped 1 bedroom apartment in O'Connel...
9140           Spacious 3 Bedroom Apartment in Temple Bar
9141                 Fully equipped apartment in Dublin 2
9142                                                     
Name: summary, Length: 9143, dtype: object

# Number of Word

In [0]:
len(df[target].iloc[0].split(' '))

100

In [0]:
# df[target+'word_count'] = df[t]
df[target+'_wordCount'] = df[target].apply(lambda x: len(str(x).split(' ')))
df.head(3)

Unnamed: 0,id,summary,summary_wordCount
0,44077,Our house was built in 1937 when there was ple...,100
1,85148,,1
2,85156,Our Cottage is a charming light filled cottage...,99


# Number of Character

In [0]:
import re

In [0]:
def num_of_char(x):
  if(type(x)==np.float):
    return 0
  return len(re.sub(r"\s+", "",str(x), flags=re.UNICODE))

In [0]:
df[target+'_charNum'] = df[target].apply(num_of_char)
clear_output()
df.head()

Unnamed: 0,id,summary,summary_wordCount,summary_charNum
0,44077,Our house was built in 1937 when there was ple...,100,400
1,85148,,1,0
2,85156,Our Cottage is a charming light filled cottage...,99,402
3,121030,Welcoming victorian house. Breakfast included....,49,201
4,159889,washing can be done at a cost of five euro per...,32,135


# Average Word length

In [0]:
def avg_word(x):
  words = str(x).split()
  try:
    return (sum(len(word) for word in words)/len(words))
  except:
    return 0

In [0]:
avg_word("hello world tom")

4.333333333333333

In [0]:
df[target+'_avg_word_len'] = df[target].apply(avg_word)
clear_output()
df.head()

Unnamed: 0,id,summary,summary_wordCount,summary_charNum,summary_avg_word_len
0,44077,Our house was built in 1937 when there was ple...,100,400,4.081633
1,85148,,1,0,0.0
2,85156,Our Cottage is a charming light filled cottage...,99,402,4.1875
3,121030,Welcoming victorian house. Breakfast included....,49,201,4.568182
4,159889,washing can be done at a cost of five euro per...,32,135,4.21875


# Num of Stop Word

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
clear_output()

In [0]:
df[target+'_stopwords'] = df[target].apply(lambda x: len([x for x in x.split() if x in stop]))
df.head(3)

Unnamed: 0,id,summary,summary_wordCount,summary_charNum,summary_avg_word_len,summary_stopwords
0,44077,Our house was built in 1937 when there was ple...,100,400,4.081633,46
1,85148,,1,0,0.0,0
2,85156,Our Cottage is a charming light filled cottage...,99,402,4.1875,35


# Num of Numerics


In [0]:
df[target+'_num']=df[target].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df.head()

Unnamed: 0,id,summary,summary_wordCount,summary_charNum,summary_avg_word_len,summary_stopwords,summary_num
0,44077,Our house was built in 1937 when there was ple...,100,400,4.081633,46,3
1,85148,,1,0,0.0,0,0
2,85156,Our Cottage is a charming light filled cottage...,99,402,4.1875,35,5
3,121030,Welcoming victorian house. Breakfast included....,49,201,4.568182,3,4
4,159889,washing can be done at a cost of five euro per...,32,135,4.21875,13,0


# punctuation_count

In [0]:
import string
df['punctuation_count'] = df[target].apply(lambda x: len("".join(_ for _ in str(x) if _ in string.punctuation))) 
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,summary,punctuation_count
0,44077,Our house was built in 1937 when there was ple...,8
1,85148,,0


# Upper Case Word Count

In [0]:
df['upper_case_word_count'] = df[target].apply(lambda x: len([wrd for wrd in str(x).split() if wrd.isupper()]))
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,summary,punctuation_count,upper_case_word_count
0,44077,Our house was built in 1937 when there was ple...,8,1
1,85148,,0,0


#Speech Tags
+ Noun Count
+ Verb Count
+ Adjective Count
+ Adverb Count
+ Pronoun Count

In [0]:
import textblob
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}
def check_pos_tag(x, flag):
    cnt = 0
    x = str(x)
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

In [0]:
df_combined['noun_count'] = df_combined['text'].apply(lambda x: check_pos_tag(x, 'noun'))
df.head(2)

Unnamed: 0,id,summary,punctuation_count,upper_case_word_count,noun_count
0,44077,Our house was built in 1937 when there was ple...,8,1,20
1,85148,,0,0,1


# Feature Extraction Total

In [0]:
# basic_feature_columns = ['_wordCount','_charNum','_avgWordLen','_stopWords','_num']
def Basic_Feature_extraction(df,target):
  df['wordCount'] = df[target].apply(lambda x: len(str(x).split(' ')))
  df['charNum'] = df[target].apply(num_of_char)
  df['avgWordLen'] = df[target].apply(avg_word)
  df['stopWords'] = df[target].apply(lambda x: len([x for x in str(x).split() if x in stop]))
  df['numeric_num']=df[target].apply(lambda x: len([x for x in str(x).split() if x.isdigit()]))
  # df['punctuation_count'] = df[target].apply(lambda x: len("".join(_ for _ in str(x) if _ in string.punctuation))) 
  # df['upper_case_word_count'] = df[target].apply(lambda x: len([wrd for wrd in str(x).split() if wrd.isupper()]))
  # speech tags
  df['noun_count'] = df[target].apply(lambda x: check_pos_tag(x, 'noun'))
  df['verb_count'] = df[target].apply(lambda x: check_pos_tag(x, 'verb'))
  df['adj_count'] = df[target].apply(lambda x: check_pos_tag(x, 'adj'))
  df['adv_count'] = df[target].apply(lambda x: check_pos_tag(x, 'adv'))
  df['pron_count'] = df[target].apply(lambda x: check_pos_tag(x, 'pron'))
  # df.drop(columns=['id'])
  return df

In [0]:
target = text_columns[1]
df = df_text[['id',target]]
df = df.fillna('')
df_A = Basic_Feature_extraction(df,target)

In [0]:
# df_Basic_Features = pd.DataFrame()
# df_Basic_Features['id'] = df_text['id']
# for target in text_columns:
#   df = df_text[[target]]
#   df = df.fillna('')
#   df_res = Basic_Feature_extraction(df,target)
#   df_Basic_Features = pd.concat([df_Basic_Features,df_res],axis = 1)
# df_Basic_Features.head()

## Apply on df_combined.csv

In [0]:
df_combined = pd.read_csv('/content/drive/My Drive/CA683 Data Mining/Continuous Assignment/DataMining_Assignment/Data/text_combined_lemm_rcommon.csv',index_col=0)
print(df_combined.shape)
df_combined.head()

(9145, 5)


Unnamed: 0,id,text,rstop,lemmatization,rcommon
0.0,44077,our house was built in 1937 when there was ple...,house built 1937 plenty land originally dundru...,house built 1937 plenty land originally dundru...,built 1937 plenty land originally dundrum cons...
1.0,85148,im renting a double room in my house in the ...,im renting double room house south dublin city...,im renting double room house south dublin city...,renting kingsize wooden showerbath approx 150 ...
2.0,85156,our cottage is a charming light filled cottage...,cottage charming light filled cottage vibrant ...,cottage charming light filled cottage vibrant ...,cottage charming light filled cottage vibrant ...
3.0,121030,welcoming victorian house breakfast included d...,welcoming victorian house breakfast included d...,welcoming victorian house breakfast included d...,welcoming victorian included desk wooden beach...
4.0,159889,washing can be done at a cost of five euro per...,washing done cost five euro per load light bre...,washing done cost five euro per load light bre...,washing done cost five euro per load light ser...


### Features for raw text

In [0]:
df_raw_text = df_combined[['id','text']]
df_raw_text.head()

Unnamed: 0,id,text
0.0,44077,our house was built in 1937 when there was ple...
1.0,85148,im renting a double room in my house in the ...
2.0,85156,our cottage is a charming light filled cottage...
3.0,121030,welcoming victorian house breakfast included d...
4.0,159889,washing can be done at a cost of five euro per...


In [0]:
target='text'
df_raw_text_feature = Basic_Feature_extraction(df_raw_text,target)
df_raw_text_feature.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,text,wordCount,charNum,avgWordLen,stopWords,numeric_num,punctuation_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0.0,44077,our house was built in 1937 when there was ple...,863,3460,4.282178,395,20,0,0,202,143,72,47,82
1.0,85148,im renting a double room in my house in the ...,364,1459,4.144886,125,6,0,0,136,51,40,18,14
2.0,85156,our cottage is a charming light filled cottage...,1097,4405,4.268411,487,34,0,0,280,178,84,47,78
3.0,121030,welcoming victorian house breakfast included d...,708,2735,4.181957,260,23,0,0,203,102,61,37,36
4.0,159889,washing can be done at a cost of five euro per...,408,1811,4.471605,179,6,0,0,127,70,29,20,15


In [0]:
df_raw_text.drop(columns=['punctuation_count','upper_case_word_count']).to_csv(save_path+'raw_text_features.csv')

In [0]:
save_path = '/content/drive/My Drive/CA683 Data Mining/Continuous Assignment/DataMining_Assignment/Data/'
df_raw_text_feature.to_csv(save_path+'raw_text_features.csv')

### apply on cleaned text

In [0]:
target = 'rcommon'
df_clean_text = df_combined[['id',target]]
df_clean_text.head(2)

Unnamed: 0,id,rcommon
0.0,44077,built 1937 plenty land originally dundrum cons...
1.0,85148,renting kingsize wooden showerbath approx 150 ...


In [0]:
df_clean_text_feature = Basic_Feature_extraction(df_clean_text,target)
df_clean_text_feature.head()

Unnamed: 0,id,rcommon,wordCount,charNum,avgWordLen,stopWords,numeric_num,noun_count,verb_count,adj_count,adv_count,pron_count
0.0,44077,built 1937 plenty land originally dundrum cons...,220,1358,6.172727,0,8,96,37,44,19,0
1.0,85148,renting kingsize wooden showerbath approx 150 ...,136,628,4.617647,1,3,74,15,26,5,0
2.0,85156,cottage charming light filled cottage vibrant ...,268,1585,5.914179,0,17,129,42,43,21,0
3.0,121030,welcoming victorian included desk wooden beach...,203,1109,5.463054,0,8,92,36,42,8,0
4.0,159889,washing done cost five euro per load light ser...,120,697,5.808333,0,0,67,23,14,4,0


In [0]:
save_path = '/content/drive/My Drive/CA683 Data Mining/Continuous Assignment/DataMining_Assignment/Data/'
df_clean_text_feature.to_csv(save_path+'raw_clean_features.csv')

# Apply on Reviews

## language_detection

In [0]:
!pip install langdetect
from langdetect import detect
clear_output()

In [0]:
def language_detection(text):
  try:
    return detect(text)
  except:
    print(text)
    return None

In [0]:
df_sample = reviews.sample(5000)
df_sample['language'] = df_sample['comments'].apply(language_detection)
df_sample.head(3)

.
10/10
.
nan
:)
.
.
.
.
5/5


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language
200191,18515537,165614689,2017-07-01,37904804,Mark John,"The apartment is near to everything you need, ...",en
3988,386860,285324101,2018-07-03,12428223,Caroline,My sister & I stayed at Sinead’s place for a s...,en
207971,18958782,207963661,2017-10-30,127004744,Lisa,"This apartment is in a great location, close ...",en


In [0]:
df_en = df_sample[df_sample['language']=='en']
df_en.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language
200191,18515537,165614689,2017-07-01,37904804,Mark John,"The apartment is near to everything you need, ...",en
3988,386860,285324101,2018-07-03,12428223,Caroline,My sister & I stayed at Sinead’s place for a s...,en
207971,18958782,207963661,2017-10-30,127004744,Lisa,"This apartment is in a great location, close ...",en
79728,7759045,69552506,2016-04-11,18906859,Johannes,"Berry, Rachel and Jerry are fantastic people! ...",en
167240,15888317,151324467,2017-05-12,2997003,Sasha,Gréât little place in a busy street but facing...,en


In [0]:
target = 'comments'
df = df_en[[target]]
df = df.fillna('')
df_Basic_Features_comments = Basic_Feature_extraction(df,target)

In [0]:
df_Basic_Features_comments['id']=df_en['id']
df_Basic_Features_comments['listing_id'] = df_en['listing_id']
df_Basic_Features_comments.head()

Unnamed: 0,comments,comments_wordCount,comments_charNum,comments_avgWordLen,comments_stopWords,comments_num,id,listing_id
200191,"The apartment is near to everything you need, ...",32,141,4.7,11,0,165614689,18515537
3988,My sister & I stayed at Sinead’s place for a s...,71,342,4.816901,28,0,285324101,386860
207971,"This apartment is in a great location, close ...",96,401,4.406593,46,1,207963661,18958782
79728,"Berry, Rachel and Jerry are fantastic people! ...",103,499,4.84466,34,0,69552506,7759045
167240,Gréât little place in a busy street but facing...,27,132,4.888889,9,0,151324467,15888317


In [0]:
df_Basic_Features_comments['comments'].iloc[0]

'The apartment is near to everything you need, pubs, places of interest, shopping malls and restaurants.  Brian is very helpful and always a message away to lend us a hand. '