In [213]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

#downloads nltk packages - needed to run nltk
import nltk
#nltk.download()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [214]:
data = pd.read_csv('cleaned_data.csv', index_col=0)

In [215]:
data

Unnamed: 0,price,living_rooms,bedrooms,bathrooms,town_city,outward_code,property_type,desc
394,25567.0,2.0,3.0,3.0,London,W1K,flat,three bedroom apartment in an exceptional high...
395,25567.0,1.0,3.0,3.0,London,W1K,flat,this newly furnished three bedroom three bathr...
396,25567.0,1.0,3.0,3.0,London,W1K,flat,this unique apartment comprises double bedroo...
397,25567.0,1.0,3.0,3.0,London,W1K,flat,this modern three bedroom lateral apartment is...
398,25567.0,1.0,3.0,3.0,London,W1K,flat,a modern newly refurbished bedroom bathroom ...
...,...,...,...,...,...,...,...,...
8163,19500.0,4.0,7.0,6.0,London,W8,town house,family house in the heart of kensington equidi...
8164,19500.0,1.0,4.0,4.0,London,SW3,maisonette,a truly stunning and unique four bedroom apart...
8165,19500.0,4.0,8.0,6.0,London,W10,detached house,immaculately designed with fastidious attentio...
8166,19500.0,1.0,3.0,3.0,London,W11,property,stylishly designed new build house in a very q...


In [216]:
#creating a bag of words model for the 'desc' column to hopefully make a meaningful new feature. Have words like:
#furnished
#refurbished
#interior designed
#amenities
#short let
#ideally/perfectly/etc located
#in the heart of
#garden (included)
#wifi (included)
#etc

In [217]:
vect = CountVectorizer()

In [218]:
#vect.fit()

In [219]:
bag_of_words = vect.fit_transform(data['desc'])

In [220]:
print(bag_of_words)

  (0, 4016)	1
  (0, 377)	1
  (0, 159)	1
  (0, 2010)	2
  (0, 137)	1
  (0, 1374)	1
  (0, 1870)	1
  (0, 3704)	1
  (0, 2210)	1
  (0, 1096)	1
  (0, 3985)	2
  (0, 1845)	1
  (0, 2776)	2
  (0, 2741)	1
  (0, 2515)	1
  (0, 2369)	1
  (0, 1193)	1
  (0, 3817)	1
  (0, 2093)	1
  (0, 907)	1
  (0, 2368)	1
  (0, 1566)	1
  (0, 1759)	1
  (0, 3689)	1
  (0, 1771)	1
  :	:
  (7773, 2093)	1
  (7773, 2368)	1
  (7773, 4007)	1
  (7773, 138)	1
  (7773, 4466)	1
  (7773, 2798)	1
  (7773, 2383)	1
  (7773, 2640)	1
  (7773, 2273)	1
  (7773, 360)	1
  (7773, 3690)	1
  (7773, 226)	1
  (7773, 4275)	1
  (7773, 708)	1
  (7773, 1080)	1
  (7773, 1534)	1
  (7773, 2810)	1
  (7773, 3381)	1
  (7773, 2196)	1
  (7773, 1099)	1
  (7773, 3975)	1
  (7773, 3616)	1
  (7773, 2806)	1
  (7773, 4345)	1
  (7773, 3552)	1


In [221]:
#instance and count of each word
vect.vocabulary_

{'three': 4016,
 'bedroom': 377,
 'apartment': 159,
 'in': 2010,
 'an': 137,
 'exceptional': 1374,
 'high': 1870,
 'specification': 3704,
 'landmark': 2210,
 'development': 1096,
 'the': 3985,
 'heart': 1845,
 'of': 2776,
 'north': 2741,
 'mayfair': 2515,
 'location': 2369,
 'duke': 1193,
 'street': 3817,
 'is': 2093,
 'conveniently': 907,
 'located': 2368,
 'for': 1566,
 'green': 1759,
 'spaces': 3689,
 'grosvenor': 1771,
 'this': 4007,
 'newly': 2717,
 'furnished': 1643,
 'bathroom': 333,
 'penthouse': 2977,
 'on': 2796,
 'third': 4000,
 'fourth': 1597,
 'and': 138,
 'fifth': 1490,
 'floors': 1549,
 'prominent': 3151,
 'building': 568,
 'corner': 924,
 'audley': 253,
 'unique': 4188,
 'comprises': 850,
 'double': 1159,
 'bedrooms': 380,
 'with': 4466,
 'fitted': 1517,
 'wardrobes': 4329,
 'en': 1277,
 'suites': 3871,
 'large': 2221,
 'reception': 3255,
 'room': 3407,
 'fully': 1635,
 'equipped': 1331,
 'kitchen': 2173,
 'dining': 1113,
 'area': 200,
 'seperate': 3522,
 'utility': 422

In [222]:
#Want to assign weights to certain words and get an overall score for each desc

In [223]:
#sentences = sent_tokenize(data['desc'][1])
sentences = data['desc'].apply(nltk.sent_tokenize)

In [224]:
sentences

394     [three bedroom apartment in an exceptional hig...
395     [this newly furnished three bedroom three bath...
396     [this unique apartment comprises  double bedro...
397     [this modern three bedroom lateral apartment i...
398     [a modern newly refurbished  bedroom  bathroom...
                              ...                        
8163    [family house in the heart of kensington equid...
8164    [a truly stunning and unique four bedroom apar...
8165    [immaculately designed with fastidious attenti...
8166    [stylishly designed new build house in a very ...
8167    [sgort let only this spacious bedroom flat is ...
Name: desc, Length: 7774, dtype: object

In [225]:
words = data['desc'].apply(nltk.word_tokenize)

In [226]:
words

394     [three, bedroom, apartment, in, an, exceptiona...
395     [this, newly, furnished, three, bedroom, three...
396     [this, unique, apartment, comprises, double, b...
397     [this, modern, three, bedroom, lateral, apartm...
398     [a, modern, newly, refurbished, bedroom, bathr...
                              ...                        
8163    [family, house, in, the, heart, of, kensington...
8164    [a, truly, stunning, and, unique, four, bedroo...
8165    [immaculately, designed, with, fastidious, att...
8166    [stylishly, designed, new, build, house, in, a...
8167    [sgort, let, only, this, spacious, bedroom, fl...
Name: desc, Length: 7774, dtype: object

In [227]:
#removing stop-words
stop_words = set(stopwords.words('english'))
print(stop_words)

{"you're", 'shouldn', "hasn't", "aren't", 'do', "she's", "you've", 'these', 'again', "you'll", "shan't", 'were', 'own', 'into', 'few', 'wasn', 's', 're', 'm', 'a', 'aren', 'both', 'down', 'didn', 'their', 'haven', 'are', 'there', 'or', "that'll", 'i', 'with', "don't", 'when', 'be', 'his', "doesn't", 'once', 'weren', 'off', 'any', 'has', 'will', 'whom', 'herself', 'don', 'all', 'its', 'my', 'some', 'should', 'yours', 'each', 'am', 'theirs', "you'd", 'myself', 'below', "it's", 'too', "weren't", 'what', 'from', 'very', 'if', 'further', 'most', 'ourselves', 'such', 'o', 'but', 'in', "won't", 'an', 've', 'which', 'him', 'before', 'those', 'having', 'doesn', 'why', 'nor', 'been', "needn't", 'only', 'we', 'being', 'needn', 'about', 'had', 'hadn', 'as', 'hasn', 'while', 'them', 'up', "isn't", 'our', 'just', 'during', 'against', 'not', 'over', 'and', 'mightn', 'ma', 'until', 't', 'on', 'did', 'can', 'themselves', 'itself', 'then', 'it', 'yourself', 'between', "haven't", 'he', 'than', 'this', 'i

In [228]:
#filter stop words from every item in desc, see which words are most common; build up new stop words list 
filtered_desc = data['desc'].apply(lambda line: [token for token in word_tokenize(line) if token not in stop_words])

In [229]:
filtered_desc

394     [three, bedroom, apartment, exceptional, high,...
395     [newly, furnished, three, bedroom, three, bath...
396     [unique, apartment, comprises, double, bedroom...
397     [modern, three, bedroom, lateral, apartment, s...
398     [modern, newly, refurbished, bedroom, bathroom...
                              ...                        
8163    [family, house, heart, kensington, equidistant...
8164    [truly, stunning, unique, four, bedroom, apart...
8165    [immaculately, designed, fastidious, attention...
8166    [stylishly, designed, new, build, house, quiet...
8167    [sgort, let, spacious, bedroom, flat, located,...
Name: desc, Length: 7774, dtype: object

In [230]:
filtered_desc_words = [w for sent in filtered_desc for w in sent]

In [231]:
filtered_desc_words = pd.DataFrame(filtered_desc_words, columns = ['word'])

In [232]:
word_count = filtered_desc_words.value_counts()

In [233]:
count_df = pd.DataFrame(word_count)
count_df = count_df.reset_index().rename(columns={'index':'words', 0:'freq'})
count_df

Unnamed: 0,word,freq
0,apartment,5451
1,bedroom,4511
2,floor,2574
3,two,2052
4,located,2045
...,...,...
4442,limestone,1
4443,linked,1
4444,linking,1
4445,livedin,1


In [234]:
fig = px.bar(count_df[0:50], 
             x='word', 
             y='freq',
            title='Frequency of the 50 most used words in desc pre-cleanup')
fig.show()

In [235]:
#set new stop words to be the 8 most frequently used words as these aren't meaningful
new_stopwords = count_df['word'][0:8].tolist()
new_stopwords

['apartment',
 'bedroom',
 'floor',
 'two',
 'located',
 'three',
 'property',
 'double']

In [236]:
count_df['word'][0:1000].to_list()

['apartment',
 'bedroom',
 'floor',
 'two',
 'located',
 'three',
 'property',
 'double',
 'spacious',
 'house',
 'situated',
 'room',
 'stunning',
 'reception',
 'available',
 'heart',
 'set',
 'within',
 'refurbished',
 'building',
 'bathroom',
 'bedrooms',
 'large',
 'modern',
 'development',
 'comprises',
 'kitchen',
 'private',
 'sq',
 'let',
 'street',
 'furnished',
 'designed',
 'beautiful',
 'ft',
 'living',
 'flat',
 'high',
 'short',
 'one',
 'floors',
 'park',
 'block',
 'four',
 'space',
 'newly',
 'bathrooms',
 'interior',
 'offers',
 'views',
 'lift',
 'fully',
 'period',
 'location',
 'presented',
 'bills',
 'well',
 'open',
 'st',
 'luxury',
 'benefits',
 'home',
 'new',
 'penthouse',
 'bright',
 'throughout',
 'garden',
 'beautifully',
 'first',
 'family',
 'th',
 'prestigious',
 'rent',
 'ground',
 'plan',
 'wood',
 'standard',
 'close',
 'square',
 'road',
 'offering',
 'offer',
 'accommodation',
 'access',
 'recently',
 'area',
 'portered',
 'mayfair',
 'built',
 'f

In [237]:
#remove other related stop words (and through discovery some others)
new_stopwords.extend([
    'a',
    'aa',
    'ao',
    'ai',
    'aq',
    'gia',
    'area',
    'apartment' #see property_type feature,
    'property',
    'flat',
    'floor',
    'throughout',
    'located',
    'situated',
    'set',
    'within',
    'st',
    'th',
    'double',
    'reception',
    'available',
    'booking',
    'let',
    'rent',
    'air',
    'w',
    'lon',
    'within',
    'development',
    'comprises',
    'street',
    'road',
    'bedroom',
    'bathroom',
    'living',
    'room',
    'm2',
    'm²',
    'ft',
    'sq',
    'sqm',
    'meter',
    'feet',
    'foot',
    'square',
    'interior',
    'exceptional',
    'unique',
    'stylish',
    'luxurious',
    'desirable',
    'stunning'
])

In [238]:
#adding new stop words
all_stopwords = stop_words.union(new_stopwords)

In [239]:
#copying the code above to update the word-frequency
filtered_desc = data['desc'].apply(lambda line: [token for token in word_tokenize(line) if token not in all_stopwords])
filtered_desc_words = [w for sent in filtered_desc for w in sent]
filtered_desc_words_df = pd.DataFrame(filtered_desc_words, columns=['word'])
filtered_desc_words_df
filtered_word_count = filtered_desc_words_df.value_counts()
count_df = pd.DataFrame(filtered_word_count).reset_index(level=0)
count_df.columns=['word', 'freq']
count_df

Unnamed: 0,word,freq
0,spacious,1429
1,house,1387
2,heart,1182
3,refurbished,1100
4,building,1092
...,...,...
4392,linked,1
4393,linking,1
4394,livedin,1
4395,livingbrook,1


In [240]:
fig = px.bar(count_df[0:50], 
             x='word', 
             y='freq',
            title='Frequency of the 50 most used words in desc post-cleanup')
fig.show()

In [241]:
corpus = data['desc']

In [242]:
#tokenising
tokenizer = WhitespaceTokenizer()

In [243]:
#lemmatising
lemmatizer = WordNetLemmatizer()

def lemmatize_df(df):
    return [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(df)]

In [244]:
corpus_lemmatized = corpus.apply(lemmatize_df)

In [245]:
corpus_lemmatized = pd.DataFrame(corpus_lemmatized, columns=['desc'])

In [246]:
corpus_lemmatized['desc']

394     [three, bedroom, apartment, in, an, exceptiona...
395     [this, newly, furnished, three, bedroom, three...
396     [this, unique, apartment, comprises, double, b...
397     [this, modern, three, bedroom, lateral, apartm...
398     [a, modern, newly, refurbished, bedroom, bathr...
                              ...                        
8163    [family, house, in, the, heart, of, kensington...
8164    [a, truly, stunning, and, unique, four, bedroo...
8165    [immaculately, designed, with, fastidious, att...
8166    [stylishly, designed, new, build, house, in, a...
8167    [sgort, let, only, this, spacious, bedroom, fl...
Name: desc, Length: 7774, dtype: object

In [247]:
corpus = corpus_lemmatized.desc.apply(' '.join)

In [248]:
#only want stop words from this model
vectorizer_0 = TfidfVectorizer(stop_words=all_stopwords)

In [249]:
print(vectorizer_0.get_stop_words())

frozenset({"you're", 'shouldn', "hasn't", "aren't", 'area', 'do', 'feet', "she's", "you've", 'these', 'again', 'sqm', "you'll", "shan't", 'were', 'own', 'into', 'few', 'wasn', 's', 're', 'w', 'm', 'a', 'aren', 'both', 'down', 'didn', 'their', 'haven', 'st', 'unique', 'are', 'bedroom', 'there', 'or', "that'll", 'i', 'with', "don't", 'when', 'be', 'his', "doesn't", 'once', 'weren', 'meter', 'off', 'flat', 'any', 'gia', 'has', 'stunning', 'will', 'whom', 'herself', 'ai', 'don', 'all', 'its', 'my', 'some', 'should', 'yours', 'each', 'am', 'square', 'theirs', "you'd", 'lon', 'myself', 'below', 'ft', "it's", 'too', 'reception', "weren't", 'what', 'from', 'very', 'property', 'interior', 'if', 'further', 'most', 'ourselves', 'such', 'o', 'two', 'exceptional', 'but', 'in', "won't", 'an', 've', 'which', 'him', 'before', 'those', 'm²', 'available', 'having', 'doesn', 'why', 'nor', 'been', "needn't", 'booking', 'only', 'we', 'being', 'needn', 'about', 'had', 'hadn', 'as', 'hasn', 'while', 'them', 

In [250]:
tfidfVectorizer_stopwords = vectorizer_0.get_stop_words()

In [251]:
#merge stopwords from all_stopwords and TfidfVectorizer
all_stopwords = all_stopwords.union(tfidfVectorizer_stopwords)

In [252]:
vectorizer = TfidfVectorizer(stop_words=all_stopwords, min_df=0.02, max_df=.80)
#n_grams=(1,1)
X = vectorizer.fit_transform(corpus)

In [253]:
X.shape

(7774, 161)

In [254]:
X[0].T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [255]:
#getting top 20 'most important' words
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tokens_above_threshold = df.max()[df.max() > 0.3].sort_values(ascending=False)
tokens_above_threshold[0:20]

market        1.000000
terrace       1.000000
garden        1.000000
impressive    1.000000
kitchen       1.000000
balcony       1.000000
short         1.000000
spacious      1.000000
large         1.000000
penthouse     1.000000
high          1.000000
paddington    1.000000
hour          1.000000
modern        1.000000
one           1.000000
benefit       1.000000
viewing       1.000000
walk          0.937745
park          0.929689
ground        0.909148
dtype: float64

In [256]:
#each row in X (the tf–idf score matrix of all words) is a list of the individual scores of all the 
#(non-filtered) words in 'desc' or the corpus. 

#getting the total tf–idf score of each desc
total_tf_idf = X.sum(axis = 1)

In [257]:
total_tf_idf = np.array(total_tf_idf).reshape(-1,).tolist()
total_tf_idf

[2.223047678567487,
 2.4201891942155496,
 2.7966185858435697,
 1.974812615968688,
 2.5573393734479386,
 2.5573393734479386,
 2.977529483213311,
 2.974670724115237,
 3.1188295011896865,
 3.050972516031781,
 2.7902792284121687,
 1.9704119087816612,
 2.603569602900861,
 2.436859490613182,
 3.990996164653599,
 2.61298426659726,
 3.4205541202472385,
 3.990996164653599,
 3.990996164653599,
 3.990996164653599,
 2.398418827864278,
 2.578924554757175,
 3.990996164653599,
 1.4121747246001837,
 2.4247063534510063,
 3.634356143582818,
 2.373758618463174,
 2.4025628315602026,
 2.9578285715386814,
 3.1188295011896865,
 3.050972516031781,
 2.4146051628204157,
 2.9737717421242382,
 3.403519526195138,
 2.7902792284121687,
 2.4338586475071553,
 2.2366389625932572,
 2.8107589411444636,
 2.982951312413804,
 2.441028596706135,
 2.8114177622108136,
 3.0768152334011147,
 2.8038490741309037,
 2.4247063534510063,
 2.217329671699438,
 3.094738653390795,
 3.1407559368129747,
 3.2993184219135707,
 3.9909961646535

In [258]:
total_tf_idf = pd.DataFrame(total_tf_idf, columns=['total_tf_idf_score'])

In [259]:
total_tf_idf

Unnamed: 0,total_tf_idf_score
0,2.223048
1,2.420189
2,2.796619
3,1.974813
4,2.557339
...,...
7769,2.786698
7770,2.439474
7771,3.120883
7772,3.260595


In [260]:
data.dropna(axis=0, inplace=True)

In [261]:
data['total_tf_idf_score'] = total_tf_idf
data

Unnamed: 0,price,living_rooms,bedrooms,bathrooms,town_city,outward_code,property_type,desc,total_tf_idf_score
394,25567.0,2.0,3.0,3.0,London,W1K,flat,three bedroom apartment in an exceptional high...,2.469933
395,25567.0,1.0,3.0,3.0,London,W1K,flat,this newly furnished three bedroom three bathr...,2.597442
396,25567.0,1.0,3.0,3.0,London,W1K,flat,this unique apartment comprises double bedroo...,1.980103
397,25567.0,1.0,3.0,3.0,London,W1K,flat,this modern three bedroom lateral apartment is...,3.126329
398,25567.0,1.0,3.0,3.0,London,W1K,flat,a modern newly refurbished bedroom bathroom ...,3.113628
...,...,...,...,...,...,...,...,...,...
8163,19500.0,4.0,7.0,6.0,London,W8,town house,family house in the heart of kensington equidi...,
8164,19500.0,1.0,4.0,4.0,London,SW3,maisonette,a truly stunning and unique four bedroom apart...,
8165,19500.0,4.0,8.0,6.0,London,W10,detached house,immaculately designed with fastidious attentio...,
8166,19500.0,1.0,3.0,3.0,London,W11,property,stylishly designed new build house in a very q...,


In [262]:
data.corr()

Unnamed: 0,price,living_rooms,bedrooms,bathrooms,total_tf_idf_score
price,1.0,0.286323,0.163791,0.309956,0.035842
living_rooms,0.286323,1.0,0.459421,0.429203,0.022013
bedrooms,0.163791,0.459421,1.0,0.701538,0.015413
bathrooms,0.309956,0.429203,0.701538,1.0,0.015343
total_tf_idf_score,0.035842,0.022013,0.015413,0.015343,1.0


In [263]:
#unfortunately the summed tf-idf scores for each row show little evidence of being correlated with any of the other features or the target