In [1]:
import pandas
import csv
from pandas.core.frame import DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import string



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livialilli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/livialilli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Step 1

In [3]:
rentals_file = pandas.read_csv(("files/Airbnb_Texas_Rentals.csv"), sep =",", delimiter= None, header = "infer", names = None, index_col = None,
                       encoding="ISO-8859-1")

In [4]:
rentals_file.head(3)

Unnamed: 0.1,Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,1,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,2,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW â...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,3,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


# Step 2

In [5]:
#creating tsv files
i = 0
for r in range(len(rentals_file)):
    record = rentals_file.loc[[r]]
    name = "doc_" + str(i) +".tsv"
    record.to_csv(path_or_buf = name, sep='\t')
    
    i += 1
    

KeyboardInterrupt: 

In [5]:
#function to read the tsv file
def read(file_name):
    read_file = pandas.read_csv(file_name, sep ="\t", delimiter= None, header = "infer", names = None, index_col = None, usecols = None,
                       encoding="ISO-8859-1")
    return read_file
    

In [6]:
#for now I just consider one tsv file
d = read("files/doc_0.tsv")

In [7]:
d

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,0,1,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...


In [8]:
d["description"]

0    Welcome to stay in private room with queen bed...
Name: description, dtype: object

In [9]:
s = d["description"][0]

In [10]:
s

'Welcome to stay in private room with queen bed and detached private bathroom on the second floor. Another private bedroom with sofa bed is available for additional guests. 10$ for an additional guest.\\n10min from IAH airport\\nAirport pick-up/drop off is available for $10/trip.'

In [11]:
col_names = ["average_rate_per_night","bedrooms_count", "city", "date_of_listing", "description", "latitude", "longitude","title","url"]

# Step 3

## Preprocessing

In [12]:
def stopWords(column_file):
    
    stopWords = set(stopwords.words('english'))
    words = word_tokenize(str(column_file))
    wordsFiltered = []

    for w in words:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered
    

In [13]:
def punctuation(list):
    exclude = set(string.punctuation)
    for el in list:
        if el in exclude:
            list.remove(el)
    return list

In [14]:
def stemming(list):
    ps = PorterStemmer()
    output = []
    for word in list:
        stem_word = ps.stem(word)
        output.append(stem_word)
    return output
#it returns a list (of the current column) with all the stem-words


In [15]:
#example of application
s = stopWords(d["title"][0])
p = punctuation(s)
print(stemming(p))

['2', 'privat', 'rooms/bathroom', '10min', 'iah', 'airport']


In [16]:
#On ALL the COLUMNS
#applyng stopWords,punctuation and stemming functions for every column of the dataframe
#it returns a dictionary, where keys are the columns names, and for every key there is the list of cleaned words.
def all_col(read_file):
    dic = {}
    for name in col_names:
        s = str(read_file[name][0]).replace("\\n", " ")
        l = stopWords(s)
        l = punctuation(l)
        result = stemming(l)
        dic[name] = result
        #print(result)
    return dic

In [17]:
print(all_col(d))


{'average_rate_per_night': ['27'], 'bedrooms_count': ['2'], 'city': ['humbl'], 'date_of_listing': ['may', '2016'], 'description': ['welcom', 'stay', 'privat', 'room', 'queen', 'bed', 'detach', 'privat', 'bathroom', 'second', 'floor', 'anoth', 'privat', 'bedroom', 'sofa', 'bed', 'avail', 'addit', 'guest', '10', 'addit', 'guest', '10min', 'iah', 'airport', 'airport', 'pick-up/drop', 'avail', '10/trip'], 'latitude': ['30.0201379199512'], 'longitude': ['-95.29399600425128'], 'title': ['2', 'privat', 'rooms/bathroom', '10min', 'iah', 'airport'], 'url': ['http', '//www.airbnb.com/rooms/18520444', 'location=cleveland', '2C', '20tx']}


## 3.1 Conjunctive query

In [18]:
#we have to consider just description and title columns


def filter_keys(dictionary):
    keys = ["description", "title"]
    return {x: dictionary[x] for x in dictionary if x in keys}



### 3.1.1 Create your index

In [None]:
#in future we can move this two blocks of code in an othe file, and upload them when we need
#for now, for convention we leave them here

In [19]:
#for every tsv file, I apply the preprocess, I filter the columns that I need, and then I build the vocabulary
vocabulary = {}
for i in range(len(rentals_file)):
    name = "files/doc_" + str(i) +".tsv"
    df = read(name)
    clean_d = all_col(df)
    filtered_d = filter_keys(clean_d)
    values = filtered_d.values()
    for l in values:
        for el in l:
            if el not in vocabulary.keys():
                vocabulary[el] = [name]
            else:
                vocabulary[el].append(name)
    
    
#it has got as keys the words, as values all the docs where words are in 
    

In [20]:
keys = vocabulary.keys()
list_keys = list(keys)
file_voc = {}

for i in range(len(list_keys)):
    term = "term_id_" + str(i)
    
    file_voc[term] = list_keys[i]

#it has as keys the id terms, as values, the corrispondent words

In [21]:
file_voc_keys = list(file_voc.keys())

In [22]:
#this has as keys the term id, as values the docs where id's word is in
new = {}
for i in range(len(file_voc_keys)):
    new_key = file_voc_keys[i]
    old_key = file_voc[new_key]
    new[new_key] = vocabulary[old_key]

In [None]:
#it's the same working on vocabulary or file_voc dictionary. The words in the first are picked just one time.

In [23]:
len(vocabulary.keys())

18383

In [24]:
len(file_voc.keys())

18383

### 3.1.2 Execute the query

In [38]:
query = input()

with a garden and near to the airport


In [39]:
query = stopWords(query)

In [40]:
query = punctuation(query)

In [41]:
query = stemming(query)

In [42]:
query

['garden', 'near', 'airport']

In [43]:
docs = {}
for word in query:
    try:
        l = vocabulary[word]
        docs[word] = l
    except:
        pass

In [44]:
v = docs.values()

In [45]:
v = list(v)

In [46]:
intersection = set(v[0]).intersection(*v)

In [47]:
from IPython.display import display, HTML

for file in intersection:
    file = read(file)
    
    result = file[["title","description","city","url"]]
    display(result.style)
    #print(result.to_html())

Unnamed: 0,title,description,city,url
0,Peaceful garage apt. close in!,"Quiet street, neat neighborhood near bus/train service to downtown, hospitals, conventions, sports. Equidistant to both airports. \nView onto tranquil garden area. Full kitchen and basic cable.\n\nThe light rail is 1/2 block from the apartment, 3 minute walk. \n We are at the LINDALE PARK stop.\nSee this link for a map of the lines, present and proposed: \n",Houston,https://www.airbnb.com/rooms/5781472?location=Atascocita%2C%20TX


Unnamed: 0,title,description,city,url
0,BREATHE DEEPLY A Cozy Austin Cabin,"A stone throw south of the Austin City Limits, this cabin is situated in 2.5 wooded acres. Luxurious and quiet. It is inhabited by deer, raccoon, rabbits, owls, lizards and butterflies. There is an old stone well filled by an aquifer (legend states the well was built by Ben McCullough; the civil war hero). A large back yard is used for walks, bonfires and golfing/batting balls. Lounge in the dappled sun on the patio near a running pond surrounded with gardens. Prepare a leisurely bar-b-que. Talk late into the night by the fire pit. \n\nThis 100+ yr old cabin has been revived and furnished with love. Time seems to stop while soaking in the deep old claw tub filled with endless hot spring water . Drifting to sleep, become aware of crackling in the wood burning stove, a train whistle and the trickling pond. Breakfast choices are street tacos or waffles...with pecans? \n\nIts possible you'll be tempted to stay all day. However, the center of Austin or San Marcos, with music, swimming, fishing and kayaking are just 15-25 minutes from this retreat. \nSound like a fit for you? More info:\nCircuit of the Americas is 18 miles through country roads. It is possible to attend this event and never enter Austin. A toll road will deliver you from the airport to the cabin and the event without the Austin traffic. Also close and south is an after race restaurant popular with race participants. \n\nNote for SXSW and other major events: Consider being in the crowds day and night then retreating by the fire at your quiet abode just 15 minutes south from all the ruckus of Central Austin. Invite your friends to hang with you and perhaps make your own music. Guests under 21 welcomed. \n\n\nIncluded in price:\nheating with wood burner &/or electric heater. Host can prep the fire. Firewood complementary AC for the summer/fall plus ceiling fan\nNO TV\nwireless internet - fast\nwasher/dryer available\nclaw tub with shower\nkitchen equipped with small refrigerator, toaster oven, small microwave, coffee maker\nlarge BBQ pit available\nof course all towels, quality cotton sheets, feathered quilt, pillows, utensils, dishes, cups, coffee, teas and toiletries are ready for you cleaning when requested. \nup to 3 parking spaces\n3 fire pits to lounge by\nprivate patio\n\nExtra charges:\ntransportation: Round trip transportation to the airport or elsewhere offered by host\npublic bus stop 3 miles from the cabin, it is easy to grab a ride to the bus stop from the host- no charge.\nweekend reservations are a minimum of 2 nights. If you can only stay 1 night, please ask about an exception so that I can attempt to make it work.\n\nGuest have exclusive access to the patio. You are welcome to wander all the grounds with the exception of course, of the private residence.\nComing soon: a sauna is being converted into a cedar silo sauna.\nFormula 1 rate includes stocked local wines and beer. Toll road to F1 is less that 2 miles from the cabin and provides a direct shot to the venue, without having to deal with Austin traffic. \nWEDDING PACKAGE: Evening before and night of wedding. This facilitates hosting out of town guests the night before and the day of. (note weekends are a two night minimum. this generally is not practical for the newly weds.) However, the events prior to the big event serves as an informal staging area and a place for your guests who need a meet up, changing and rest area prior to the wedding. Before the bride and groom return from the wedding, host will tidy the cabin and light a fire and candles for their special arrival.\n\nI interact with guests as much as they want. Usually, an initial orientation to the cabin and grounds, then texting for the morning \",Manchaca,https://www.airbnb.com/rooms/728502?location=Colorado%20River%2C%20TX


Unnamed: 0,title,description,city,url
0,Master Suite Gem Dallas ~ home access,"Master Suite w/bath, walkin closet, cafe table. 3 mi.from White Rock Lake, 4 mi.from Dallas Arboretum & Botanical Gardens w/Concerts, close to Casa Linda restaurants, shops & movies, many things 2 do. Downtown (9 mi). Elite Northpark mall and Galleria mall near by: 6/8 miles away.\nAirprt: DFW/LOVE. Ask about airport rides!!\nAmenities: laundry room, kitchen & backyard oasis!!",Dallas,https://www.airbnb.com/rooms/11010416?location=Balch%20Springs%2C%20TX


Unnamed: 0,title,description,city,url
0,Master Suite Gem Dallas ~ home access,"Master Suite w/bath, walkin closet, cafe table. 3 mi.from White Rock Lake, 4 mi.from Dallas Arboretum & Botanical Gardens w/Concerts, close to Casa Linda restaurants, shops & movies, many things 2 do. Downtown (9 mi). Elite Northpark mall and Galleria mall near by: 6/8 miles away.\nAirprt: DFW/LOVE. Ask about airport rides!!\nAmenities: laundry room, kitchen & backyard oasis!!",Dallas,https://www.airbnb.com/rooms/11010416?location=Arlington%2C%20TX


Unnamed: 0,title,description,city,url
0,Spacious two bedroom apt @ 360North,"Clean & Comfortable two bedroom apt is great for your stay in Grand Prairie, Tx! Located only 20 minutes from downtown Dallas. This unit features high vaulted ceilings, new steel appliances, & more available for use by my guests. My apartment is about 15 minutes from DFW airport & The Parks Mall in Arlington. About 5 minutes away from Six Flags over Texas, AT&T stadium, Globe Life stadium (Texas Rangers) and Restaurants such as BJ's, TGI Fridays, Olive Garden etc. I don't have a private parking spot, but there is always plenty of open spaces near my apt.",Grand Prairie,https://www.airbnb.com/rooms/17655499?location=Cedar%20Hill%2C%20TX


Unnamed: 0,title,description,city,url
0,SURROUND YOURSELF WITH CHARM,"Clean & Comfortable two bedroom apt is great for your stay in Grand Prairie, TX! Located only 15 minutes from downtown Dallas. My apartment is about 10 minutes from DFW airport & The Parks Mall in Arlington. About 15 minutes away from Six Flags over Texas, AT&T stadium, Globe Life stadium (Texas Rangers) and Restaurants such as BJ's, TGI Fridays, Olive Garden etc. I don't have a private parking spot, but there is always plenty of open spaces near my apt.",Grand Prairie,https://www.airbnb.com/rooms/18807470?location=Cedar%20Hill%2C%20TX


Unnamed: 0,title,description,city,url
0,1000 sq ft 1bed 1ba near DFW airport/Six Flags,"Depending on your type of stay, I can accommodate for what you are needing. Its on the third floor with only stair access, garden tub, and upgraded appliances. Super close to Six Flags, DFW Airport, and 30mins from Downtown Dallas",Grand Prairie,https://www.airbnb.com/rooms/19214169?location=Cedar%20Hill%2C%20TX


Unnamed: 0,title,description,city,url
0,Family Home near Austin Airport,"Spacious home with wood floors, and fire place. Kitchen has full available appliances and living area has recliner, LED TV, local channels, & Wifi all over the house. Master bedroom has King size bed, double sink, separate shower and a garden tub.",Austin,https://www.airbnb.com/rooms/8191636?location=Bastrop%20County%2C%20TX


Unnamed: 0,title,description,city,url
0,BREATHE DEEPLY A Cozy Austin Cabin,"A stone throw south of the Austin City Limits, this cabin is situated in 2.5 wooded acres. Luxurious and quiet. It is inhabited by deer, raccoon, rabbits, owls, lizards and butterflies. There is an old stone well filled by an aquifer (legend states the well was built by Ben McCullough; the civil war hero). A large back yard is used for walks, bonfires and golfing/batting balls. Lounge in the dappled sun on the patio near a running pond surrounded with gardens. Prepare a leisurely bar-b-que. Talk late into the night by the fire pit. \n\nThis 100+ yr old cabin has been revived and furnished with love. Time seems to stop while soaking in the deep old claw tub filled with endless hot spring water . Drifting to sleep, become aware of crackling in the wood burning stove, a train whistle and the trickling pond. Breakfast choices are street tacos or waffles...with pecans? \n\nIts possible you'll be tempted to stay all day. However, the center of Austin or San Marcos, with music, swimming, fishing and kayaking are just 15-25 minutes from this retreat. \nSound like a fit for you? More info:\nCircuit of the Americas is 18 miles through country roads. It is possible to attend this event and never enter Austin. A toll road will deliver you from the airport to the cabin and the event without the Austin traffic. Also close and south is an after race restaurant popular with race participants. \n\nNote for SXSW and other major events: Consider being in the crowds day and night then retreating by the fire at your quiet abode just 15 minutes south from all the ruckus of Central Austin. Invite your friends to hang with you and perhaps make your own music. Guests under 21 welcomed. \n\n\nIncluded in price:\nheating with wood burner &/or electric heater. Host can prep the fire. Firewood complementary AC for the summer/fall plus ceiling fan\nNO TV\nwireless internet - fast\nwasher/dryer available\nclaw tub with shower\nkitchen equipped with small refrigerator, toaster oven, small microwave, coffee maker\nlarge BBQ pit available\nof course all towels, quality cotton sheets, feathered quilt, pillows, utensils, dishes, cups, coffee, teas and toiletries are ready for you cleaning when requested. \nup to 3 parking spaces\n3 fire pits to lounge by\nprivate patio\n\nExtra charges:\ntransportation: Round trip transportation to the airport or elsewhere offered by host\npublic bus stop 3 miles from the cabin, it is easy to grab a ride to the bus stop from the host- no charge.\nweekend reservations are a minimum of 2 nights. If you can only stay 1 night, please ask about an exception so that I can attempt to make it work.\n\nGuest have exclusive access to the patio. You are welcome to wander all the grounds with the exception of course, of the private residence.\nComing soon: a sauna is being converted into a cedar silo sauna.\nFormula 1 rate includes stocked local wines and beer. Toll road to F1 is less that 2 miles from the cabin and provides a direct shot to the venue, without having to deal with Austin traffic. \nWEDDING PACKAGE: Evening before and night of wedding. This facilitates hosting out of town guests the night before and the day of. (note weekends are a two night minimum. this generally is not practical for the newly weds.) However, the events prior to the big event serves as an informal staging area and a place for your guests who need a meet up, changing and rest area prior to the wedding. Before the bride and groom return from the wedding, host will tidy the cabin and light a fire and candles for their special arrival.\n\nI interact with guests as much as they want. Usually, an initial orientation to the cabin and grounds, then texting for the morning \",Manchaca,https://www.airbnb.com/rooms/728502?location=Buda%2C%20TX


Unnamed: 0,title,description,city,url
0,Home near Galveston Beach and other attractions,"Nice 4 Bedroom 2 Bath home in a great location close to freeways. \n5 min to Tanger Outlet Mall\n15 min from Galveston Beach/Schlitterbahn/Moody Gardens / Texas City Dike\n18 minutes to NASA\n23 min Kema Boardwalk\n35 min Houston Hobby Airport\n50 min to Houston (Minute Maid Park, NRG Stadium, Zoo and Musems\n60 min to George Bush Intercontinel Airport",Texas City,https://www.airbnb.com/rooms/18076465?location=Bayou%20Vista%2C%20TX


Unnamed: 0,title,description,city,url
0,Peaceful home near airport & downtown,Our home is filled with warmth from lots of natural light. The original wood floors and earthy decor create a peaceful environment for anyone. We have a big backyard & garden that is very kid and pet friendly as well as many toys to share in our boys room. The house is 5 minutes away from the airport as well as 10 minutes away from downtown. We are in the middle of San Antonio which is a great location for accessing all types of fun.,San Antonio,https://www.airbnb.com/rooms/19014109?location=Converse%2C%20TX


Unnamed: 0,title,description,city,url
0,1000 sq ft 1bed 1ba near DFW airport/Six Flags,"Depending on your type of stay, I can accommodate for what you are needing. Its on the third floor with only stair access, garden tub, and upgraded appliances. Super close to Six Flags, DFW Airport, and 30mins from Downtown Dallas",Grand Prairie,https://www.airbnb.com/rooms/19214169?location=Bedford%2C%20TX


Unnamed: 0,title,description,city,url
0,R & M Roadhouse,"Center point is conveniently located near: the Alamo, San Antonio River Walk; Schlitterbahn Waterpark; John Newcombe Tennis Ranch; historic Fredericksburg for local wines and the National Museum of the Pacific War; Camp Verde historic post office and general store; Coming King Sculpture prayer garden. \n\nNightlife: Live music at Gruene Hall, the oldest dance hall in Texas; Live music and dining at John T. Floore Country Store in Helotes, Tx.\n\nOne hour to San Antonio International Airport.",Center Point,https://www.airbnb.com/rooms/15966243?location=Center%20Point%2C%20TX


Unnamed: 0,title,description,city,url
0,Family Home near Austin Airport,"Spacious home with wood floors, and fire place. Kitchen has full available appliances and living area has recliner, LED TV, local channels, & Wifi all over the house. Master bedroom has King size bed, double sink, separate shower and a garden tub.",Austin,https://www.airbnb.com/rooms/8191636?location=Cedar%20Creek%2C%20TX


## 3.2 Conjunctive query & Ranking score