# Imports

In [12]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

## uncomment and download if this is your first 
## time running 
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

## sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## specify to print all output in a call
## and not just first
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [13]:
## spacy --- if you get an error at the load step
## need to download en_core_web_sm (google)
import spacy
sp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Load data 

##### if working from within the repo, can use this relative path
path_todata = "../public_data/airbnb_text/airbnb_text.csv"

## load data
ab = pd.read_csv(path_todata)
ab.head()
ab.info()

# Text mining

## Manual approach 1: look for a single word

In [21]:
## using the `name_upper` var, look at where reviews mention cozy
ab['is_cozy'] = np.where(ab.name_upper.str.contains("COZY"), True, False)

## find the mean price by neighborhood and whether cozy
mp = pd.DataFrame(ab.groupby(['is_cozy', 'neighbourhood_group'])['price'].mean())

## reshape to wide format so that each borough is row
## and one col is the mean price for listings that describe
## the place as cozy; other col is mean price for listings
## without that word
mp_wide = pd.pivot_table(mp, index = ['neighbourhood_group'],
                        columns = ['is_cozy'])

mp_wide.columns = ['no_mention_cozy', 'mention_cozy']

mp_wide

Unnamed: 0_level_0,no_mention_cozy,mention_cozy
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,89.231088,74.214286
Brooklyn,128.175441,91.130224
Manhattan,204.109775,129.91714
Queens,102.596682,80.344388
Staten Island,120.650307,74.319149


In [20]:
ab

Unnamed: 0,id,name,name_upper,neighbourhood_group,price,is_cozy
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149,False
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225,False
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150,False
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89,True
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80,False
...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,CHARMING ONE BEDROOM - NEWLY RENOVATED ROWHOUSE,Brooklyn,70,False
48891,36485057,Affordable room in Bushwick/East Williamsburg,AFFORDABLE ROOM IN BUSHWICK/EAST WILLIAMSBURG,Brooklyn,40,False
48892,36485431,Sunny Studio at Historical Neighborhood,SUNNY STUDIO AT HISTORICAL NEIGHBORHOOD,Manhattan,115,False
48893,36485609,43rd St. Time Square-cozy single bed,43RD ST. TIME SQUARE-COZY SINGLE BED,Manhattan,55,True


## Manual approach 2: score based on dictionary of words

In [22]:
## construct dictionary
space_indicators = {'small': ['COZY', 'COMFY', 'LITTLE', 'SMALL'],
                   'large': ['SPACIOUS', 'LARGE', 'HUGE', 'GIANT']}


## for each listing, find the number of occurrences
## of words in each key

### first, let's test with one listing
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"

### splitting that string at space and looking at overlap with each key
### first, look at overlap with the list containing words for small
words_overlap_small = [word 
                    for word in practice_listing.split(" ") if 
                      word in space_indicators['small']]
words_overlap_small



['COZY', 'LITTLE']

In [23]:
### then, look at overlap with the list containing words for large
words_overlap_large = [word for word in practice_listing.split(" ") if 
                      word in space_indicators['large']]
words_overlap_large

### could then take length as a fraction of all words
len(words_overlap_small)/len(practice_listing.split(" "))
len(words_overlap_large)/len(practice_listing.split(" "))


[]

0.3333333333333333

0.0

## Part of speech tagging

In [29]:
## specify example
example_for_tag = "This is a chill apt next to the subway in LES Chinatown"
example_for_tag

'This is a chill apt next to the subway in LES Chinatown'

In [32]:
## try part of speech tagging using nltk
tokens = word_tokenize(example_for_tag) # Generate list of tokens, break apart the sentence into a list of tokens
tokens_pos = pos_tag(tokens) # generate part of speech tags for those tokens
 
## returns a list of tuples
## first element in tuple is a word
## second element in tuple is the part of speech
#for one_tok in tokens_pos:
 #   print(one_tok)
tokens_pos

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\cabal/nltk_data'
    - 'C:\\Users\\cabal\\anaconda3\\nltk_data'
    - 'C:\\Users\\cabal\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\cabal\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\cabal\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [33]:
## use list iteration to extract proper nouns (NNP)
## i'm first checking if the second element in the tuple
## is equal to NNP
## if so, i'm returning the first element in the tuple (the 
## actual word)
all_prop_noun = [one_tok[0] for one_tok in tokens_pos 
                if one_tok[1] == "NNP"]
all_prop_noun

all_adj_noun = [one_tok[0] for one_tok in tokens_pos 
                if one_tok[1] == "JJ" or 
               one_tok[1] == "NN"]
all_adj_noun

NameError: name 'tokens_pos' is not defined

## Named Entity Recognition

In [49]:
## modified from a real tweet

## tweet
d_tweet = """We’ll be hosting on-campus COVID-19 booster clinics 
at Dartmouth College in New Hampshire from 9 a.m. to 6 p.m. on
Monday, Jan. 10, and Tuesday, Jan. 11, at
Alumni Hall in the Hopkins Center. For information on how to
register and additional winter updates, head to
"""


d_tweet_2 = """We’ll be hosting on-campus Pfizer COVID-19 booster clinics 
at Dartmouth College in NH from 9 a.m. to 6 p.m. on
Monday, Jan. 10, and Tuesday, Jan. 11, at
Alumni Hall in the Hopkins Center. For information on how to
register and additional winter updates, head to
"""


In [50]:
spacy_dtweet = nlp(d_tweet)
print(type(spacy_dtweet))

spacy_dtweet_2 = nlp(d_tweet_2)


<class 'spacy.tokens.doc.Doc'>


In [51]:
## try a couple variations
for one_tok in spacy_dtweet.ents:
    print("Entity: " + one_tok.text + "; NER tag: " + one_tok.label_)

Entity: COVID-19; NER tag: ORG
Entity: Dartmouth College; NER tag: ORG
Entity: New Hampshire; NER tag: GPE
Entity: 9 a.m. to 6 p.m.; NER tag: TIME
Entity: Monday, Jan. 10; NER tag: DATE
Entity: Tuesday, Jan. 11; NER tag: DATE
Entity: Alumni Hall; NER tag: FAC
Entity: the Hopkins Center; NER tag: FAC


### Challenge 1

Play around with different variations of the Dartmouth tweet and look at the results. For instance, try the following:

- What happens if you abbreviate New Hampshire to NH?
  
    *It will consider it to be an organization rather than a location*
- What happens if you add the word Pfizer before COVID-19?

    *It will consider it still as an organization*
  
- What entities seem misclassified?

  *The only entity I think is misclassified is covid 19, it seems to think it is an organization but in reality its a vacicine.* 

In [64]:
## try a couple variations
for one_tok in spacy_dtweet_2.ents:
    print("Entity: " + one_tok.text + "; NER tag: " + one_tok.label_ )

Entity: Pfizer COVID-19; NER tag: ORG
Entity: Dartmouth College; NER tag: ORG
Entity: NH; NER tag: ORG
Entity: 9 a.m. to 6 p.m.; NER tag: TIME
Entity: Monday, Jan. 10; NER tag: DATE
Entity: Tuesday, Jan. 11; NER tag: DATE
Entity: Alumni Hall; NER tag: FAC
Entity: the Hopkins Center; NER tag: FAC


### Challenge 2

How do we generalize from doing Named Entity Recognition one just one string to doing NER on a whole column that contains strings? By defining and executing a **function**, of course!

Using the following sample of strings from airbnb listing names, define a function that takes in one airbnb string and does the following:

- iterates over entities in that string (list comprehension would work well for this)
- checks if each label indicates a place
- returns the text of each original place/entity

Then execute the function on the example strings through iteration (again, list comprehension works well here).

In [81]:
## for runtime purposes, take a sample of the airbnb listing names
ab_name_examples = ab.name[10:15]
ab_name_examples

10                    Beautiful 1br on Upper West Side
11                     Central Manhattan/near Broadway
12      Lovely Room 1, Garden, Best Area, Legal rental
13    Wonderful Guest Bedroom in Manhattan for SINGLES
14                       West Village Nest - Superhost
Name: name, dtype: object

In [133]:
# your function code here

def place_finder(string):
    doc = nlp(string)
    pos = [entity.text for entity in doc.ents 
           if entity.label_ == "FAC" or entity.label_ == "GPE" 
           or entity.label_ == "LOC"]
    if len(pos) > 0:
        return pos[0]
    else:
        return "No Place"
   
[place_finder(place) for place in ab_name_examples]


['Upper West Side', 'Broadway', 'Garden', 'Manhattan', 'West Village']

## Sentiment analysis

### Using the default scorer on a few example phrases

In [148]:
## initialize a scorer wich will take negative words and positve words and then give us a grade of the positivty scale
sent_obj = SentimentIntensityAnalyzer()
print(type(sent_obj))
# ## score one listing
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"

sentiment_example = sent_obj.polarity_scores(practice_listing) # Polarity scale seems to be a scale of negative, neautral and positve words and gets a compound
sentiment_example

<class 'vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer'>


{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'compound': 0.4215}

In [139]:
## adding phrase with word terrible and score
practice_listing_2 = "NICE AND COZY LITTLE APT AVAILABLE. REALLY TERRIBLE VIEW."
sentiment_example_2 = sent_obj.polarity_scores(practice_listing_2)

# Score will be between -1 to 1 in compound. Good for looking at the tone of things
sentiment_example_2


{'neg': 0.257, 'neu': 0.531, 'pos': 0.212, 'compound': -0.1513}

In [140]:
## adding phrase about rats; bad but might not be in scoring dictionary
practice_listing_3 = "NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH."
sentiment_example_3 = sent_obj.polarity_scores(practice_listing_3)
sentiment_example_3

{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.4215}

In [None]:
## summarize all 3
print("String: " + practice_listing + " scored as:\n" + str(sentiment_example))
print("String: " + practice_listing_2 + " scored as:\n" + str(sentiment_example_2))
print("String: " + practice_listing_3 + " scored as:\n" + str(sentiment_example_3))


### Updating the dictionary with manually-added words

In [150]:
print(type(sent_obj.lexicon))

<class 'dict'>


In [151]:
## lexicon is a dictionary where the key
## is the word
## the value is the score (negative = negative)
## here, i'm benchmarking the negativity of the
## rodents to the negativity of the word aversion
sent_obj.lexicon['aversion']

-1.9

In [153]:
## create a dictionary with 
## negative scores for pests
pest_words = {
    'rat': -1.9,
    'rats': -1.9,
    'mice': -1.9,
    'mouse': -1.9,
    'roach': -1.9,
    'cockroach': -1.9
}


## initiate new sentiment object
## so that we don't alter old one
## use.update to add new words
new_si = SentimentIntensityAnalyzer()
new_si.lexicon.update(pest_words)

## try re-scoring the third example
## see negative
print("After lexicon update: " + practice_listing_3 + " scored as:\n" + \
      str(new_si.polarity_scores(practice_listing_3)))

After lexicon update: NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH. scored as:
{'neg': 0.228, 'neu': 0.551, 'pos': 0.22, 'compound': -0.0258}
