# Part 3 - Text analysis and ethics




Computing PMI

In this assessment you are tasked to discover strong associations between concepts in Airbnb reviews. The starter code we provide in this notebook is for orientation only. The below imports are enough to implement a valid answer.

### Imports, data loading and helper functions

In [1]:
import pandas as pd
from nltk.tag import pos_tag
import re
from collections import defaultdict,Counter
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from tqdm import tqdm
import numpy as np
import os
tqdm.pandas()

from nltk.corpus import stopwords
import string
from unicodedata import category

In [None]:
# nltk imports, note that these outputs may be different if you are using colab or local jupyter notebooks
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

In [3]:
# load stopwords
sw = set(stopwords.words('english'))

In [4]:
# if you use Google Colab:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
p = '/content/drive/MyDrive/coursework'
df = pd.read_csv(os.path.join(p,'reviews.csv'))
# deal with empty reviews
df.comments = df.comments.fillna('')

In [7]:
df.head(2)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...


In [8]:
df.shape

(452143, 6)

### STEP1 - Process reviews

In [9]:
def process_reviews(df):
  """
  function perfroms tokenizaion, tagging and lower_tagging on the comments 
  
  args:df: reviews dataset
  
  returns: new data frame with three new columns tokenized,tagged and lower_tagged

  """
  
  df['tokenized']=df['comments'].apply(word_tokenize)
  df['tagged']= df['tokenized'].apply(pos_tag)
  df['lower_tagged']=df['tokenized'].apply(lambda x: [word.lower() for word in x]).apply(pos_tag)
  return df

In [10]:
#calling the function 
df = process_reviews(df)
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,tokenized,tagged,lower_tagged
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...,"[Daniel, is, really, cool, ., The, place, was,...","[(Daniel, NNP), (is, VBZ), (really, RB), (cool...","[(daniel, NN), (is, VBZ), (really, RB), (cool,..."
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...,"[Daniel, is, the, most, amazing, host, !, His,...","[(Daniel, NNP), (is, VBZ), (the, DT), (most, R...","[(daniel, NN), (is, VBZ), (the, DT), (most, RB..."
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...,"[We, had, such, a, great, time, in, Amsterdam,...","[(We, PRP), (had, VBD), (such, JJ), (a, DT), (...","[(we, PRP), (had, VBD), (such, JJ), (a, DT), (..."
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...,"[Very, professional, operation, ., Room, is, v...","[(Very, RB), (professional, JJ), (operation, N...","[(very, RB), (professional, JJ), (operation, N..."
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...,"[Daniel, is, highly, recommended, ., He, provi...","[(Daniel, NNP), (is, VBZ), (highly, RB), (reco...","[(daniel, NN), (is, VBZ), (highly, RB), (recom..."


### STEP2- Create a vocabulary

In [11]:
def get_vocab(df):
 """
 Creates the vocabulary of center and context words 
  args:
      reviews dataset 
  Returns: 
       Two lists cont_vocab, cent_vocab -which contains most common nouns,verbs or adjective 
 """ 
 lower=[]
 for row in df['lower_tagged']:                    #Iterating rows in the lower_tagged column
   for word,pos in row:
      tagged_word = (word, pos)
      lower.append(tagged_word)
 center=[word for(word,pos) in lower if pos[:1]=='N']  # select words that are nouns 
 context=[word for(word,pos) in lower if pos[:1]=='V' or pos[:1]=='J'] #select words that verbs/adjectives
 freq_noun=Counter(center)
 center_words=freq_noun.most_common(1000)            # select most common nouns 
 freq_verb=Counter(context)
 context_words=freq_verb.most_common(1000)           # select most common verbs/adjectives 
 cent_vocab=[word for (word,val) in center_words]
 cont_vocab=[word for (word,val) in context_words]
 return cent_vocab, cont_vocab


In [21]:
cent_vocab, cont_vocab= get_vocab(df)
get_vocab(df)

(['place',
  'apartment',
  'location',
  'amsterdam',
  'i',
  'host',
  'stay',
  'everything',
  'city',
  'room',
  'time',
  'house',
  'area',
  'home',
  'center',
  'très',
  'restaurants',
  '’',
  'tram',
  'centre',
  'station',
  'minutes',
  'walk',
  'bed',
  'neighborhood',
  'space',
  'experience',
  'thanks',
  'hosts',
  'à',
  'thank',
  'bien',
  'la',
  'communication',
  'day',
  'distance',
  'kind',
  'y',
  'bathroom',
  'days',
  'kitchen',
  'trip',
  'et',
  'lot',
  'night',
  'breakfast',
  'sehr',
  'arrival',
  'e',
  'people',
  'airbnb',
  'places',
  'tips',
  'bus',
  'street',
  'lots',
  'boat',
  'ist',
  's',
  'der',
  'appartement',
  'min',
  'coffee',
  'shops',
  'muy',
  'war',
  'stairs',
  'view',
  'bit',
  'transport',
  'que',
  'minute',
  'bars',
  'die',
  'check',
  'neighbourhood',
  'questions',
  'way',
  'family',
  'studio',
  'anyone',
  'things',
  'es',
  'access',
  'dans',
  'mit',
  'man',
  't',
  'le',
  'bike',
  'we

### STEP 3 Count co-occurrences between center and context words

In [22]:
def window_cooc(lst, coocs, cent_vocab, cont_vocab, window_size=5):
  """
  Creates a dictionary of dictionaries from center and context words 

  args:
      lst:List of tokens 
      coocs: co-occurrence matrix of centre and context words
      cent_vocab: list of center words
      cont_vocab: list of context words
      windows_size: size of the windows to search co-occurence
  Returns: 
      coocs:Count of co-occurence of centre and context words 
  """
  length = len(lst)
  # iterate through tokens in the lst
  for i, word in enumerate(lst):
    if word in cent_vocab:
      # when we found a center word, iterate through neighbours in radius 
      # of windows size (for example 5 words before and 5 words after)
      for j in range(max(i - window_size, 0), min(i + window_size, length)):
        # find context words in the window ignoring the center word
        if lst[j] in cont_vocab and j != i:
          # add a co-occurence case in the coocs dictionary
          coocs[word][lst[j]] = coocs[word].get(lst[j], 0) + 1


def get_coocs(df, cent_vocab, cont_vocab):

  coocs = {}
  # create an empty dictionary structure for a co-occurence matrix
  for word in cent_vocab:
    coocs[word] = {}
    for cont in cont_vocab:
      coocs[word][cont] = 0
  # iterate through the 'tokenized' column
  for row in df.tokenized:
    # make a list of lowered tokens
    lst = [word.lower() for word in row]
    # process the row and add all co-occurence case into the coocs dictionary
    window_cooc(row, coocs, cent_vocab, cont_vocab)
  return coocs  

In [25]:
coocs = get_coocs(df, cent_vocab, cont_vocab)
list(coocs)[:2]

['place', 'apartment']

### STEP 4 Convert co-occurrence dictionary to 1000x1000 dataframe


In [26]:
def cooc_dict2df(coocs):
  """
     Function takes co-occurrence matrix from previous function and converts to a dataframe
     
     Args:
         coocs: co-occurrence matrix
     Returns:
          1000*1000 dataframe with rows and columns as centre,contex words 
  """
  coocdf = pd.DataFrame(coocs).fillna(0)
  return coocdf

In [27]:
coocdf = cooc_dict2df(coocs)
coocdf.head()

Unnamed: 0,place,apartment,location,amsterdam,i,host,stay,everything,city,room,...,peace,все,petits,heartbeat,découvrir,edwin,quaint,walls,tub,perfecto
was,30764,35994,21164,376,872,37770,20085,13312,5990,17858,...,38,0,0,24,0,0,275,57,139,0
is,44540,51985,36565,568,474,30776,9827,9839,10461,18270,...,65,0,0,14,0,4,379,79,136,2
great,19706,14837,29575,256,222,23092,17109,2930,2968,3669,...,15,0,0,4,0,0,45,16,68,0
nice,11634,10879,6376,173,191,7333,7040,1794,1372,5328,...,10,0,0,0,0,2,39,11,57,0
had,1969,3099,1107,192,471,2094,17365,8026,918,1916,...,13,0,0,2,0,0,22,12,18,0


In [28]:
coocdf.shape

(1000, 1000)

### STEP 5 Raw co-occurrences to PMI scores

What to implement: A function `cooc2pmi(df)` that takes as input the DataFrame generated in step 4, and returns a new DataFrame with the same rows and columns, but with PMI scores instead of raw co-occurrence counts. 

In [29]:
def cooc2pmi(df):
  """
  args: 
     df: dataframe of co-occurrence dictionary from previous stage 
  return: dataframe 1000x1000 of center and context words with their
    pointwise mutual information 
  """
  # make an empty dataframe with the same structure as df
  pmidf = pd.DataFrame().reindex_like(df)
  # a sum of all rows and columns
  n = coocdf.sum().sum()
  # series with sums of rows and columns
  x_sum = coocdf.sum()
  y_sum = coocdf.sum(axis=1)
  # iterate through column names (center words) and row indexes (context words)
  for col in df.columns:
    for row in df.index:
      # calculate PMI
      p_i = x_sum[col] / n
      p_j = y_sum[row] / n
      p_ij = df[col][row]
      # to avoid division by zero or getting minus infinity, we will consider
      # pmi in such cases as -1
      if p_i == 0 or p_j == 0 or p_ij == 0:
        pmidf[col][row] = -1
      else:
        pmidf[col][row] = np.log10(p_ij / (p_i * p_j))
  return pmidf

In [30]:
pmidf = cooc2pmi(coocdf)
pmidf.head()

Unnamed: 0,place,apartment,location,amsterdam,i,host,stay,everything,city,room,...,peace,все,petits,heartbeat,découvrir,edwin,quaint,walls,tub,perfecto
was,7.380041,7.466215,7.345248,6.988108,7.123727,7.669588,7.299826,7.393922,7.058714,7.527057,...,6.831484,-1.0,-1.0,6.859608,-1.0,-1.0,7.460358,7.011211,7.365401,-1.0
is,7.499662,7.584776,7.541629,7.126182,6.817902,7.539567,6.948288,7.221543,7.259773,7.495876,...,7.023527,-1.0,-1.0,6.584438,-1.0,7.482965,7.558577,7.111877,7.314838,5.517568
great,7.569792,7.464526,7.87377,7.204355,6.912759,7.839098,7.613374,7.119742,7.136946,7.222967,...,6.810987,-1.0,-1.0,6.464652,-1.0,-1.0,7.057433,6.842652,7.438091,-1.0
nice,7.551398,7.540245,7.417869,7.244637,7.057915,7.551394,7.438197,7.117172,7.012312,7.595459,...,6.845371,-1.0,-1.0,-1.0,-1.0,7.816692,7.205761,6.8904,7.571932,-1.0
had,6.871689,7.086652,6.749243,7.381667,7.541677,7.098863,7.922074,7.859623,6.929575,7.243065,...,7.051089,-1.0,-1.0,6.465872,-1.0,-1.0,7.048893,7.019963,7.163104,-1.0


In [31]:
pmidf.shape

(1000, 1000)

### 3.a6 Retrieve top-k context words, given a center word

What to implement: A function `topk(df, center_word, N=10)` that takes as input: (1) the DataFrame generated in step 5, (2) a `center_word` (a string like `‘towels’`), and (3) an optional named argument called `N` with default value of 10; and returns a list of `N` strings, in order of their PMI score with the `center_word`. You do not need to handle cases for which the word `center_word` is not found in `df`. 

In [None]:
def topk(df, center_word, N=10):
  """Get top N context words for a centerword from the PMI dataframe"""
  top_words = df[center_word].sort_values(ascending=False).index[:N].to_list()
  return top_words

In [None]:
topk(pmidf, 'coffee')

['nespresso',
 'complimentary',
 'supplied',
 'drink',
 'delicious',
 'including',
 'stocked',
 'fresh',
 'breakfast',
 'tasty']