# Part 3 - Text analysis and ethics

# 3.a Computing PMI


In [None]:
import pandas as pd
from nltk.tag import pos_tag
import re
from collections import defaultdict,Counter
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from tqdm import tqdm
import numpy as np
import os
tqdm.pandas()

In [None]:
# nltk imports, note that these outputs may be different if you are using colab or local jupyter notebooks
from nltk import *
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
# load stopwords
sw = set(stopwords.words('english'))

In [None]:
# Directory that holds reviews.csv; with the empty string, os.path.join
# yields the bare file name, i.e. a path relative to the working directory.
p = ''
df = pd.read_csv(os.path.join(p,'reviews.csv'))
# Missing review texts are read as NaN; replace them with empty strings so
# every entry in the comments column is a string.
df.comments = df.comments.fillna('')

In [None]:
df.head()

In [None]:
df.shape

### 3.a1 - Process reviews


In [None]:
def process_reviews(df):
    """Tokenize and POS-tag every review, storing the results on ``df``.

    Three columns are added to (or overwritten on) the frame passed in:

    * ``tokenized``    - all tokens of the review, from ``word_tokenize``.
    * ``tagged``       - ``(word, tag)`` pairs whose tag begins with
      ``N``, ``J`` or ``V``.
    * ``lower_tagged`` - the same pairs with the word lower-cased.

    The input frame is modified in place and also returned.
    """
    kept_prefixes = ('N', 'J', 'V')
    token_lists = []
    kept_pairs = []
    kept_pairs_lower = []

    for review in df['comments']:
        # str() guards against non-string cells (e.g. numbers parsed by pandas).
        words = word_tokenize(str(review))
        selected = [
            (word, tag)
            for word, tag in pos_tag(words)
            if tag[0] in kept_prefixes
        ]
        token_lists.append(words)
        kept_pairs.append(selected)
        kept_pairs_lower.append([(word.lower(), tag) for word, tag in selected])

    df['tokenized'] = token_lists
    df['tagged'] = kept_pairs
    df['lower_tagged'] = kept_pairs_lower
    return df

In [None]:
df = process_reviews(df)

In [None]:
df.head()

### 3.a2 - Create a vocabulary


In [None]:
def get_vocab(df, vocab_size=1000):
    """Build the center (noun) and context (adjective/verb) vocabularies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``lower_tagged`` column whose cells are lists of
        ``(word, tag)`` pairs.
    vocab_size : int, default 1000
        Maximum number of words kept in each vocabulary.

    Returns
    -------
    (cent_vocab, cont_vocab) : tuple of list of str
        ``cent_vocab`` holds the most frequent words whose tag starts with
        ``N``; ``cont_vocab`` holds the most frequent words whose tag starts
        with ``J`` or ``V``. Both are ordered from most to least frequent.
    """
    noun_counts = Counter()
    modifier_counts = Counter()

    # Single pass over the tagged reviews; no flattened intermediate list.
    for tagged_review in df['lower_tagged']:
        for word, tag in tagged_review:
            if tag.startswith('N'):
                noun_counts[word] += 1
            elif tag.startswith(('J', 'V')):
                modifier_counts[word] += 1

    # most_common yields (word, count) pairs; only the word is kept.
    cent_vocab = [word for word, _ in noun_counts.most_common(vocab_size)]
    cont_vocab = [word for word, _ in modifier_counts.most_common(vocab_size)]
    return cent_vocab, cont_vocab

In [None]:
cent_vocab, cont_vocab = get_vocab(df)

In [None]:
# Preview entries 1-19 of the center vocabulary.
# NOTE(review): the slice starts at 1, so the first entry (index 0) is not
# displayed - confirm this is intended rather than cent_vocab[:20].
cent_vocab[1:20]

In [None]:
# Preview entries 1-19 of the context vocabulary.
# NOTE(review): the slice starts at 1, so the first entry (index 0) is not
# displayed - confirm this is intended rather than cont_vocab[:20].
cont_vocab[1:20]

### 3.a3 Count co-occurrences between center and context words


In [None]:
def get_coocs(df, cent_vocab, cont_vocab):
    """Count review-level co-occurrences of center and context words.

    For every review, each occurrence of a center word is paired with each
    occurrence of a context word in the same review.

    Tokens are lower-cased before matching, so capitalised occurrences
    (e.g. "Coffee" at the start of a sentence) are counted against a
    lower-cased vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews. If a ``tokenized`` column is present its token lists are
        reused; otherwise the ``comments`` column is tokenized with
        ``word_tokenize``.
    cent_vocab : iterable of str
        Center words (expected lower-case).
    cont_vocab : iterable of str
        Context words (expected lower-case).

    Returns
    -------
    collections.defaultdict
        Maps each center word seen in the reviews to a ``Counter`` of
        ``{context_word: count}``. Center words that occur without any
        context word map to an empty ``Counter``.
    """
    # Sets give O(1) membership tests; the vocabularies arrive as lists.
    centers = set(cent_vocab)
    contexts = set(cont_vocab)

    if 'tokenized' in df.columns:
        token_lists = df['tokenized']
    else:
        token_lists = (word_tokenize(str(text)) for text in df['comments'])

    coocs = defaultdict(Counter)
    for tokens in token_lists:
        lowered = [token.lower() for token in tokens]
        center_hits = Counter(tok for tok in lowered if tok in centers)
        if not center_hits:
            continue
        context_hits = Counter(tok for tok in lowered if tok in contexts)
        for center, center_n in center_hits.items():
            row = coocs[center]  # registers the center even with no contexts
            for context, context_n in context_hits.items():
                # Equivalent to pairing every center occurrence with every
                # context occurrence in the review.
                row[context] += center_n * context_n
    return coocs

In [None]:
coocs = get_coocs(df, cent_vocab, cont_vocab)

In [None]:
coocs

### 3.a4 Convert co-occurrence dictionary to 1000x1000 dataframe

In [None]:
def cooc_dict2df(coocs, centers=None, contexts=None):
    """Convert a nested co-occurrence mapping into a dense count matrix.

    Parameters
    ----------
    coocs : mapping
        ``{center_word: {context_word: count}}``.
    centers : sequence of str, optional
        Row labels. Defaults to the module-level ``cent_vocab``.
    contexts : sequence of str, optional
        Column labels. Defaults to the module-level ``cont_vocab``.

    Returns
    -------
    pandas.DataFrame
        Integer matrix with one row per center word and one column per
        context word; pairs absent from ``coocs`` are 0. Words in ``coocs``
        that are not among the requested labels are ignored.
    """
    if centers is None:
        centers = cent_vocab
    if contexts is None:
        contexts = cont_vocab

    # Empty (or placeholder) entries are dropped here; reindex below restores
    # their rows as zeros.
    populated = {
        center: dict(context_counts)
        for center, context_counts in coocs.items()
        if context_counts
    }
    sparse = pd.DataFrame.from_dict(populated, orient='index')
    # The counts go into the new matrix, never into the reviews frame.
    coocdf = sparse.reindex(index=centers, columns=contexts)
    return coocdf.fillna(0).astype(int)

In [None]:
coocdf = cooc_dict2df(coocs)
coocdf.shape

### 3.a5 Raw co-occurrences to PMI scores


In [None]:
def cooc2pmi(df):
    """Convert a raw co-occurrence count matrix into PMI scores.

    For a center word ``c`` (row) and context word ``w`` (column)::

        PMI(c, w) = log( count(c, w) * total / (count(c) * count(w)) )

    where ``count(c)`` and ``count(w)`` are the row and column sums and
    ``total`` is the sum of all cells. The natural logarithm is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Co-occurrence counts; rows are center words, columns context words.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns holding PMI scores. Pairs
        with a zero count are assigned ``-inf``. The input is not modified.
    """
    counts = df.astype(float).to_numpy()
    total = counts.sum()
    row_totals = counts.sum(axis=1, keepdims=True)
    col_totals = counts.sum(axis=0, keepdims=True)

    # Zero counts produce log(0) or 0/0; silence the warnings and normalise
    # those cells explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(counts * total / (row_totals * col_totals))
    pmi[counts == 0] = -np.inf

    return pd.DataFrame(pmi, index=df.index, columns=df.columns)

In [None]:
pmidf = cooc2pmi(coocdf)

### 3.a6 Retrieve top-k context words, given a center word


In [None]:
def topk(df, center_word, N=10):
    """Return the ``N`` context words with the highest score for a center word.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix with center words as the index and context words as the
        columns.
    center_word : str
        Row label to look up; a ``KeyError`` is raised if it is not in the
        index.
    N : int, default 10
        Number of context words to return.

    Returns
    -------
    list of str
        Context words (column labels) ordered from highest to lowest score.
    """
    scores = df.loc[center_word]
    # Sort the scores but return the labels, i.e. the context words themselves.
    return scores.sort_values(ascending=False).head(N).index.tolist()

In [None]:
topk(pmidf, 'coffee')