In [3]:
import numpy as np
import pandas as pd
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Needed by the pre-filter cells, which run before the later cells that import/define it.
from prefilter_results_raw import make_confusion_matrix

In [None]:
# Input files, given relative to the notebook's working directory.
path_to_old_file = "../data/scrpaing_result_raw.csv"
path_to_file = "../data/labeled_data_1000rows.csv"

# Presumably the special characters to keep when cleaning text; not referenced by the cells shown here.
allowed_specials = ['&', ' ', '`']

# spaCy pipeline used for tokenisation, lemmas and named entities in the cells below.
nlp = spacy.load('en_core_web_sm')


In [None]:
# Load the labelled rows. `header=1` combined with `names=` replaces the file's own header row
# with the two column names used throughout this notebook.
# The cast to the pandas string dtype is chained so that it is actually kept: `astype` returns a
# new frame, so calling it as a bare statement had no effect.
df = (
    pd.read_csv(path_to_file, encoding='utf8', sep='|', nrows=628, header=1, names=['text', 'entities'])
    .astype('string')
    .fillna('')
)
# `replace` without regex matches whole cell values: only cells equal to 'restaurants' are blanked.
df['entities'] = df['entities'].replace('restaurants', '')
df.head()


### Reformat file
Split rows with multiple sentences into separate rows (explode data). It might yield better results than long paragraphs.

In [None]:
# Explode each row into one row per sentence: for every row a Series is built whose index holds
# the pieces of row['text'] split on '. ' and whose values are row['category']; the Series are
# concatenated, the index is turned back into a column and the result is written as a
# '|'-separated CSV.
# NOTE(review): the frame loaded above is given only the columns 'text' and 'entities', so
# row['category'] looks like it would raise KeyError here - confirm which frame this cell targets.
# NOTE(review): the output path is the same string as path_to_file, so running this cell would
# overwrite the labelled input file - confirm that is intended before executing.
pd.concat([pd.Series(row['category'], row['text'].split('. ')) for _, row in df.iterrows()])\
    .reset_index().to_csv('../data/labeled_data_1000rows.csv', sep='|', index=False)

Read reformatted scraped dataset.

In [None]:
# Parse every text once and keep the resulting Doc objects; the stemming, lemmatisation and NER
# cells below all read from this column.
# nlp.pipe processes the texts in batches and yields the Docs in input order, which avoids the
# per-call overhead of invoking the pipeline separately for each row.
df['nlp'] = list(nlp.pipe(df['text']))
df.head()

### NLTK Porter Stemmer

In [None]:
# One list of Porter stems per row, computed from the text of each spaCy token.
porter_stemmer = PorterStemmer()
df['porter_stemms'] = [
    [porter_stemmer.stem(token.text) for token in doc]
    for doc in df['nlp']
]


### NLTK Snowball Stemmer

In [None]:
# One list of Snowball (English) stems per row, computed from the text of each spaCy token.
snowball_stemmer = SnowballStemmer(language='english')
df['snowball_stemms'] = [
    [snowball_stemmer.stem(token.text) for token in doc]
    for doc in df['nlp']
]

### SpaCy Lemmatizer

In [None]:
# One list of spaCy lemmas per row, taken from the already parsed Doc objects.
df['spacy_lemmas'] = [[token.lemma_ for token in doc] for doc in df['nlp']]

In [None]:
# Preview the first ten rows with the stem and lemma columns added above.
df.head(10)

# Topic Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Topic-model settings: number of LDA topics, then the document-frequency bounds that are
# handed to CountVectorizer as max_df / min_df.
topic_count, max_df, min_df = 4, 0.9, 2

In [None]:
# Bag-of-words document-term matrix over the raw text, with English stop words removed and the
# document-frequency bounds taken from the configuration cell.
cv = CountVectorizer(
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
dtm = cv.fit_transform(df['text'])
dtm

In [None]:
# Fit LDA on the document-term matrix; the seed is fixed so the topics are reproducible.
# The last expression displays the document-topic matrix returned by the fit.
lda = LatentDirichletAllocation(
    random_state=2137,
    n_components=topic_count,
)
lda.fit_transform(dtm)

Get the 10 most important words for each topic.

In [None]:
# Resolve the vocabulary once instead of on every inner iteration.
# get_feature_names() was deprecated and later removed from scikit-learn; prefer
# get_feature_names_out() and fall back to the old name only on versions that lack it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

for index, topic in enumerate(lda.components_):
    print(f"Topic {index}")
    # argsort is ascending, so the last ten positions are the ten highest-weighted words.
    for i in topic.argsort()[-10:]:
        print(feature_names[i])

Get the probability of each line belonging to each topic.

In [None]:
# Document-topic matrix for every row; show the first row rounded to two decimals.
topic_result = lda.transform(dtm)
topic_result[0].round(decimals=2)

In [None]:
# Label every row with the index of its highest-scoring topic.
df['topic'] = topic_result.argmax(1)
df.head(20)

In [None]:
# For each topic print "<rows with an entity>/<rows in the topic>".
entity_mask = df['entities'] != ''
for topic_index in range(topic_count):
    topic_mask = df['topic'] == topic_index
    topic_row_count = df.loc[topic_mask, 'text'].count()
    entity_row_count = df.loc[entity_mask & topic_mask, 'entities'].count()
    print(f"Topic {topic_index}: {entity_row_count}/{topic_row_count}")


None of the topics found covers a significant majority of the rows that contain an entity.

In [None]:
# Sweep the vectorizer thresholds for a two-topic model and print, per configuration and topic,
# "<rows with an entity>/<rows in the topic>".
# All names used by the sweep carry a `sweep_` prefix so that running this cell no longer
# overwrites the notebook-level topic_count / max_df / min_df, cv, dtm, lda or df['topic'].
sweep_entity_mask = df['entities'] != ''
for sweep_topic_count in [2]:
    for sweep_max_df in [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]:
        for sweep_min_df in [1, 2, 3, 4]:
            print(f"topic_count:{sweep_topic_count} max_df:{sweep_max_df} min_df:{sweep_min_df}")
            sweep_cv = CountVectorizer(max_df=sweep_max_df, min_df=sweep_min_df, stop_words='english')
            sweep_dtm = sweep_cv.fit_transform(df['text'])
            sweep_lda = LatentDirichletAllocation(n_components=sweep_topic_count, random_state=2137)
            # fit_transform already returns the document-topic matrix, so no second transform is needed.
            sweep_topics = pd.Series(sweep_lda.fit_transform(sweep_dtm).argmax(axis=1), index=df.index)
            for topic_index in range(sweep_topic_count):
                sweep_topic_mask = sweep_topics == topic_index
                topic_row_count = df.loc[sweep_topic_mask, 'text'].count()
                entity_row_count = df.loc[sweep_entity_mask & sweep_topic_mask, 'entities'].count()
                print(f"Topic {topic_index}: {entity_row_count}/{topic_row_count}")


Use a whitelist and a blacklist to select relevant sentences.

In [None]:
# Substrings whose presence marks a sentence as potentially relevant. Leading/trailing spaces
# are part of the patterns (e.g. ' at ' and 'bar '), so they must be kept exactly as written.
whitelist = [
    ' at ', 'in the', 'crew', 'restaurant', 'residency',
    'resident', 'concept', 'launch', 'open', 'chef',
    'acclaimed', 'partnership', 'place', 'priced', 'pricey',
    'cafe', 'bar ', 'spot', 'location', 'instagram',
    'affordable', 'food heaven', 'list', 'highlight', 'visit',
    'by the', 'I love', 'At ', 'discover',
]


Use lists on raw text.

In [None]:
# Pre-filter on the raw text: a row is predicted positive when any whitelist entry occurs in it
# as a substring; the ground truth is whether the row has a non-empty 'entities' value.
TP, TN, FP, FN = 0, 0, 0, 0
for index, row in df.iterrows():
    positive = any(word in row['text'] for word in whitelist)
    has_entity = row['entities'] != ''
    if positive and has_entity:
        TP += 1
    elif positive:
        FP += 1
    elif has_entity:
        FN += 1
        # Show every missed row so the whitelist can be extended.
        print(row['text'])
    else:
        TN += 1
print(TP, TN, FP, FN)
# Kept in its original layout (rows T/F, columns P/N) because the quick heat-map cell further
# down labels `array` that way.
array = [[TP, TN],
        [FP, FN]]
# make_confusion_matrix reads cf[1, 1] as the true positives and the diagonal as the correct
# predictions, i.e. it needs [[TN, FP], [FN, TP]]. Passing `array` as-is produced wrong box
# labels and wrong accuracy / precision / recall.
# The label lists are spelled out here so the cell does not depend on a later cell.
plot = make_confusion_matrix(np.array([[TN, FP], [FN, TP]]),
                      group_names=['True Neg', 'False Pos', 'False Neg', 'True Pos'],
                      categories=['Zero', 'One'],
                      filename='../plots/raw_text_matrix_prefilter.png')

Use lists on Porter stems.

In [None]:
# Pre-filter on the Porter stems: a row is predicted positive when any stemmed whitelist entry
# equals one of the row's stems; the ground truth is a non-empty 'entities' value.
# NOTE(review): membership is tested against a list of per-token stems, so multi-word entries
# such as 'in the' can only match if a single token equals the whole phrase - confirm intended.
whitelist_porter_stemms = [porter_stemmer.stem(word).strip() for word in whitelist]
print(whitelist_porter_stemms)
TP, TN, FP, FN = 0, 0, 0, 0
for index, row in df.iterrows():
    positive = any(word in row['porter_stemms'] for word in whitelist_porter_stemms)
    has_entity = row['entities'] != ''
    if positive and has_entity:
        TP += 1
    elif positive:
        FP += 1
    elif has_entity:
        FN += 1
        # Show every missed row so the whitelist can be extended.
        print(row['porter_stemms'])
    else:
        TN += 1
print(TP, TN, FP, FN)
# Kept in its original layout (rows T/F, columns P/N) because the quick heat-map cell further
# down labels `array` that way.
array = [[TP, TN],
        [FP, FN]]
# make_confusion_matrix reads cf[1, 1] as the true positives and the diagonal as the correct
# predictions, i.e. it needs [[TN, FP], [FN, TP]]. Passing `array` as-is produced wrong box
# labels and wrong accuracy / precision / recall.
# The label lists are spelled out here so the cell does not depend on a later cell.
plot = make_confusion_matrix(np.array([[TN, FP], [FN, TP]]),
                      group_names=['True Neg', 'False Pos', 'False Neg', 'True Pos'],
                      categories=['Zero', 'One'],
                      filename='../plots/porter_matrix_prefilter.png')

Use lists on Snowball stems.

In [None]:
# Pre-filter on the Snowball stems: a row is predicted positive when any stemmed whitelist entry
# equals one of the row's stems; the ground truth is a non-empty 'entities' value.
# The whitelist is stemmed with the Snowball stemmer so that it is comparable with the
# 'snowball_stemms' column (it was previously stemmed with the Porter stemmer by mistake).
# NOTE(review): membership is tested against a list of per-token stems, so multi-word entries
# such as 'in the' can only match if a single token equals the whole phrase - confirm intended.
whitelist_snowball_stemms = [snowball_stemmer.stem(word).strip() for word in whitelist]
print(whitelist_snowball_stemms)
TP, TN, FP, FN = 0, 0, 0, 0
for index, row in df.iterrows():
    positive = any(word in row['snowball_stemms'] for word in whitelist_snowball_stemms)
    has_entity = row['entities'] != ''
    if positive and has_entity:
        TP += 1
    elif positive:
        FP += 1
    elif has_entity:
        FN += 1
        # Show every missed row so the whitelist can be extended.
        print(row['snowball_stemms'])
    else:
        TN += 1
print(TP, TN, FP, FN)
# Kept in its original layout (rows T/F, columns P/N) because the quick heat-map cell further
# down labels `array` that way.
array = [[TP, TN],
        [FP, FN]]
# make_confusion_matrix reads cf[1, 1] as the true positives and the diagonal as the correct
# predictions, i.e. it needs [[TN, FP], [FN, TP]]. Passing `array` as-is produced wrong box
# labels and wrong accuracy / precision / recall.
# The label lists are spelled out here so the cell does not depend on a later cell.
plot = make_confusion_matrix(np.array([[TN, FP], [FN, TP]]),
                      group_names=['True Neg', 'False Pos', 'False Neg', 'True Pos'],
                      categories=['Zero', 'One'],
                      filename='../plots/snowball_matrix_prefilter.png')


Use regexps and other structural characteristics.

In [None]:
def is_relevant(word_list):
    """Return True when a word after the first one is capitalised.

    The first element of ``word_list`` is skipped, and only words longer than two characters
    are considered. Returns False when no such word starts with an upper-case character.
    """
    for candidate in word_list[1:]:
        if len(candidate) > 2 and candidate[0].isupper():
            return True
    return False

# Pre-filter on structural rules: a row is predicted positive when is_relevant() accepts its
# lemma list; the ground truth is a non-empty 'entities' value.
TP, TN, FP, FN = 0, 0, 0, 0
for index, row in df.iterrows():
    # Evaluate the rule once per row instead of once per branch.
    relevant = is_relevant(row['spacy_lemmas'])
    has_entity = row['entities'] != ''
    if relevant and has_entity:
        TP += 1
    elif relevant:
        FP += 1
    elif has_entity:
        FN += 1
    else:
        TN += 1
print(TP, TN, FP, FN)
# Kept in its original layout (rows T/F, columns P/N) because the quick heat-map cell further
# down labels `array` that way.
array = [[TP, TN],
        [FP, FN]]
# make_confusion_matrix reads cf[1, 1] as the true positives and the diagonal as the correct
# predictions, i.e. it needs [[TN, FP], [FN, TP]]. Passing `array` as-is produced wrong box
# labels and wrong accuracy / precision / recall.
# The label lists are spelled out here so the cell does not depend on a later cell.
plot = make_confusion_matrix(np.array([[TN, FP], [FN, TP]]),
                      group_names=['True Neg', 'False Pos', 'False Neg', 'True Pos'],
                      categories=['Zero', 'One'],
                      filename='../plots/rules_matrix_prefilter.png')

Use pretrained NER to prefilter.

In [None]:
def is_relevant(row):
    """Return True when the parsed text in ``row['nlp']`` has an entity with a wanted label.

    The labels of all entities in ``row['nlp'].ents`` are collected and compared with a fixed
    set of labels; the result is True as soon as the two sets share at least one label.
    """
    wanted_labels = ('DATE', 'ORG', 'PERSON', 'NORP', 'GPE', 'FAC', 'PRODUCT')
    found_labels = set()
    for ent in row['nlp'].ents:
        found_labels.add(ent.label_)
    return not found_labels.isdisjoint(wanted_labels)

# Pre-filter with the pretrained NER: a row is predicted positive when is_relevant() accepts it;
# the ground truth is a non-empty 'entities' value.
TP, TN, FP, FN = 0, 0, 0, 0
for index, row in df.iterrows():
    # Evaluate the rule once per row instead of once per branch.
    relevant = is_relevant(row)
    has_entity = row['entities'] != ''
    if relevant and has_entity:
        TP += 1
    elif relevant:
        FP += 1
    elif has_entity:
        # Show what the NER found on every missed row.
        print(row['nlp'].ents, [ent.label_ for ent in row['nlp'].ents], row['entities'], row['nlp'].text)
        FN += 1
    else:
        TN += 1
print(TP, TN, FP, FN)
# Kept in its original layout (rows T/F, columns P/N) because the quick heat-map cell further
# down labels `array` that way.
array = [[TP, TN],
        [FP, FN]]
# make_confusion_matrix reads cf[1, 1] as the true positives and the diagonal as the correct
# predictions, i.e. it needs [[TN, FP], [FN, TP]]. Passing `array` as-is produced wrong box
# labels and wrong accuracy / precision / recall.
# The label lists are spelled out here so the cell does not depend on a later cell.
plot = make_confusion_matrix(np.array([[TN, FP], [FN, TP]]),
                      group_names=['True Neg', 'False Pos', 'False Neg', 'True Pos'],
                      categories=['Zero', 'One'],
                      filename='../plots/ner_matrix_prefilter.png')

Draw confusion matrices.

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Quick heat map of `array` as left by the most recently executed pre-filter cell; its layout
# [[TP, TN], [FP, FN]] corresponds to rows T/F and columns P/N.
df_cm = pd.DataFrame(array, index=list("TF"), columns=list("PN"))
plt.figure(figsize=(10, 7))
# fmt="d" prints the integer counts in full; the default format switches to scientific
# notation for counts of 100 and above.
sn.heatmap(df_cm, annot=True, fmt="d")


In [None]:
# Counts of the last pre-filter run, arranged with true negatives top-left and true positives
# bottom-right.
array = [
    [TN, FP],
    [FN, TP],
]
from prefilter_results_raw import make_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Axis categories and per-square names (row by row) for the confusion-matrix plot.
categories = ['Zero', 'One']
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
plot = make_confusion_matrix(
    np.array(array),
    filename='../plots/test.png',
    categories=categories,
    group_names=labels,
)
























In [None]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None,
                          filename=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (anything np.asarray turns into a 2-D array).
                   For the binary summary statistics the layout must be [[TN, FP], [FN, TP]],
                   i.e. rows are the true label and columns the predicted label.
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of squares.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each square as a percentage of all observations. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    title:         Title for the heatmap. Default is None.
    filename:      If given, the figure is saved to this path.

    Returns
    -------
    Whatever sns.heatmap returns (the Axes the heatmap was drawn on).

    Notes
    -----
    A ratio whose denominator is zero (empty matrix, no predicted positives, no actual
    positives) is reported as 0 instead of producing NaN and a RuntimeWarning.
    '''

    def _safe_ratio(numerator, denominator):
        # Ratio that falls back to 0.0 when the denominator is zero.
        return float(numerator) / float(denominator) if denominator else 0.0

    cf = np.asarray(cf)
    total = np.sum(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_safe_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _safe_ratio(np.trace(cf), total)

        # Precision / recall / F1 are only meaningful for a 2x2 (binary) matrix
        if cf.shape == (2, 2):
            precision = _safe_ratio(cf[1, 1], cf[:, 1].sum())
            recall = _safe_ratio(cf[1, 1], cf[1, :].sum())
            f1_score = _safe_ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    ax = sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                     xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

    if filename:
        # The statistics are appended to the x label as extra lines; a tight bounding box keeps
        # them inside the saved image.
        plt.savefig(filename, bbox_inches='tight')

    return ax