# Context Anchoring Model


This is a staging notebook for exploring the use of context and word anchoring anchoring approach.

See issue [#91](https://github.com/JherezTaylor/thesis-preprocessing/issues/91) and [#93](https://github.com/JherezTaylor/thesis-preprocessing/issues/93)

In [1]:
from modules.utils import file_ops
import pprint
from modules.utils import settings
from modules.db import elasticsearch
import pandas as pd
from elasticsearch import Elasticsearch
import spacy
from nltk.corpus import wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from tqdm import tqdm
import plotly as py
import plotly.graph_objs as go
from modules.utils.CustomTwokenizer import CustomTwokenizer

## Sentence Anchoring

### Example tweets

In [5]:
tweet_samples = ['user_mention I hope you get raped in jail', 'My faggot ass manager showed his colors again today son', '#WhiteGenocide wake the fuck up people these fucking barbaric niggers , radical goat fuckers , and wetbacks are taking over your country']
for idx,twt in enumerate(tweet_samples):
    print("Tweet " + str(idx)+ ": " + twt)

Tweet 0: user_mention I hope you get raped in jail
Tweet 1: My faggot ass manager showed his colors again today son
Tweet 2: #WhiteGenocide wake the fuck up people these fucking barbaric niggers , radical goat fuckers , and wetbacks are taking over your country


### Init Elasticsearch and VADER Sentiment Analyzer

In [6]:
es_index = 'candidates_crowdflower_analysis'
es = Elasticsearch([
    {'host': '140.114.79.146', 'port': 9200},
])
if not es.ping():
    raise ValueError("Connection failed")

sent_analyzer = SIA()

### Setup Elasticsearch queries

In [18]:
queries = []
for tweet in tweet_samples:
    query = {
        "stored_fields" : ["text"],
        "query": {
            "more_like_this" : {
                "fields" : ["text"],
                "like" : tweet,
                "min_term_freq" : 1,
                "max_query_terms" : 12
            }
        }
    }
    queries.append(query)

### Get similar queries

In [34]:
similar_tweets = []
for query in queries:
    res = es.search(index=es_index, body=query)
    similar_tweets.append([hit['fields']['text'][0] for hit in res['hits']['hits']])

In [35]:
similar_tweets[0]

['user_mention I hope you get raped in jail ',
 'user_mention I hope you get raped in jail ',
 "user_mention user_mention I hope Zoey doesn't get raped or killed !! #DontHurtJoey !! #LoveHer #Bayhem #Joey ",
 'user_mention lol your child can still get raped . _\x89ã¢\x89Û__ hope u n ya mammie get raped too_\x89ã¢\x89Û_Ì_ fym ? You finna see .. ',
 'user_mention user_mention get stage 7.666 cancer you nigger I hope you die in a head on collision in a car ',
 'Yes I realize women get raped and all but there are more male homicide victims than women . Men usually serve longer time in jail ',
 'user_mention Hahahahahahaha I hope seshes and that get put it in this you fucking queer ',
 'user_mention user_mention user_mention imagine me saying " proud & unafraid undocumented " in Mexico , i prob would get raped b4 i get dported ',
 'user_mention Sometimes I will add an emoji . I hope you and your mother get raped together , having to hear the other cry for help lol :) ',
 'user_mention but y

### Calculate sentiment scores

In [12]:
sentiment_scores = []
for tweets in similar_tweets:
    sentiment_scores.append([sent_analyzer.polarity_scores(twt) for twt in tweets])
pprint.pprint(sentiment_scores[0])

[{'compound': -0.4019, 'neg': 0.368, 'neu': 0.4, 'pos': 0.232},
 {'compound': -0.4019, 'neg': 0.368, 'neu': 0.4, 'pos': 0.232},
 {'compound': 0.4993, 'neg': 0.186, 'neu': 0.495, 'pos': 0.319},
 {'compound': -0.6705, 'neg': 0.298, 'neu': 0.518, 'pos': 0.184},
 {'compound': -0.9217, 'neg': 0.503, 'neu': 0.4, 'pos': 0.097},
 {'compound': -0.648, 'neg': 0.22, 'neu': 0.714, 'pos': 0.066},
 {'compound': 0.4404, 'neg': 0.0, 'neu': 0.818, 'pos': 0.182},
 {'compound': -0.3612, 'neg': 0.194, 'neu': 0.675, 'pos': 0.131},
 {'compound': 0.4019, 'neg': 0.208, 'neu': 0.485, 'pos': 0.307},
 {'compound': 0.7722, 'neg': 0.243, 'neu': 0.332, 'pos': 0.425}]


### Bin tweets by compound score [Postive, Negative, Neutral]

In [13]:
for idx, tweet_set in enumerate(sentiment_scores):
    for idx_y, tweet in enumerate(tweet_set):
        if tweet['compound'] >= 0.5:
            label = 'POS'
        elif tweet['compound'] > -0.5 and tweet['compound'] < 0.5:
            label = 'NEU'
        else:
            label = 'NEG'
        sentiment_scores[idx][idx_y]['label'] = label

sentiment_scores[0][1]

{'compound': -0.4019, 'label': 'NEU', 'neg': 0.368, 'neu': 0.4, 'pos': 0.232}

### Calculate features

In [14]:
positive_summation = []
for tweet_set in sentiment_scores:
    positive_summation.append(sum(twt['pos']) for twt in tweet_set)

positive_summation

[<generator object <genexpr> at 0x7f9df855edb0>,
 <generator object <genexpr> at 0x7f9df855ed58>,
 <generator object <genexpr> at 0x7f9df8532360>]