In [7]:
import spacy
from pre_processing import load_text_data
spacy_en = spacy.load('en_core_web_sm')


In [8]:
def toxic_mining_mi():
    import numpy as np
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.feature_extraction.text import TfidfVectorizer

    tweets, labels = load_text_data()

    vec = TfidfVectorizer(ngram_range=(2, 2), tokenizer=None, max_features=10000,
                          min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                          smooth_idf=1, sublinear_tf=1, stop_words='english')
    tweets_tf_idf = vec.fit_transform(tweets)

    mi = mutual_info_classif(tweets_tf_idf, labels, n_neighbors=3, random_state=2020)

    features = np.array(vec.get_feature_names())

    top_mi_toxic = np.argsort(mi)[::-1][:200]
    print(features[top_mi_toxic])

In [9]:
def toxic_mining_topic():

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    tweets, _ = load_text_data()

    tf_vector = CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')
    tf = tf_vector.fit_transform(tweets)
    tf_feature_names = tf_vector.get_feature_names()

    lda = LatentDirichletAllocation(n_components=5, max_iter=5, learning_offset=50., random_state=2020).fit(tf)

    def display_topics(model, feature_names, no_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic %d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-no_top_words - 1:-1]]))

    n_top_words = 15
    display_topics(lda, tf_feature_names, n_top_words)

In [10]:
def toxic_mining_targeted(queries):
    import numpy as np
    from utils.help import query_text
    from utils.sentiment_lexicon import load_lexicon
    ds, tweets, tweets_raw = load_text_data()
    lexicon = load_lexicon('neg')

    selected_words = set()
    for tweet in tweets:
        if query_text(tweet, queries):
            found = lexicon & set(tweet.split())
            selected_words |= found
    for q in queries:
        try:
            selected_words.remove(q)
        except KeyError:
            continue
    print(len(selected_words))

In [11]:
def tf_idf_score(queries):
    import numpy as np
    from utils.help import query_text
    from sklearn.feature_extraction.text import TfidfVectorizer

    tweets, _ = load_text_data()

    selected_tweets = []
    for tweet in tweets:
        if query_text(tweet, queries):
            selected_tweets.append(tweet)

    vec = TfidfVectorizer(ngram_range=(1,1), tokenizer=None, max_features=10000,
               min_df=1, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, stop_words='english')

    scores = vec.fit_transform(selected_tweets)
    features = np.array(vec.get_feature_names())
    scores = scores.max(axis=0).toarray()[0]
    # scores = scores.mean(axis=0).getA1()
    # lookup = dict(zip(features, scores))

    top = np.argsort(scores)[::-1][:200]
    return features[top]

In [12]:
toxic_words = toxic_mining_targeted(['immigrant', 'migrant'])
print(toxic_words)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 12863 entries, 0 to 2964
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12863 non-null  int64 
 1   large_text  12863 non-null  object
 2   small_text  12863 non-null  object
dtypes: int64(1), object(2)
memory usage: 402.0+ KB
loaded 2387 neg words from NRC Emotion Word Level Lexicon.
loaded 1522 pos words from NRC Emotion Word Level Lexicon.
loaded 1315 neg words from NRC Affect Intensity Lexicon.
loaded 653 pos words from NRC Affect Intensity Lexicon.
loaded 4152 neg words from MPQA Lexicon.
loaded 2304 pos words from MPQA Lexicon.
loaded 4783 neg words from Liu Lexicon.
loaded 2006 pos words from Liu Lexicon.
540
['perilous' 'horrible' 'illness' 'beware' 'lash' 'volunteer' 'interested'
 'mutilation' 'murderous' 'guilt' 'agonizing' 'chaos' 'death' 'invasion'
 'abolish' 'youth' 'perjury' 'objection' 'lie' 'phony' 'flagrant' 'steal'
 'anxious' 'tension' 'c

In [13]:
# from nltk.corpus import wordnet as wn
# for word in toxic_words:
#     antonyms_set = set()
#     for syn in wn.synsets(word):
#        for l in syn.lemmas():
#            if l.antonyms():
#                antonyms_set.add(l.antonyms()[0].name())
#     if len(antonyms_set) > 0:
#         print(word, antonyms_set)

In [14]:
# tf_idf_score(['immigrant', 'migrant'])
