In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import pickle, gensim, numpy as np

from utilities import get_train_data, get_test_data, Tokenizer, find_subtoken

PICKLE_FOLDER_PATH = '/home/lisa/Darmstadt/Master Arbeit/06_Analyse/Learning_Alg/GermEval-2018-Data/'

TRAIN_FILENAME = '/home/lisa/Darmstadt/Master Arbeit/06_Analyse/germeval2018.training.txt'
TEST_FILENAME  = '/home/lisa/Darmstadt/Master Arbeit/06_Analyse/germeval2018.test.txt'
mdp = '/home/lisa/Darmstadt/Master Arbeit/06_Analyse/mdp_tweets.txt'

#------------------------------
#source:
#http://www.cl.uni-heidelberg.de/english/research/downloads/resource_pages/GermanTwitterEmbeddings/GermanTwitterEmbeddings_data.shtml
MODEL_FILENAME  = "/home/lisa/Darmstadt/Master Arbeit/06_Analyse/twitter-de_d100_w5_min10.bin" # 821,8 MB
MODEL_DIMENSION = 100

In [2]:
%%time
word2vec_model  = gensim.models.KeyedVectors.load_word2vec_format(MODEL_FILENAME, binary=True)

X_train, y_train_t1, y_train_t2 = get_train_data(TRAIN_FILENAME)
X_test                          = get_test_data(TEST_FILENAME)

CPU times: user 9.97 s, sys: 672 ms, total: 10.6 s
Wall time: 10.7 s


In [8]:
#word2vec_model.vocab
#word2vec_model['laufe']
word2vec_model.vector_size

100

In [14]:
X_test_mdp = get_test_data(mdp)
# X_test_mdp[X_test_mdp == ' ']
X_test_mdp = X_test_mdp[X_test_mdp != '']
X_test_mdp = X_test_mdp[X_test_mdp != ' ']

In [16]:
X_test

array(['Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat, weil der gegen die Homo-Ehe ist ☺\tOTHER\tOTHER',
       '@Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen.\tOTHER\tOTHER',
       '#Merkel rollt dem Emir von #Katar, der islamistischen Terror unterstützt, den roten Teppich aus.Wir brauchen einen sofortigen #Waffenstopp!\tOTHER\tOTHER',
       ...,
       '@podilein Mannheim, weltoffen und kunterbunt. Und strunzdumme Multikultiliebhaber. Verblödete halt\tOFFENSE\tABUSE',
       '@stephanweil was ist nun mit kostenlosen Kitas in der GROKO ???\tOTHER\tOTHER',
       '@Bartzissey Denn Minderheiten sind für Linke ja nur dann interessant, wenn sie gegen den Westen instrumentalisiert werden können.\tOTHER\tOTHER'],
      dtype='<U544')

### NGRAM FEATURES
 * Erstelle n-Gramme mit 3-7 Buchstaben (Funktion: char_vect)
 * Erstelle n-Gramme mit 1-3 Wörtern /Funkion: token_vect
 * Anwendung auf Training/ Test und mdp Daten

In [17]:
char_vect  = TfidfVectorizer(analyzer="char", ngram_range=(3, 7), max_df=0.01, min_df=0.0002,
                             preprocessor=Tokenizer(preserve_case=False, join=True).tokenize)

token_vect = TfidfVectorizer(analyzer="word", ngram_range=(1, 3), max_df=0.01, min_df=0.0002,
                             tokenizer=Tokenizer(preserve_case=False, use_stemmer=True).tokenize)

X_CNGR_train = char_vect.fit_transform(X_train)
X_CNGR_test  = char_vect.transform(X_test)

X_TNGR_train = token_vect.fit_transform(X_train)
X_TNGR_test  = token_vect.transform(X_test)

In [7]:
# pickle.dump(X_CNGR_train, open(PICKLE_FOLDER_PATH + "X_CNGR_train.p", "wb" ))
# pickle.dump(X_CNGR_test,  open(PICKLE_FOLDER_PATH + "X_CNGR_test.p", "wb" ))

# pickle.dump(X_TNGR_train, open(PICKLE_FOLDER_PATH + "X_TNGR_train.p", "wb" ))
# pickle.dump(X_TNGR_test,  open(PICKLE_FOLDER_PATH + "X_TNGR_test.p", "wb" ))

In [18]:
X_CNGR_mdp = char_vect.transform(X_test_mdp)
X_TNGR_mdp  = token_vect.transform(X_test_mdp)

In [21]:
print(X_CNGR_mdp.shape)

(9958, 207575)


In [10]:
pickle.dump(X_CNGR_mdp,  open(PICKLE_FOLDER_PATH + "X_CNGR_mdp.p", "wb" ))
pickle.dump(X_TNGR_mdp, open(PICKLE_FOLDER_PATH + "X_TNGR_mdp.p", "wb" ))

### EMB FEATURES
* Tweets werden in Token unterteilt
* Prüfe ob die Token in einem Token im vortrainierten word2vec Model entsprechen
* Wenn nicht, teile Token in Präfix und Suffix und prüfe für diese das word2vec Model (ggf. beide in emb)
* emb enthält pro Tweet Vektoren für Token und wird normalisiert mit der Länge des Tweets + ggf extra Tokens
* X_EMB enthält die normalisierten Vektoren pro Tweet

In [22]:
def get_EMB_feats(tweets):   
    tknzr = Tokenizer(preserve_case=True)
    tweets = [tknzr.tokenize(tweet) for tweet in tweets]
    
    X_EMB = []

    for tweet in tweets:
        emb = np.zeros(MODEL_DIMENSION)
        extra_tokens = 0
        
        for token in tweet:
            try:
                emb += word2vec_model[token]
            except:
                prefix = find_subtoken(token, word2vec_model, mode='initial')
                suffix = find_subtoken(token, word2vec_model, mode='final')
                    
                if prefix != None and suffix != None:
                    emb += word2vec_model[prefix] + word2vec_model[suffix]
                    extra_tokens += 1
                elif prefix != None and suffix == None:
                    emb += word2vec_model[prefix]
                elif prefix == None and suffix != None:
                    emb += word2vec_model[suffix]
                    
        emb /= (len(tweet) + extra_tokens)
        X_EMB.append(emb)
        
    return normalize(X_EMB)

In [23]:
%%time
X_EMB_train = get_EMB_feats(X_train)
X_EMB_test  = get_EMB_feats(X_test)

CPU times: user 2.09 s, sys: 4.19 ms, total: 2.09 s
Wall time: 2.1 s


In [24]:
X_EMB_mdp  = get_EMB_feats(X_test_mdp)

In [25]:
print(X_EMB_test.shape)

(3532, 100)


In [26]:
pickle.dump(X_EMB_train, open(PICKLE_FOLDER_PATH + "X_EMB_train.p", "wb" ))
pickle.dump(X_EMB_test,  open(PICKLE_FOLDER_PATH + "X_EMB_test.p", "wb" ))

In [16]:
pickle.dump(X_EMB_mdp,  open(PICKLE_FOLDER_PATH + "X_EMB_mdp.p", "wb" ))

### TIMP FEATURES
* Finden der wichtigen Tokens - also derer die in Tweets der angegebenen Kategorie verwendet werden
* Für diese wichtigsten Tokens werden die Features analog der EMB Features aus dem word2vec model abgeleitet
* Außerdem werden für alle Tweets analog der EMB feats abgeleitet
* Vergleiche mit der Cosine Similarity und gebe die höchsten und niedrigsten Werte pro Tweet zurück


In [27]:
def k_most_imp_tokenlvl(k, category, max_df=0.01, min_df=0.0002):      
    token_vect = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False,
                                 max_df=max_df, min_df=min_df,
                                 tokenizer=Tokenizer(preserve_case=True).tokenize)
    
    tfidf = token_vect.fit_transform(X_train)
    
    vocab = token_vect.vocabulary_
    inv_vocab = {index: word for word, index in vocab.items()}
    
    if category in ['OTHER', 'OFFENSE']:
        cat_ids = np.where(y_train_t1 == category)
    elif category in ['PROFANITY', 'ABUSE', 'INSULT']:
        cat_ids = np.where(y_train_t2 == category)
        
    most_imp_ids = np.argsort(np.asarray(np.mean(tfidf[cat_ids], axis=0)).flatten())[::-1]
        
    most_imp = []
    for index in most_imp_ids:
        most_imp.append(inv_vocab[index])

    return most_imp[:k]

def get_TIMP_feats(tweets, k, category, max_df=0.01, min_df=0.0002):
    feats_max = []
    feats_min = []
           
    imp_tokens_vectors = []
    for imp_token in k_most_imp_tokenlvl(k, category, max_df=max_df, min_df=min_df):
        try:
            imp_tokens_vectors.append(word2vec_model[imp_token])
        except:
            imp_tokens_vectors.append(np.zeros(MODEL_DIMENSION))
    
    tknzr = Tokenizer(preserve_case=True)
    tweets = [tknzr.tokenize(tweet) for tweet in tweets]
    
    for tweet in tweets:
        tweet_vectors = []
        for token in tweet:
            try:
                tweet_vectors.append(word2vec_model[token])
            except:
                prefix = find_subtoken(token, word2vec_model, mode='initial')
                suffix = find_subtoken(token, word2vec_model, mode='final')
                 
                if prefix != None and suffix != None:
                    tweet_vectors.append(word2vec_model[prefix])
                    tweet_vectors.append(word2vec_model[suffix])
                elif prefix != None and suffix == None:
                    tweet_vectors.append(word2vec_model[prefix])
                elif prefix == None and suffix != None:
                    tweet_vectors.append(word2vec_model[suffix])
                else:
                    tweet_vectors.append(np.zeros(MODEL_DIMENSION))
                    
        similarity = cosine_similarity(np.asarray(tweet_vectors), np.asarray(imp_tokens_vectors))
        
        feats_max.append(np.amax(similarity, axis=0))
        feats_min.append(np.amin(similarity, axis=0))
        
    return np.concatenate((feats_max, feats_min), axis=1)

In [28]:
%%time
N_TIMP_TASK1 = 1250
N_TIMP_TASK2 = 170

X_TIMP_task1_train = \
np.concatenate((get_TIMP_feats(X_train, N_TIMP_TASK1, 'OTHER'),
                get_TIMP_feats(X_train, N_TIMP_TASK1, 'OFFENSE')), axis=1)

X_TIMP_task1_test = \
np.concatenate((get_TIMP_feats(X_test,  N_TIMP_TASK1, 'OTHER'),
                get_TIMP_feats(X_test,  N_TIMP_TASK1, 'OFFENSE')), axis=1)

X_TIMP_task2_train = \
np.concatenate((get_TIMP_feats(X_train, N_TIMP_TASK2, 'OTHER'),
                get_TIMP_feats(X_train, N_TIMP_TASK2, 'ABUSE'),
                get_TIMP_feats(X_train, N_TIMP_TASK2, 'INSULT'),
                get_TIMP_feats(X_train, N_TIMP_TASK2, 'PROFANITY')), axis=1)

X_TIMP_task2_test = \
np.concatenate((get_TIMP_feats(X_test,  N_TIMP_TASK2, 'OTHER'),
                get_TIMP_feats(X_test,  N_TIMP_TASK2, 'ABUSE'),
                get_TIMP_feats(X_test,  N_TIMP_TASK2, 'INSULT'),
                get_TIMP_feats(X_test,  N_TIMP_TASK2, 'PROFANITY')), axis=1)

CPU times: user 2min 13s, sys: 1.98 s, total: 2min 15s
Wall time: 1min 20s


In [30]:
N_TIMP_TASK1 = 1250
N_TIMP_TASK2 = 170

X_TIMP_task1_mdp = \
np.concatenate((get_TIMP_feats(X_test_mdp,  N_TIMP_TASK1, 'OTHER'),
                get_TIMP_feats(X_test_mdp,  N_TIMP_TASK1, 'OFFENSE')), axis=1)

X_TIMP_task2_mdp = \
np.concatenate((get_TIMP_feats(X_test_mdp,  N_TIMP_TASK2, 'OTHER'),
                get_TIMP_feats(X_test_mdp,  N_TIMP_TASK2, 'ABUSE'),
                get_TIMP_feats(X_test_mdp,  N_TIMP_TASK2, 'INSULT'),
                get_TIMP_feats(X_test_mdp,  N_TIMP_TASK2, 'PROFANITY')), axis=1)

In [33]:
print(X_TIMP_task2_mdp.shape)

(9958, 1360)


In [21]:
pickle.dump(X_TIMP_task1_train, open(PICKLE_FOLDER_PATH + "X_TIMP_task1_train.p", "wb" ))
pickle.dump(X_TIMP_task1_test,  open(PICKLE_FOLDER_PATH + "X_TIMP_task1_test.p", "wb" ))
pickle.dump(X_TIMP_task2_train, open(PICKLE_FOLDER_PATH + "X_TIMP_task2_train.p", "wb" ))
pickle.dump(X_TIMP_task2_test,  open(PICKLE_FOLDER_PATH + "X_TIMP_task2_test.p", "wb" ))

In [22]:
pickle.dump(X_TIMP_task1_mdp,  open(PICKLE_FOLDER_PATH + "X_TIMP_task1_mdp.p", "wb" ))
pickle.dump(X_TIMP_task2_mdp,  open(PICKLE_FOLDER_PATH + "X_TIMP_task2_mdp.p", "wb" ))

### CIMP FEATURES

In [34]:
def k_most_imp_charlvl(k, category, max_df=0.01, min_df=0.0002):    
    char_vect  = TfidfVectorizer(analyzer="char", ngram_range=(3, 7), lowercase=False,
                                 max_df=max_df, min_df=min_df,
                                 preprocessor=Tokenizer(preserve_case=True, join=True).tokenize)

    tfidf = char_vect.fit_transform(X_train)
    
    vocab = char_vect.vocabulary_
    inv_vocab = {index: word for word, index in vocab.items()}
    
    if category in ['OTHER', 'OFFENSE']:
        cat_ids = np.where(y_train_t1 == category)
    elif category in ['PROFANITY', 'ABUSE', 'INSULT']:
        cat_ids = np.where(y_train_t2 == category)       
        
    most_imp_ids = np.argsort(np.asarray(np.mean(tfidf[cat_ids], axis=0)).flatten())[::-1]
        
    most_imp = []
    for index in most_imp_ids:
        most_imp.append(inv_vocab[index])

    return most_imp[:k]

def get_CIMP_feats(tweets, k, category, max_df=0.01, min_df=0.0002):
    feats = np.zeros((len(tweets), k))
    for imp_ngram_index, imp_ngram in enumerate(k_most_imp_charlvl(k, category, max_df=max_df, min_df=min_df)):
        for tweet_index, tweet in enumerate(tweets):
            if tweet.find(imp_ngram) != -1:
                feats[tweet_index][imp_ngram_index] = 1
    return feats

In [35]:
%%time
N_CIMP_TASK1 = 3200
N_CIMP_TASK2 = 370
            
X_CIMP_task1_train = \
np.concatenate((get_CIMP_feats(X_train, N_CIMP_TASK1, 'OTHER'),
                get_CIMP_feats(X_train, N_CIMP_TASK1, 'OFFENSE')), axis=1)

X_CIMP_task1_test = \
np.concatenate((get_CIMP_feats(X_test,  N_CIMP_TASK1, 'OTHER'),
                get_CIMP_feats(X_test,  N_CIMP_TASK1, 'OFFENSE')), axis=1)

X_CIMP_task2_train = \
np.concatenate((get_CIMP_feats(X_train, N_CIMP_TASK2, 'OTHER'),
                get_CIMP_feats(X_train, N_CIMP_TASK2, 'ABUSE'),
                get_CIMP_feats(X_train, N_CIMP_TASK2, 'INSULT'),
                get_CIMP_feats(X_train, N_CIMP_TASK2, 'PROFANITY')), axis=1)

X_CIMP_task2_test = \
np.concatenate((get_CIMP_feats(X_test,  N_CIMP_TASK2, 'OTHER'),
                get_CIMP_feats(X_test,  N_CIMP_TASK2, 'ABUSE'),
                get_CIMP_feats(X_test,  N_CIMP_TASK2, 'INSULT'),
                get_CIMP_feats(X_test,  N_CIMP_TASK2, 'PROFANITY')), axis=1)

CPU times: user 4min 18s, sys: 1.35 s, total: 4min 20s
Wall time: 4min 20s


In [None]:
N_CIMP_TASK1 = 3200
N_CIMP_TASK2 = 370

X_CIMP_task1_mdp = \
np.concatenate((get_CIMP_feats(X_test_mdp,  N_CIMP_TASK1, 'OTHER'),
                get_CIMP_feats(X_test_mdp,  N_CIMP_TASK1, 'OFFENSE')), axis=1)

X_CIMP_task2_mdp = \
np.concatenate((get_CIMP_feats(X_test_mdp,  N_CIMP_TASK2, 'OTHER'),
                get_CIMP_feats(X_test_mdp,  N_CIMP_TASK2, 'ABUSE'),
                get_CIMP_feats(X_test_mdp,  N_CIMP_TASK2, 'INSULT'),
                get_CIMP_feats(X_test_mdp,  N_CIMP_TASK2, 'PROFANITY')), axis=1)

In [31]:
print(X_CIMP_task1_mdp.shape)

NameError: name 'X_CIMP_task1_mdp' is not defined

In [36]:
pickle.dump(X_CIMP_task1_train, open(PICKLE_FOLDER_PATH + "X_CIMP_task1_train.p", "wb" ))
pickle.dump(X_CIMP_task1_test,  open(PICKLE_FOLDER_PATH + "X_CIMP_task1_test.p", "wb" ))
pickle.dump(X_CIMP_task2_train, open(PICKLE_FOLDER_PATH + "X_CIMP_task2_train.p", "wb" ))
pickle.dump(X_CIMP_task2_test,  open(PICKLE_FOLDER_PATH + "X_CIMP_task2_test.p", "wb" ))

In [37]:
pickle.dump(X_CIMP_task1_mdp,  open(PICKLE_FOLDER_PATH + "X_CIMP_task1_mdp.p", "wb" ))
pickle.dump(X_CIMP_task2_mdp,  open(PICKLE_FOLDER_PATH + "X_CIMP_task2_mdp.p", "wb" ))