# import necessary packages

In [1]:
import numpy as np
import pandas as pd
import json

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Mount drive

In [2]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount at /content/drive; the data, tokenizer and model paths used later all live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# Load data

In [5]:
# Base directory on the mounted Drive; the data, model and result paths are built from it.
base = '/content/drive/My Drive/Hatefulle Ytringer Models/'

# Read the comments CSV into a DataFrame and preview its first rows.
unlabel = pd.read_csv(
    base + 'models/Models 16_10_2021/comments_SIAN_post_id.csv',
    encoding="UTF-8",
)
unlabel.head()

Unnamed: 0,post_id,comment_id,author_id,time,text,answers
0,1917260805119180,1917262545119006,b197158e1e9e67272ba8a797616514cc,on Wed,Det er helt korrekt dette... disse 2 hadde et ...,
1,1917260805119180,1917263215118939,b197158e1e9e67272ba8a797616514cc,on Wed,Men dette er det mange som enten ikke veit ell...,
2,1917260805119180,1917345138444080,77c2ce38e63a629436ac616aec515ad1,on Wed,Islamsk nazist\n\nRopet til nazistisk islam (A...,
3,1917260805119180,1917438355101425,274b68192b056e268f128ff63bfcd4a4,on Wed,Rebecca Romano,
4,1917260805119180,1917473961764531,274b68192b056e268f128ff63bfcd4a4,on Wed,Rebecca Romano,


In [6]:
# Keep only the three id columns and the comment text (every other column is dropped),
# then discard any row that has a missing value in one of the kept columns.
unlabel = unlabel[['post_id', 'comment_id', 'author_id', 'text']].dropna()
unlabel

Unnamed: 0,post_id,comment_id,author_id,text
0,1917260805119180,1917262545119006,b197158e1e9e67272ba8a797616514cc,Det er helt korrekt dette... disse 2 hadde et ...
1,1917260805119180,1917263215118939,b197158e1e9e67272ba8a797616514cc,Men dette er det mange som enten ikke veit ell...
2,1917260805119180,1917345138444080,77c2ce38e63a629436ac616aec515ad1,Islamsk nazist\n\nRopet til nazistisk islam (A...
3,1917260805119180,1917438355101425,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano
4,1917260805119180,1917473961764531,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano
...,...,...,...,...
25581,638243483020925,638315766347030,09aee10bce10c937d9e33452486f09c8,Dette bilde har ingenting m innlegget å gjøre....
25582,638243483020925,638838086294798,afd2a1de527d52d1c5d31fea96b65fbc,dette er en helt annen sak tror eg.
25583,637654413079832,637701513075122,77b1fe423e999e92a12e4cd5024241ef,Det går nok ikke lenge Før tullingene får gjor...
25584,637501219761818,637667176411889,4a5af9abb22604ac3f0d0452a8cb5c43,Mange idioter.


In [8]:
def process_tweet(df):
    '''
    Clean the strings in df['text'] and store the result in df['tweet_proc'].

    Cleaning steps, applied in this order:
      1. remove URLs and any leftover 'http' fragments
      2. remove retweet markers ('RT @user') and @-mentions
      3. decode the HTML entities '&amp;' (to the word 'og'), '&lt;' and '&gt;'
      4. collapse runs of two or more spaces into one
      5. insert a space between word characters and adjacent punctuation
      6. lower-case and strip whitespace at both ends

    Input:
        df: a dataframe containing a column 'text' of strings
    Output:
        the same dataframe object (modified in place) with a column
        'tweet_proc' holding the cleaned text; 'text' itself is untouched
    '''
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = [
        # remove URL
        (r'http(\S)+', r''),
        # NOTE(review): the dots are regex wildcards, so this removes 'http '
        # plus ANY three characters, not only a literal ' ...'. Left unchanged
        # in case the model was trained on text cleaned this way -- confirm.
        (r'http ...', r''),
        (r'http', r''),
        # remove RT, @
        (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),
        (r'@[\S]+', r''),
        # replace &, < and > entities
        (r'&amp;?', r'og'),
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
        # remove extra space. The quantifier must be written '{2,}' without a
        # space: Python's re treats '{2, }' as literal text, so the previous
        # pattern never matched a run of spaces.
        (r'[ ]{2,}', r' '),
        # insert space between punctuation marks and neighbouring words
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
    ]

    cleaned = df['text']
    for pattern, replacement in substitutions:
        # regex=True is explicit because pandas >= 2.0 defaults to literal
        # matching, which would silently turn every pattern above into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # lower case and strip white spaces at both ends
    df['tweet_proc'] = cleaned.str.lower().str.strip()

    return df

# clean text

In [9]:
# Adds the cleaned 'tweet_proc' column; process_tweet modifies the frame in place and returns it.
unlabel = process_tweet(unlabel)

# convert emoji into words and remove non-alphabetic characters

In [10]:
# Turn the two ASCII emoticons into words first, so they survive the character filter below.
# Patterns are raw strings (no invalid '\)' escapes) and regex=True is explicit, because
# pandas >= 2.0 treats str.replace patterns as literal text by default.
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r':-\)', 'smile', regex=True)
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r':-\(', 'trist', regex=True)
# Remove every character that is not a lower-case letter a-z, å, ø, æ or a space.
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r'[^a-zåøæ ]', '', regex=True)

# Remove stop words

In [12]:
# NLTK's Norwegian stop-word list without the negations 'ikke' / 'ikkje', so those two
# words stay in the text (presumably because negation matters to the classifier -- confirm).
# Filtering instead of list.remove() also avoids a ValueError if a word is ever absent.
stop_words = [w for w in stopwords.words('norwegian') if w not in ('ikke', 'ikkje')]

# Set copy for constant-time membership tests; the check runs once per token of every comment.
stop_word_lookup = set(stop_words)

unlabel['tweet_proc'] = unlabel['tweet_proc'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_word_lookup)
)

In [13]:
# Display the frame to inspect 'tweet_proc' after cleaning and stop-word removal.
unlabel

Unnamed: 0,post_id,comment_id,author_id,text,tweet_proc
0,1917260805119180,1917262545119006,b197158e1e9e67272ba8a797616514cc,Det er helt korrekt dette... disse 2 hadde et ...,helt korrekt samarbeid vennskap under krigen f...
1,1917260805119180,1917263215118939,b197158e1e9e67272ba8a797616514cc,Men dette er det mange som enten ikke veit ell...,enten ikke veit benekter alt verdt gjør mye
2,1917260805119180,1917345138444080,77c2ce38e63a629436ac616aec515ad1,Islamsk nazist\n\nRopet til nazistisk islam (A...,islamsk nazist ropet nazistisk islam allah akb...
3,1917260805119180,1917438355101425,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano,rebecca romano
4,1917260805119180,1917473961764531,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano,rebecca romano
...,...,...,...,...,...
25581,638243483020925,638315766347030,09aee10bce10c937d9e33452486f09c8,Dette bilde har ingenting m innlegget å gjøre....,bilde ingenting m innlegget gjøre gutten misha...
25582,638243483020925,638838086294798,afd2a1de527d52d1c5d31fea96b65fbc,dette er en helt annen sak tror eg.,helt annen sak tror
25583,637654413079832,637701513075122,77b1fe423e999e92a12e4cd5024241ef,Det går nok ikke lenge Før tullingene får gjor...,går nok ikke lenge tullingene får gjort riksda...
25584,637501219761818,637667176411889,4a5af9abb22604ac3f0d0452a8cb5c43,Mange idioter.,idioter


# load trained tokenizer

In [14]:
# Build the tokenizer path from `base`, like the data and model paths, instead of repeating
# an absolute path. (The previous literal spelled the mount 'MyDrive' while `base` uses
# 'My Drive'; both spellings were read successfully in this notebook's recorded run.)
with open(base + 'models/Models 16_10_2021/tokenizer.json_16102021_v1', encoding='utf-8') as f:
    data = json.load(f)

# Rebuild the fitted Keras tokenizer from its JSON export; the file is no longer needed here.
tokenizer_trained = tokenizer_from_json(data)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer_trained.word_index) + 1
print('vocab_size: ', vocab_size)

vocab_size:  11852


# convert words into sequences

In [15]:
# Map each cleaned comment to a list of integer word ids using the loaded tokenizer.
comment_sequences = tokenizer_trained.texts_to_sequences(unlabel['tweet_proc'].values)

# Bring every sequence to length 128: shorter ones are padded at the end,
# longer ones are cut at the end.
unseen = pad_sequences(
    comment_sequences,
    maxlen=128,
    padding='post',
    truncating='post',
)

# load trained model

In [16]:
 # Load the saved Keras model from the 'models/model/' directory under `base`.
 model = tf.keras.models.load_model(base+'models/model/')

# predict result of unseen data using trained model

In [17]:
# Raw model outputs for the padded sequences, one entry per comment.
scores = model.predict(unseen)

# Binarise with a 0.55 cut-off: strictly above it becomes class 1, everything else class 0.
yhat = [
    1 if score > 0.55 else 0
    for score in scores
]

# Attach the predicted class to the comments and show the frame.
unlabel['Result'] = yhat
unlabel

Unnamed: 0,post_id,comment_id,author_id,text,tweet_proc,Result
0,1917260805119180,1917262545119006,b197158e1e9e67272ba8a797616514cc,Det er helt korrekt dette... disse 2 hadde et ...,helt korrekt samarbeid vennskap under krigen f...,1
1,1917260805119180,1917263215118939,b197158e1e9e67272ba8a797616514cc,Men dette er det mange som enten ikke veit ell...,enten ikke veit benekter alt verdt gjør mye,1
2,1917260805119180,1917345138444080,77c2ce38e63a629436ac616aec515ad1,Islamsk nazist\n\nRopet til nazistisk islam (A...,islamsk nazist ropet nazistisk islam allah akb...,0
3,1917260805119180,1917438355101425,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano,rebecca romano,1
4,1917260805119180,1917473961764531,274b68192b056e268f128ff63bfcd4a4,Rebecca Romano,rebecca romano,1
...,...,...,...,...,...,...
25581,638243483020925,638315766347030,09aee10bce10c937d9e33452486f09c8,Dette bilde har ingenting m innlegget å gjøre....,bilde ingenting m innlegget gjøre gutten misha...,1
25582,638243483020925,638838086294798,afd2a1de527d52d1c5d31fea96b65fbc,dette er en helt annen sak tror eg.,helt annen sak tror,1
25583,637654413079832,637701513075122,77b1fe423e999e92a12e4cd5024241ef,Det går nok ikke lenge Før tullingene får gjor...,går nok ikke lenge tullingene får gjort riksda...,0
25584,637501219761818,637667176411889,4a5af9abb22604ac3f0d0452a8cb5c43,Mange idioter.,idioter,1


In [18]:
# number of predictions belonging to each class
unlabel['Result'].value_counts()

1    22047
0     3535
Name: Result, dtype: int64

In [None]:
# Write the frame (ids, original text, cleaned text, predicted class) to a CSV under `base`,
# leaving out the DataFrame index.
unlabel.to_csv(
    base + 'models/Models 16_10_2021/comments_SIAN_post_id_with_results_16_10_2021_model_ANN_with_more_data2-09-0.7967.csv',
    index=False,
)