In [1]:
import gensim
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Read the labelled articles and shorten the name of the text column.
df = pd.read_csv("sentimentsdata_V3.csv").rename(columns={"News Article": "Article"})

In [3]:
# Fold the lower-case 'sadness' label into the capitalised 'Sadness' spelling.
df['Remark'] = df['Remark'].replace('sadness', 'Sadness')

In [4]:
# Show the distinct values of the 'Remark' column.
df['Remark'].unique()

array(['Anger', 'Disgust', 'Fear', 'Sadness', 'Anticipation', 'Joy',
       'Surprise', 'Trust'], dtype=object)

In [5]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV exports -- confirm against the data file).
columns_to_drop = ['Unnamed: 0', 'Unnamed: 0.1']
df = df.drop(columns_to_drop, axis=1)

In [6]:
# Remove the articles that mention neither "panzer" nor "leopard".
rows_without_keywords = [118, 239, 583]
df = df.drop(index=rows_without_keywords)

In [7]:
import fasttext.util

# Language code and file name of the fastText model used for the embeddings.
FASTTEXT_LANG = 'de'
FASTTEXT_MODEL_FILE = 'cc.de.300.bin'

# if_exists='ignore' is passed so that an already present model file is not an error.
fasttext.util.download_model(FASTTEXT_LANG, if_exists='ignore')
ft = fasttext.load_model(FASTTEXT_MODEL_FILE)



In [10]:
# Tokenise every article with gensim's simple_preprocess.
Articles = df['Article'].apply(gensim.utils.simple_preprocess)

In [11]:
def extract_from_text(texts, delta=30, keywords=("panzer", "leopard"), keep_empty=False):
    """Cut out the passages that surround keyword hits in tokenised texts.

    For every text, each token that contains one of ``keywords`` as a
    substring marks a hit. A window of ``delta`` tokens on either side of the
    hit is taken, overlapping windows are merged, and the resulting passages
    are joined into one string in which every passage is preceded by " | ".

    Parameters
    ----------
    texts : iterable of sequences of str
        One token sequence per document.
    delta : int, default 30
        Number of tokens kept before and after each hit. Must be >= 0.
    keywords : str or iterable of str, default ("panzer", "leopard")
        Substrings that mark a token as a hit.
    keep_empty : bool, default False
        When False, texts without any hit produce no entry, so the result
        can be shorter than ``texts`` and positions no longer line up with
        the input. Pass True to emit an empty string for such texts and keep
        a one-to-one correspondence with ``texts``.

    Returns
    -------
    list of str
        One string per text with at least one hit (or per text at all when
        ``keep_empty`` is True).

    Raises
    ------
    ValueError
        If ``delta`` is negative.
    """
    if delta < 0:
        raise ValueError("delta must be non-negative")
    if isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        keywords = (keywords,)
    keywords = tuple(keywords)

    sections = []
    for words in texts:
        hits = [
            position
            for position, word in enumerate(words)
            if any(keyword in word for keyword in keywords)
        ]
        if not hits:
            if keep_empty:
                sections.append("")
            continue

        # Hits are in ascending order, so a single left-to-right pass is
        # enough to merge windows that overlap the previous one.
        merged = []
        for position in hits:
            start, end = position - delta, position + delta
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])

        parts = ""
        for start, end in merged:
            # Clamp the left edge at 0; slicing already stops at the end of
            # the text when the window runs past it.
            window = words[max(start, 0):end + 1]
            parts += " | " + " ".join(window)
        sections.append(parts)

    return sections

# Keep only the keyword passages of every tokenised article (default window size).
extracted = extract_from_text(texts=Articles)

In [12]:
# Tokenise the extracted passage strings again, one token list per article.
Articles_processed = list(map(gensim.utils.simple_preprocess, extracted))

In [13]:
def tokens_to_embeddings(tokens, model, dim=300):
    """Map every token to its vector in ``model``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : mapping-like
        Object that supports ``token in model`` and ``model[token]``; the
        lookup goes through this argument rather than a module-level model.
    dim : int, default 300
        Length of the all-zero vector used for tokens that ``model`` does
        not contain.

    Returns
    -------
    list
        One entry per token, in order: ``model[token]`` for known tokens and
        a fresh list of ``dim`` zeros for unknown ones.
    """
    embeddings = []
    for token in tokens:
        if token in model:
            embeddings.append(model[token])
        else:
            # A new list per token so that rows never alias each other.
            embeddings.append([0.0] * dim)
    return embeddings

# One list of fastText vectors per article.
X = [tokens_to_embeddings(article_tokens, model=ft) for article_tokens in Articles_processed]

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every article is padded or truncated at its end to max_len token vectors.
max_len = 300
X_padded = pad_sequences(
    X,
    maxlen=max_len,
    dtype='float32',
    padding='post',
    truncating='post',
)


In [14]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features and labels are paired purely by position, so stop right here if an
# earlier step left them with different lengths.
labels = df['Sentiment'].values
assert len(X_padded) == len(labels), (
    f"feature/label length mismatch: {len(X_padded)} vs {len(labels)}"
)

# A fixed random_state makes the 80/20 split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_padded, labels, test_size=0.2, random_state=42
)


In [15]:
from tensorflow.keras.utils import to_categorical

num_classes = 8

# One-hot encode the train and test labels with the same fixed width.
y_train_encoded, y_test_encoded = (
    to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (y_train, y_test)
)
y_test_encoded.shape

(151, 8)

In [28]:
# Show the shape of the training features.
x_train.shape

(602, 300, 300)

In [29]:
# Show the shape of the held-out features.
x_test.shape

(151, 300, 300)

In [30]:
# Show the shape of the one-hot training labels.
y_train_encoded.shape

(602, 8)

In [31]:
# Show the shape of the one-hot held-out labels.
y_test_encoded.shape

(151, 8)

In [16]:
# Take Sequential from tensorflow.keras, the same package as the layers below
# and the other Keras imports in this notebook, rather than mixing it with the
# standalone `keras` package.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization

# Two stacked LSTM layers with dropout in between, followed by a softmax
# classifier.
model_lstm = Sequential()
# return_sequences=True hands the full sequence to the second LSTM.
model_lstm.add(LSTM(256, return_sequences=True))
model_lstm.add(Dropout(0.3))
# return_sequences=False keeps only the final state for the classifier.
model_lstm.add(LSTM(256, return_sequences=False))
# 8 output units, one per column of the one-hot encoded labels.
model_lstm.add(Dense(8, activation='softmax'))

In [17]:
# Compile the network defined as `model_lstm`; no variable called `model` is
# created anywhere in this notebook, so the call must go to `model_lstm` to
# work on a fresh kernel.
model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

In [18]:
# Train `model_lstm` (the only model this notebook defines) and keep the
# returned History object so the per-epoch metrics can be inspected later.
history = model_lstm.fit(
    x_train,
    y_train_encoded,
    validation_data=(x_test, y_test_encoded),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x269c702d970>