In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
# Location of the pre-processed sentiment dataset (relative to the notebook).
DATA_PATH = "all-data-processed.csv"
data = pd.read_csv(DATA_PATH)

In [6]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
data

Unnamed: 0,Sentiment,Sentence,Clean sentences
0,1,The core of Solidium 's investment strategy is...,core solidium investment strategy proper value...
1,-1,`` The implementation of these programs has ha...,implementation programs negative impacts 2006 ...
2,-1,"ADPnews - Aug 3 , 2009 - Finnish media group I...",adpnews aug 3 2009 finnish media group ilkkayh...
3,-1,Operating loss amounted to EUR 0.9 mn in the f...,operating loss amounted eur 09 mn first half 2...
4,1,TomTom has given assurances that it will conti...,tomtom given assurances continue sell maps com...
...,...,...,...
2721,1,"Via the Satlan acquisition , Teleste plans to ...",via satlan acquisition teleste plans expand ma...
2722,-1,"However , the growth margin slowed down due to...",however growth margin slowed due financial crisis
2723,1,Outotec 's net profit for the second quarter o...,outotec net profit second quarter 2007 jumped ...
2724,1,The restructuring creates a more efficient org...,restructuring creates efficient organization i...


In [7]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["Clean sentences"],
    data["Sentiment"],
    test_size=0.1,
    random_state=42,
)
# Show the shape of every split as the cell output.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((2453,), (273,), (2453,), (273,))

Tokenizing: convert each sentence (string) into a sequence of integer word indices

In [11]:
# Build the word -> integer-index vocabulary from the training sentences only,
# so that no vocabulary information comes from the held-out test split.
token = Tokenizer()
token.fit_on_texts(X_train)

In [34]:
# One more than the number of known words (the extra slot accounts for index 0).
vocab = len(token.index_word) + 1
# Report how many distinct words and how many training documents the tokenizer saw.
print(f"Vocabulary size={len(token.word_index)}")
print(f"Number of Documents={token.document_count}")

Vocabulary size=5656
Number of Documents=2453


In [14]:
# Replace every sentence with its list of integer word indices from the fitted tokenizer.
# NOTE(review): X_train / X_test are overwritten with their own transformed values, so this
# cell is not idempotent -- running it a second time passes integer sequences (not text)
# back into texts_to_sequences. Re-run the split cell first if this cell must be re-executed.
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

Padding: pad or truncate every sequence to the same fixed length

In [45]:
# Every sequence is padded (zeros appended at the end) or truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 30
X_train, X_test = (
    pad_sequences(split_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    for split_sequences in (X_train, X_test)
)
# Show the resulting (n_samples, MAX_SEQUENCE_LENGTH) shapes as the cell output.
X_train.shape, X_test.shape

((2453, 30), (273, 30))

In [46]:
from sklearn.preprocessing import LabelEncoder

num_classes = 2  # positive -> 1, negative -> 0

# Learn the label mapping from the training labels only, then apply that same
# mapping to both splits so train and test share one encoding.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout,MaxPooling1D
# Import Adam explicitly: the notebook never imports `tensorflow as tf`, so the
# previous `tf.optimizers.Adam` raised NameError on a fresh kernel.
from tensorflow.keras.optimizers import Adam


# Dimensionality of the learned word embeddings.
vec_size = 300
model = Sequential()
# Vocabulary size is the number of known words + 1 (the extra slot covers index 0).
# The input length reuses MAX_SEQUENCE_LENGTH so it cannot drift from the padding cell.
model.add(Embedding(len(token.index_word) + 1, vec_size, input_length=MAX_SEQUENCE_LENGTH))
# 64 filters over windows of 8 consecutive tokens, then halve the time axis.
model.add(Conv1D(64,8, activation="relu"))
model.add(MaxPooling1D(2))
model.add(Dropout(0.1))

# These Dense layers act on the still-3D tensor (the summary shows (None, 11, 8)),
# i.e. they are applied at every remaining time step before the global pooling.
model.add(Dense(8, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.1))
# Collapse the time axis, then emit a single sigmoid probability for the positive class.
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])
model.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 30, 300)           1697100   
                                                                 
 conv1d_8 (Conv1D)           (None, 23, 64)            153664    
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 11, 64)           0         
 1D)                                                             
                                                                 
 dropout_19 (Dropout)        (None, 11, 64)            0         
                                                                 
 dense_19 (Dense)            (None, 11, 8)             520       
                                                                 
 dropout_20 (Dropout)        (None, 11, 8)             0         
                                                      

In [50]:
# The callbacks were used without ever being imported, which raised NameError on a
# fresh kernel; import them here alongside their only use.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Upper bound on training epochs; early stopping normally ends training sooner.
epochs = 100
batch_size = 4

# Stop once the validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Persist the weights of the epoch with the best validation accuracy seen so far.
# NOTE(review): the two callbacks monitor different metrics, and `es` does not set
# restore_best_weights, so the in-memory `model` after fit() presumably holds the
# last epoch's weights rather than the checkpointed best -- confirm this is intended.
mc = ModelCheckpoint('./best_model/best_model_cnn1d.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
# 10% of the training data is held out for validation during fitting.
history = model.fit(X_train, y_train,  batch_size=batch_size, shuffle=True, validation_split=0.1, epochs=epochs, verbose=1, callbacks=[es, mc])

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.81301, saving model to ./best_model/best_model_cnn1d.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.81301 to 0.89431, saving model to ./best_model/best_model_cnn1d.h5
Epoch 3/100
Epoch 3: val_accuracy improved from 0.89431 to 0.91870, saving model to ./best_model/best_model_cnn1d.h5
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.91870
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.91870
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.91870
Epoch 7/100
Epoch 7: val_accuracy improved from 0.91870 to 0.92683, saving model to ./best_model/best_model_cnn1d.h5
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.92683
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.92683
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.92683
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.92683
Epoch 11: early stopping


In [51]:
def predictions(x):
    """Return hard 0/1 class labels for the inputs `x`.

    Each output of the module-level `model.predict(x)` is compared against 0.5:
    values strictly greater than 0.5 become 1, everything else becomes 0.

    Returns:
        list[int]: one label per row produced by `model.predict`.
    """
    labels = []
    for prob in model.predict(x):
        labels.append(1 if prob > 0.5 else 0)
    return labels

In [57]:
# First padded test sequence, taken as a row of a DataFrame built from X_test.
# NOTE(review): `test` is not referenced again in the visible cells -- possibly a
# leftover from debugging; confirm before relying on or removing it.
test = pd.DataFrame(X_test).iloc[0]

In [64]:
# Raw model outputs for the whole test set.
# NOTE(review): this rebinds the name `predictions`, replacing the helper function of
# the same name defined in an earlier cell; that function is no longer callable
# after this cell runs.
predictions = model.predict(X_test)



In [65]:
# Flatten the model output to 1-D so it can be used directly as a DataFrame column.
predictions = predictions.flatten() 

In [66]:
# Inverse of the tokenizer vocabulary: integer index -> word.
reverse_word_index = {
    index: word
    for word, index in token.word_index.items()
}

def sequences_to_texts(sequences, index_to_word=None, pad_index=0):
    """Convert integer-encoded sequences back into space-separated text.

    Padding positions are skipped. Previously every padding 0 was looked up in
    the vocabulary, missed, and was rendered as '?', so each padded phrase ended
    in a run of spurious '?' tokens.

    Args:
        sequences: iterable of integer sequences (e.g. rows of a padded array).
        index_to_word: optional index -> word mapping; defaults to the
            module-level `reverse_word_index`.
        pad_index: index used for padding, omitted from the output (default 0).

    Returns:
        list[str]: one decoded string per input sequence. Indices that are not
        in the mapping (other than `pad_index`) are rendered as '?'.
    """
    if index_to_word is None:
        # Resolved at call time so the function tracks the current global mapping.
        index_to_word = reverse_word_index
    return [
        ' '.join(index_to_word.get(i, '?') for i in sequence if i != pad_index)
        for sequence in sequences
    ]

# Decode the padded test sequences back to text so predictions can be read next to their input.
test_phrases = sequences_to_texts(X_test)

In [67]:
# Put the decoded phrase, its encoded true label and the model's predicted
# probability side by side, one row per test example.
results_df = pd.DataFrame(
    {
        'Phrase': test_phrases,
        'Actual Sentiment': y_test.flatten(),
        'Predicted Probability': predictions,
    }
)

# Preview the first ten rows.
print(results_df.iloc[:10])

                                              Phrase  Actual Sentiment  \
0  two companies also partner developing lowering...                 1   
1  three year turnaround program expected ensure ...                 1   
2  finnish plumbing heating systems supplier upon...                 0   
3  finnish meat company atria longer promise suff...                 0   
4  driver left car suspect kidnapped forced gunpo...                 0   
5  third original participants dropped due nausea...                 0   
6  finland snow storms brought trees power lines ...                 0   
7  finnish raisio diagnostics launching new ensur...                 1   
8  able 20 russian market advertising press purch...                 1   
9  broker initiated ag konecranes oyj buy 51 42 e...                 1   

   Predicted Probability  
0               0.997617  
1               0.999961  
2               0.004546  
3               0.000468  
4               0.002802  
5               0.00012

In [82]:
new_headlines = [
    "Stock markets rally as economy shows signs of recovery",
    "Severe weather warnings issued across the country",
    "Tech companies report record earnings this quarter"
]

# Tokenize and pad the new headlines exactly as the training data was prepared:
# reuse MAX_SEQUENCE_LENGTH instead of a hard-coded 30, and keep the default
# truncation side used by the training-time pad_sequences call (the previous
# truncating='post' here cut long inputs from the opposite end).
sequences = token.texts_to_sequences(new_headlines)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Make predictions; flatten the model output to one probability per headline.
new_predictions = model.predict(padded_sequences).flatten()

# Convert probabilities to the original sentiment labels: > 0.5 -> 1, otherwise -1.
predicted_classes = np.where(new_predictions > 0.5, 1, -1)

# Create DataFrame
new_results_df = pd.DataFrame({
    'Headline': new_headlines,
    'Predicted Probability': new_predictions,
    'Predicted Sentiment': predicted_classes
})

# Show the DataFrame
print(new_results_df)

                                            Headline  Predicted Probability  \
0  Stock markets rally as economy shows signs of ...               0.998472   
2  Tech companies report record earnings this qua...               0.989551   

   Predicted Sentiment  
0                    1  
1                   -1  
2                    1  
