In [72]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('/content/all-data.csv', encoding='latin-1', header=None)
df.columns = ['Sentiment', 'Text']
df.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


Data Preprocessing


In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess (text):
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  tokens = word_tokenize(text)
  tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  return ' '.join(tokens)

df['Text'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Sentiment,Text
0,neutral,according gran company plan move production ru...
1,neutral,technopolis plan develop stage area le 100000 ...
2,negative,international electronic industry company elco...
3,positive,new production plant company would increase ca...
4,positive,according company updated strategy year 200920...


Feature Extraction


In [16]:
# convert into numeric
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Text'])
y = df['Sentiment']

In [80]:
from sklearn.preprocessing import LabelEncoder
import pickle
le = LabelEncoder()
df['labelled_y'] = le.fit_transform(y)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)


In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model Fitting

In [77]:

from sklearn.metrics import classification_report, confusion_matrix

Fit LSTM


In [65]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [81]:
# prepare data fro LSTM
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Text'])
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
sequences = tokenizer.texts_to_sequences(df['Text'])

In [61]:
max_length = max([len(seq) for seq in sequences])
X = pad_sequences(sequences, maxlen=max_length)
y = df['labelled_y'].values


array([1, 1, 0, ..., 0, 0, 0])

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [68]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(len(le.classes_), activation='softmax'))  # Adjust for the number of classes

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [69]:
history = model.fit(X_train, y_train, epochs=10, batch_size=8, validation_data=(X_test, y_test))


Epoch 1/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 80ms/step - accuracy: 0.6099 - loss: 0.9059 - val_accuracy: 0.7093 - val_loss: 0.6893
Epoch 2/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 70ms/step - accuracy: 0.8247 - loss: 0.4781 - val_accuracy: 0.7330 - val_loss: 0.6490
Epoch 3/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 69ms/step - accuracy: 0.9357 - loss: 0.2014 - val_accuracy: 0.7278 - val_loss: 0.9733
Epoch 4/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 68ms/step - accuracy: 0.9739 - loss: 0.0928 - val_accuracy: 0.7320 - val_loss: 1.0869
Epoch 5/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 70ms/step - accuracy: 0.9812 - loss: 0.0665 - val_accuracy: 0.7289 - val_loss: 1.3183
Epoch 6/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 68ms/step - accuracy: 0.9914 - loss: 0.0373 - val_accuracy: 0.7072 - val_loss: 1.4269
Epoch 7/10
[1m4

In [70]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.7359 - loss: 1.7555
Test Accuracy: 0.72


In [74]:
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

# Print out predictions
#for i in range(len(X_test)):
#    print(f"Text: {df['Text'].iloc[i]} - Predicted Label: {le.classes_[predicted_classes[i]]}")


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step
Text: according gran company plan move production russia although company growing - Predicted Label: 1
Text: technopolis plan develop stage area le 100000 square meter order host company working computer technology telecommunication statement said - Predicted Label: 1
Text: international electronic industry company elcoteq laid ten employee tallinn facility contrary earlier layoff company contracted rank office worker daily postimees reported - Predicted Label: 2
Text: new production plant company would increase capacity meet expected increase demand would improve use raw material therefore increase production profitability - Predicted Label: 1
Text: according company updated strategy year 20092012 basware target longterm net sale growth range 20 40 operating profit margin 10 20 net sale - Predicted Label: 1
Text: financing aspocomp growth aspocomp aggressively pursuing growth strategy increasingly focusing techn

In [78]:
model.save('sentiment_model.h5')

