In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Reduce the raw dump to its first and last columns, name them 'rating' and
# 'text', and save the result back under the same file name.
#
# The raw file is decoded as latin-1, but to_csv writes UTF-8 by default.
# Re-running this cell used to decode that UTF-8 output as latin-1 again,
# garbling non-ASCII characters a little more on every run. The header check
# below turns the conversion into a one-time step.
# NOTE(review): read_csv treats the first row as the header here -- confirm
# the raw file really has a header row, otherwise a data row is being lost.
header = pd.read_csv('nowedane.csv', encoding='latin-1', nrows=0)
data = None
if list(header.columns) == ['rating', 'text']:
    try:
        # Already converted by an earlier run, so the file is UTF-8 now.
        data = pd.read_csv('nowedane.csv')
    except UnicodeDecodeError:
        # Header matches but the bytes are not UTF-8: treat it as raw input.
        data = None
if data is None:
    data = pd.read_csv('nowedane.csv', encoding='latin-1')
    data = data.iloc[:, [0, -1]]
    data = data.rename(columns={data.columns[0]: 'rating', data.columns[-1]: 'text'})
    data.to_csv('nowedane.csv', index=False)

In [8]:
def preprocess_rating(rating):
    """Translate a numeric rating into a sentiment label.

    Returns 'negative' when the rating equals 0, 'positive' when it equals 4,
    and 'neutral' for every other value.
    """
    if rating == 0:
        return 'negative'
    return 'positive' if rating == 4 else 'neutral'

In [3]:
# Load the two-column ('rating', 'text') CSV that the earlier cell wrote.
all_data = pd.read_csv(filepath_or_buffer='./nowedane.csv')


In [4]:
# Two-stage split with a fixed seed: half of the rows become the training set,
# and the held-out half is then divided evenly into validation and test sets.
train_set, temp_set = train_test_split(
    all_data, random_state=22, test_size=0.5
)
valid_set, test_set = train_test_split(
    temp_set, random_state=22, test_size=0.5
)

In [5]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
# Rebinding to the returned frame (instead of inplace=True) keeps the cell free
# of in-place mutation and yields new DataFrame objects, so later column
# assignments no longer operate on the objects handed back by train_test_split.
train_set = train_set.reset_index(drop=True)
valid_set = valid_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [9]:
# Derive the 'sentiment' label column from the numeric 'rating' column.
# assign() builds a new frame instead of writing a column into the frames that
# came out of train_test_split; on pandas versions without copy-on-write that
# direct write can emit SettingWithCopyWarning.
train_set = train_set.assign(sentiment=train_set['rating'].apply(preprocess_rating))
valid_set = valid_set.assign(sentiment=valid_set['rating'].apply(preprocess_rating))
test_set = test_set.assign(sentiment=test_set['rating'].apply(preprocess_rating))

In [10]:
# Discard every row whose 'text' value is missing, in all three splits.
train_set, valid_set, test_set = (
    frame.dropna(subset=['text'])
    for frame in (train_set, valid_set, test_set)
)

In [11]:
# Build the word index from the training texts only; the validation and test
# texts are not used for fitting.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set['text'])

# Embedding input dimension: one more than the number of indexed words.
# NOTE(review): the extra slot presumably covers index 0, which pad_sequences
# uses as its padding value -- confirm.
vocab_size = 1 + len(tokenizer.word_index)

In [12]:
# Convert each split's texts into lists of word indices with the fitted tokenizer.
train_sequences, valid_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (train_set, valid_set, test_set)
)

In [14]:
# Fixed sequence length the model sees for every input.
max_length = 400

# Pad all three splits to max_length, appending the padding after the tokens.
train_padded, valid_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, valid_sequences, test_sequences)
)

In [15]:
# Small feed-forward classifier: 32-dimensional embedding -> flatten ->
# 64-unit ReLU layer -> 3-way softmax (one output per sentiment class).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 32, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Categorical cross-entropy matches the one-hot labels passed to fit().
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Class order: a label's position in this list is its class index.
sentiment_labels = ['negative', 'neutral', 'positive']

# One-hot encode each split's 'sentiment' column using that class index.
train_labels, valid_labels, test_labels = (
    tf.keras.utils.to_categorical(
        frame['sentiment'].map(sentiment_labels.index), num_classes=3
    )
    for frame in (train_set, valid_set, test_set)
)

# Train for a single epoch, evaluating on the validation split after it.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=1,
    validation_data=(valid_padded, valid_labels),
)



In [None]:
# Pull the per-epoch loss and accuracy curves out of the object returned by fit().
training_loss, training_accuracy = (
    history.history['loss'],
    history.history['accuracy'],
)
validation_loss, validation_accuracy = (
    history.history['val_loss'],
    history.history['val_accuracy'],
)

# Persist the trained model to disk.
model.save('newmodel.h5')

In [17]:
def classify_sentiment(text):
    """Return the predicted sentiment label for a single piece of text.

    The text is converted to word indices with the notebook-level
    ``tokenizer``, post-padded to ``max_length``, and scored by ``model``.
    The result is the entry of ``sentiment_labels`` at the position of the
    highest score.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_length,
        padding='post',
    )
    scores = model.predict(padded)
    return sentiment_labels[scores.argmax()]


In [27]:
# Classify one hand-written example and print the result.
input_text = "Shame on you"
classification = classify_sentiment(text=input_text)
print("Sentiment:", classification)

Sentiment: negative


In [13]:
import pickle

# Serialise the fitted tokenizer to 'tokenizer.pickle' so it can be reloaded
# later. Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('tokenizer.pickle', mode='wb') as handle:
    pickle.dump(obj=tokenizer, file=handle, protocol=pickle.HIGHEST_PROTOCOL)