In [4]:
import pandas as pd
filepath_dict={
    'yelp':'yelp_labelled.txt',
    'amazon':'amazon_cells_labelled.txt',
    'imdb':'imdb_labelled.txt'
}

In [10]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')  
    df['source'] = source  
    df_list.append(df)  

df = pd.concat(df_list, ignore_index=True)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
2743,I just got bored watching Jessice Lange take h...,0,imdb
2744,"Unfortunately, any virtue in this film's produ...",0,imdb
2745,"In a word, it is embarrassing.",0,imdb
2746,Exceptionally bad!,0,imdb


In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.python import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Bidirectional,Dense

In [12]:
X = df['sentence'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
tokenizer=Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [25]:
X_train_seq=tokenizer.texts_to_sequences(X_train)
X_test_seq=tokenizer.texts_to_sequences(X_test)

In [26]:
max_length = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [27]:
vocab_size=len(tokenizer.word_index )+1
embedding_dim=128

model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length))
model.add(Bidirectional(LSTM(64,return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1400, 128)         602624    
                                                                 
 bidirectional (Bidirection  (None, 1400, 128)         98816     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 744769 (2.84 MB)
Trainable params: 744769 

In [30]:
model.fit(X_train_pad,y_train,epochs=3,batch_size=32,validation_data=(X_test_pad, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x28918070190>

In [31]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')

Loss: 0.5628, Accuracy: 0.8127


In [41]:
def predict_sentiment(text, tokenizer, model, max_length):
    seq = tokenizer.texts_to_sequences([text])
    seq_pad = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(seq_pad)[0][0]
    sentiment = "pos 😀" if prediction >= 0.5 else "neg 😞"
    
    return sentiment, prediction


text_input = "my name is moaz and i gonna kill u"
sentiment, score = predict_sentiment(text_input, tokenizer, model, max_length)

print(f"التصنيف: {sentiment} (الدرجة: {score:.4f})")

التصنيف: neg 😞 (الدرجة: 0.0266)
