In [1]:
import pandas as pd
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score,precision_score,f1_score

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [2]:
from google.colab import drive
drive.mount('/gdrive')
df = pd.read_csv('/gdrive/MyDrive/Sentiment_Dataset.csv')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   2748 non-null   object
 1   Sentiment  2748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 43.1+ KB


In [5]:
def preprocess_text(text):
    # Convertir el texto a minúscula
    text = text.lower()

    # Tokenizar el texto
    tokens = word_tokenize(text)

    # Eliminar las stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lematizar los tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Unir los tokens para obtener el texto procesado
    processed_text = ' '.join(tokens)

    return processed_text

In [7]:
df['Sentence']=df['Sentence'].astype('string')

In [10]:
dfc=df.copy()

In [11]:
dfc['Sentence']=df['Sentence'].apply(preprocess_text)

In [12]:
dfc.head()

Unnamed: 0,Sentence,Sentiment
0,wow ... loved place .,1
1,crust good .,0
2,tasty texture nasty .,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great price .,1


In [14]:
from sklearn.preprocessing import LabelEncoder

# Codificación de etiquetas
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df['Sentiment'])

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(dfc['Sentence'], y_encoded, test_size=0.2, random_state=42)

print(f"Training size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Training size: 2198
Test size: 550


In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Definir el tokenizador
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convertir texto a secuencias de números
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Hacer padding a las secuencias
maxlen = 100
train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post', truncating='post')


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Definir el modelo
model = Sequential()

# Capa de Embedding
model.add(Embedding(input_dim=10000,
                    output_dim=128,
                    input_length=100))

# Capa LSTM
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))

# Segunda capa LSTM
model.add(LSTM(32))
model.add(Dropout(0.2))

# Capa Densa de Salida
model.add(Dense(1, activation='sigmoid'))

# Compilar el modelo
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Resumen del modelo
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 100, 64)           49408     
                                                                 
 dropout (Dropout)           (None, 100, 64)           0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1341857 (5.12 MB)
Trainable params: 134185

In [17]:
#DummyClassifier
from sklearn.dummy import DummyClassifier

In [18]:
classifier = DummyClassifier(strategy="most_frequent")

In [19]:
classifier.fit(train_sequences, y_train)

In [20]:
predictions = classifier.predict(test_sequences)

In [21]:
print("Accuracy=",accuracy_score(y_test,predictions))
print("Kappa=",cohen_kappa_score(y_test,predictions))
print("Precision=",precision_score(y_test, predictions,average='micro'))
print("F1-score=",f1_score(y_test,predictions, average='micro'))

Accuracy= 0.4818181818181818
Kappa= 0.0
Precision= 0.4818181818181818
F1-score= 0.4818181818181818
