In [85]:
#!pip install scikeras
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,cohen_kappa_score, precision_score, f1_score,recall_score,make_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout,LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [6]:
folder_path = ""
file_names = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']

# Load data from multiple files
dfs = []
for file_name in file_names:
    file_path = f"{file_name}"
    df = pd.read_csv(file_path, sep='\t', header=None, names=['sentence', 'label'])
    dfs.append(df)

# Concatenate data from different files
df = pd.concat(dfs, ignore_index=True)

# Task 2: Preprocess the text data
# Tokenization, lowercasing, and removing stopwords using NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

df['processed_sentence'] = df['sentence'].apply(preprocess_text)
print(df)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                               sentence  label  \
0     So there is no way for me to plug it in here i...      0   
1                           Good case, Excellent value.      1   
2                                Great for the jawbone.      1   
3     Tied to charger for conversations lasting more...      0   
4                                     The mic is great.      1   
...                                                 ...    ...   
2743  I think food should have flavor and texture an...      0   
2744                           Appetite instantly gone.      0   
2745  Overall I was not impressed and would not go b...      0   
2746  The whole experience was underwhelming, and I ...      0   
2747  Then, as if I hadn't wasted enough of my life ...      0   

                                     processed_sentence  
0                       way plug us unless go converter  
1                             good case excellent value  
2                                

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df['processed_sentence'], df['label'], test_size=0.2, random_state=42)

dummy_clf = make_pipeline(CountVectorizer(), DummyClassifier(strategy='most_frequent'))
dummy_clf.fit(X_train, y_train)
y_pred_dummy = dummy_clf.predict(X_test)

# Evaluate DummyClassifier performance
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
precision_dummy = precision_score(y_test, y_pred_dummy)
recall_dummy = recall_score(y_test, y_pred_dummy)
f1_dummy = f1_score(y_test, y_pred_dummy)

print("DummyClassifier Performance:")
print(f"Accuracy: {accuracy_dummy}")
print(f"Precision: {precision_dummy}")
print(f"Recall: {recall_dummy}")
print(f"F1-score: {f1_dummy}")

DummyClassifier Performance:
Accuracy: 0.4709090909090909
Precision: 0.4709090909090909
Recall: 1.0
F1-score: 0.6402966625463535
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(df['processed_sentence'], df['label'], test_size=0.2, random_state=42)


max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



history = model.fit(X_train_pad, y_train_encoded, validation_split=0.2,
                    epochs=25, batch_size=128)

y_pred = model.predict(X_test_pad)
y_pred_binary = (y_pred > 0.5).astype(int)


accuracy_rnn = accuracy_score(y_test_encoded, y_pred_binary)
precision_rnn = precision_score(y_test_encoded, y_pred_binary)
recall_rnn = recall_score(y_test_encoded, y_pred_binary)
f1_rnn = f1_score(y_test_encoded, y_pred_binary)

print("RNN Performance:")
print(f"Accuracy: {accuracy_rnn}")
print(f"Precision: {precision_rnn}")
print(f"Recall: {recall_rnn}")
print(f"F1-score: {f1_rnn}")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
RNN Performance:
Accuracy: 0.7054545454545454
Precision: 0.6816479400749064
Recall: 0.7027027027027027
F1-score: 0.6920152091254753


In [75]:
rnn_classifier = KerasClassifier(build_fn=model, epochs=25, batch_size=128, verbose=0)

# Definir los parámetros a buscar
param_grid = {
    'optimizer': ["adam","sgd" ,"rmsprop" ],
    'loss': ["binary_crossentropy"],
    'activation': ["tanh", "relu","sigmoid"],
    "layers":[[20],[40,20], [45, 30, 15]],
}

# Crear el objeto GridSearchCV
grid = GridSearchCV(estimator=rnn_classifier, param_grid=param_grid, scoring='accuracy', cv=3)

# Preprocesar los datos nuevamente para asegurarse de que todo está en orden
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

y_train_encoded = label_encoder.fit_transform(y_train)

# Ejecutar la búsqueda de hiperparámetros
grid_result = grid.fit(X_train_pad, y_train_encoded)

# Imprimir los resultados
print(f"Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Accuracy: {mean} (±{stdev}) with: {param}")




AttributeError: ignored

In [78]:
print(model.summary())

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 100, 128)          1280000   
                                                                 
 simple_rnn_12 (SimpleRNN)   (None, 64)                12352     
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1292417 (4.93 MB)
Trainable params: 1292417 (4.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(df['processed_sentence'], df['label'], test_size=0.2, random_state=42)


max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])



history = model.fit(X_train_pad, y_train_encoded, validation_split=0.2,
                    epochs=25, batch_size=128)

y_pred = model.predict(X_test_pad)
y_pred_binary = (y_pred > 0.5).astype(int)


accuracy_rnn = accuracy_score(y_test_encoded, y_pred_binary)
precision_rnn = precision_score(y_test_encoded, y_pred_binary)
recall_rnn = recall_score(y_test_encoded, y_pred_binary)
f1_rnn = f1_score(y_test_encoded, y_pred_binary)
kappa_rnn=cohen_kappa_score(y_test_encoded,y_pred_binary)

print("RNN Performance:")
print(f"Accuracy: {accuracy_rnn}")
print(f"Precision: {precision_rnn}")
print(f"Recall: {recall_rnn}")
print(f"F1-score: {f1_rnn}")
print(f"Kappa-score: {kappa_rnn}")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
RNN Performance:
Accuracy: 0.7290909090909091
Precision: 0.7217741935483871
Recall: 0.6911196911196911
F1-score: 0.7061143984220907


In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(df['processed_sentence'], df['label'], test_size=0.2, random_state=42)


max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


model_lstm  = Sequential()
model_lstm .add(Embedding(max_words, 128, input_length=max_len))
model_lstm .add(LSTM(64))
model_lstm .add(Dropout(0.5))
model_lstm .add(Dense(1, activation='softmax'))


model_lstm .compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



history = model_lstm .fit(X_train_pad, y_train_encoded, validation_split=0.2,
                    epochs=25, batch_size=128)

y_pred = model_lstm.predict(X_test_pad)
y_pred_binary = (y_pred > 0.5).astype(int)


accuracy_lstm = accuracy_score(y_test_encoded, y_pred_binary)
precision_lstm = precision_score(y_test_encoded, y_pred_binary)
recall_lstm = recall_score(y_test_encoded, y_pred_binary)
f1_lstm = f1_score(y_test_encoded, y_pred_binary)
kappa_lstm=cohen_kappa_score(y_test_encoded,y_pred_binary)

print("LSTM Performance:")
print(f"Accuracy: {accuracy_lstm}")
print(f"Precision: {precision_lstm}")
print(f"Recall: {recall_lstm}")
print(f"F1-score: {f1_lstm}")
print(f"Kappa-score: {kappa_lstm}")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
LSTM Performance:
Accuracy: 0.4709090909090909
Precision: 0.4709090909090909
Recall: 1.0
F1-score: 0.6402966625463535
Kappa-score: 0.0


In [88]:
lstm_classifier = KerasClassifier(build_fn=model_lstm, epochs=25, batch_size=128, verbose=0)

# Definir los parámetros a buscar
param_grid = {
    'optimizer': ["adam","sgd" ,"rmsprop" ],
    'loss': ["binary_crossentropy"],
}

# Crear el objeto GridSearchCV
grid = GridSearchCV(estimator=lstm_classifier, param_grid=param_grid, scoring='accuracy', cv=3)

# Preprocesar los datos nuevamente para asegurarse de que todo está en orden
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

y_train_encoded = label_encoder.fit_transform(y_train)

# Ejecutar la búsqueda de hiperparámetros
grid_result = grid.fit(X_train_pad, y_train_encoded)

# Imprimir los resultados
print(f"Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Accuracy: {mean} (±{stdev}) with: {param}")


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


Best Accuracy: 0.5127386516971201 using {'loss': 'binary_crossentropy', 'optimizer': 'adam'}
Accuracy: 0.5127386516971201 (±0.00031365116394637787) with: {'loss': 'binary_crossentropy', 'optimizer': 'adam'}
Accuracy: 0.5127386516971201 (±0.00031365116394637787) with: {'loss': 'binary_crossentropy', 'optimizer': 'sgd'}
Accuracy: 0.5127386516971201 (±0.00031365116394637787) with: {'loss': 'binary_crossentropy', 'optimizer': 'rmsprop'}
