In [4]:
pip uninstall charset_normalizer

^C
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install charset_normalizer

Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional

# Read the training and test tables from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Build the vocabulary from the training text only (num_words=5000), then encode
# both splits as integer sequences with that same tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])
train_sequences, test_sequences = map(
    tokenizer.texts_to_sequences,
    (train_df['text'], test_df['text']),
)

# Bring every sequence to exactly max_length tokens; 'post' pads and truncates
# at the end of the sequence
max_length = 200
pad_kwargs = dict(maxlen=max_length, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)

# Define the RNN model
def create_model(optimizer='adam', units=64, dropout=0.2, recurrent_dropout=0.2):
    """Build and compile the bidirectional-LSTM text classifier.

    Args:
        optimizer: Optimizer passed to ``model.compile``.
        units: Number of units in the LSTM layer.
        dropout: ``dropout`` argument of the LSTM layer.
        recurrent_dropout: ``recurrent_dropout`` argument of the LSTM layer.

    Returns:
        A compiled ``Sequential`` model: embedding -> bidirectional LSTM ->
        8-unit softmax, trained with sparse categorical cross-entropy.
    """
    # Embedding expects inputs of the module-level max_length
    embedding = Embedding(input_dim=5000, output_dim=128, input_length=max_length)
    recurrent = Bidirectional(
        LSTM(units=units, dropout=dropout, recurrent_dropout=recurrent_dropout)
    )
    classifier = Dense(units=8, activation='softmax')

    model = Sequential([embedding, recurrent, classifier])
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Expose the Keras model builder through the scikit-learn estimator API so that
# GridSearchCV can construct and train a fresh model for each parameter setting
model = KerasClassifier(
    build_fn=create_model,
    epochs=10,
    batch_size=64,
    verbose=0,
)

# Candidate values for each create_model argument
param_grid = dict(
    optimizer=['adam', 'rmsprop'],
    units=[32, 64, 128],
    dropout=[0.2, 0.4, 0.6],
    recurrent_dropout=[0.2, 0.4, 0.6],
)

# Exhaustive search over the grid, scored with 3-fold cross-validation
grid = GridSearchCV(model, param_grid, cv=3)
grid_result = grid.fit(train_padded, train_df['label'])

# Report the best cross-validation score and the parameters that produced it,
# then persist the underlying Keras model of the refitted best estimator
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}')
best_model = grid_result.best_estimator_.model
best_model.save('best_rnn_model.h5')

# Make predictions on the test data and save them to a CSV file.
# `predict_classes` was removed from newer Keras releases; for a softmax output the
# predicted class is the index of the largest probability, so take the argmax of
# `predict` over the last axis instead (works on old and new versions alike).
test_pred = best_model.predict(test_padded).argmax(axis=-1)
test_df['label'] = test_pred
test_df[['id', 'label']].to_csv('submission.csv', index=False)

AttributeError: partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)

In [None]:
import pandas as pd
import tensorflow as tf
from keras.utils import multi_gpu_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional

# Discover the GPUs visible to TensorFlow and enable memory growth on each of them
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print('No GPU(s) detected')
else:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        print('GPU(s) are available')
    except RuntimeError as err:
        # Report the failure and continue without changing the setting
        print(err)

# Read the training and test tables from the working directory
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Fit the tokenizer (num_words=5000) on the training text only and reuse it to
# encode both splits as integer sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])
train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_df, test_df)
]

# Pad or truncate every sequence at its end so that all have max_length tokens
max_length = 200
train_padded, test_padded = [
    pad_sequences(encoded, maxlen=max_length, padding='post', truncating='post')
    for encoded in (train_sequences, test_sequences)
]

# Define the RNN model
def create_model(units=64, dropout=0.2, recurrent_dropout=0.2):
    """Build and compile the bidirectional-LSTM classifier, replicated across GPUs when possible.

    Reads the module-level ``max_length`` (input sequence length) and ``gpus``
    (list of detected GPU devices).

    Args:
        units: Number of units in the LSTM layer.
        dropout: ``dropout`` argument of the LSTM layer.
        recurrent_dropout: ``recurrent_dropout`` argument of the LSTM layer.

    Returns:
        A model compiled with the Adam optimizer and sparse categorical
        cross-entropy loss. It is wrapped by ``multi_gpu_model`` only when more
        than one GPU was detected; otherwise the plain ``Sequential`` model is
        returned.
    """
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
    model.add(Bidirectional(LSTM(units=units, dropout=dropout, recurrent_dropout=recurrent_dropout)))
    model.add(Dense(units=8, activation='softmax'))
    # multi_gpu_model rejects gpus < 2 with a ValueError, so only replicate the
    # model when there really are several GPUs; on CPU or a single GPU the plain
    # model is trained as-is.
    if len(gpus) > 1:
        model = multi_gpu_model(model, gpus=len(gpus))  # Use multi-GPU model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train the RNN model.
# Scale the batch size with the number of GPUs, but never below the single-device
# size: with no GPU detected, `64 * len(gpus)` would be 0, an invalid batch size.
batch_size = 64 * max(1, len(gpus))
model = create_model()
model.fit(train_padded, train_df['label'], epochs=10, batch_size=batch_size, validation_split=0.2)

# Make predictions on the test data and save them to a CSV file.
# `predict_classes` exists only on Sequential models in older Keras releases (not
# on the model returned by multi_gpu_model) and was removed in newer ones. The
# predicted class of a softmax output is the argmax of `predict` over the last axis.
test_pred = model.predict(test_padded).argmax(axis=-1)
test_df['label'] = test_pred
test_df[['id', 'label']].to_csv('submission.csv', index=False)
