In [2]:
pip install pandas scikit-learn tensorflow nltk transformers


/bin/bash: /home/hamza/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m309.6 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m


Installing collected packages: nltk
Successfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Embedding, 
    Conv1D, 
    GlobalMaxPooling1D, 
    Dense, 
    Dropout, 
    SpatialDropout1D, 
    GRU, 
    LSTM, 
    Bidirectional, 
    SimpleRNN, 
    MaxPooling1D, 
    Flatten, 
    BatchNormalization, 
    SeparableConv1D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [5]:


# Load the training data
df_train = pd.read_csv('https://raw.githubusercontent.com/H-GRF/data/main/train.csv')

# A blurb can occur on several rows with different labels. Compute the most
# frequent `state` per blurb (ties resolve to the first value returned by
# Series.mode(), which is sorted), attach it to every row, and then keep a
# single row per blurb.
mode_state = df_train.groupby('blurb')['state'].apply(lambda x: x.mode().iloc[0]).reset_index()

df_train = pd.merge(df_train, mode_state, on='blurb', suffixes=('', '_mode'))
df_train = df_train.drop('state', axis=1)
df_train = df_train.rename(columns={'state_mode': 'state'})
train_data = df_train.drop_duplicates(subset="blurb").reset_index(drop=True)

# Load the test data
test_data = pd.read_csv('https://raw.githubusercontent.com/H-GRF/data/main/test.csv')

# Fill missing blurbs with the most frequent blurb of the respective set.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and is a silent
# no-op once pandas copy-on-write is enabled.
train_data['blurb'] = train_data['blurb'].fillna(train_data['blurb'].mode()[0])
test_data['blurb'] = test_data['blurb'].fillna(test_data['blurb'].mode()[0])

# Model inputs (raw text) and target
X_train = train_data['blurb']
y_train = train_data['state']
X_test = test_data['blurb']

# Encode the labels as integers 0..n_classes-1. The sigmoid output used later
# assumes `state` has exactly two classes -- TODO confirm against the data.
# Note: y_train becomes a NumPy array here, which the fold indexing relies on.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)


# Text-encoding hyper-parameters: vocabulary size passed to the tokenizer,
# fixed sequence length after padding, and the embedding width.
max_words, max_len, embedding_dim = 10000, 100, 300

# The vocabulary is learned from the training blurbs only; the test blurbs are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer word indices, train and test in one pass.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad / truncate every sequence to exactly max_len entries.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

def build_model():
    """Build and compile a freshly initialised CNN + RNN text classifier.

    Every call returns a new model with new random weights and a new Adam
    optimizer, so cross-validation folds do not share any trained state.

    Reads the module-level ``max_words``, ``embedding_dim`` and ``max_len``.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit.
    """
    net = Sequential(name="Text_Classification_Model")

    # Embedding layer (width taken from embedding_dim instead of a literal 300)
    net.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len, name="embedding_layer"))

    # Spatial dropout and batch normalization on the embedded sequence
    net.add(SpatialDropout1D(0.2, name="spatial_dropout"))
    net.add(BatchNormalization(name="batch_norm_1"))

    # Convolutional layers
    net.add(Conv1D(128, 3, activation='relu', name="conv1d_1"))
    net.add(BatchNormalization(name="batch_norm_2"))
    net.add(Conv1D(128, 5, activation='relu', name="conv1d_2"))
    net.add(BatchNormalization(name="batch_norm_3"))
    net.add(Conv1D(64, 7, activation='relu', name="conv1d_3"))
    net.add(MaxPooling1D(2, name="max_pooling"))
    net.add(SeparableConv1D(64, 3, activation='relu', name="separable_conv1d"))

    # Recurrent layers; all of them return full sequences
    net.add(Bidirectional(LSTM(64, return_sequences=True), name="bidirectional_lstm"))
    net.add(LSTM(64, return_sequences=True, name="lstm"))
    net.add(Bidirectional(GRU(64, return_sequences=True), name="bidirectional_gru"))
    net.add(GRU(64, return_sequences=True, name="gru"))
    net.add(SimpleRNN(32, return_sequences=True, name="simple_rnn"))
    net.add(Dropout(0.3, name="dropout_1"))

    # Dense layers; the input is still a sequence, so they act on each timestep
    net.add(Dense(128, activation='relu', name="dense_1"))
    net.add(BatchNormalization(name="batch_norm_4"))
    net.add(Dropout(0.5, name="dropout_2"))
    net.add(Dense(64, activation='relu', name="dense_2"))
    net.add(Dropout(0.3, name="dropout_3"))

    # Global max pooling collapses the time axis and already yields a 2-D
    # tensor, so the former Flatten layer was a no-op and has been removed.
    net.add(GlobalMaxPooling1D(name="global_max_pooling"))

    # Final classification layer
    net.add(Dense(1, activation='sigmoid', name="output_layer"))

    net.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy', 'AUC'])
    return net


# Show the architecture once
model = build_model()
model.summary()

# Stratified k-fold cross-validation
num_folds = 10
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_val_losses = []   # best validation loss reached in each fold
fold_predictions = []  # test-set probabilities predicted by each fold's model

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_pad, y_train), 1):
    X_train_fold, X_val_fold = X_train_pad[train_idx], X_train_pad[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Start every fold from scratch. Re-using one model across folds means the
    # validation samples of this fold were already trained on in earlier
    # folds, which makes the validation metrics meaningless.
    model = build_model()

    # Patience must be smaller than the number of epochs, otherwise early
    # stopping can never trigger.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model on the current fold
    history = model.fit(X_train_fold, y_train_fold, epochs=15, batch_size=512, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stopping])

    fold_val_losses.append(min(history.history['val_loss']))
    fold_predictions.append(model.predict(X_test_pad))
    print(f"Fold {fold}/{num_folds}: best val_loss = {fold_val_losses[-1]:.4f}")

print(f"Mean best val_loss over {len(fold_val_losses)} folds: {sum(fold_val_losses) / len(fold_val_losses):.4f}")

# Predict on the test set: average the probabilities of the per-fold models
predictions = sum(fold_predictions) / len(fold_predictions)

# Create a DataFrame with the 'id' and 'state' columns for submission
submission_df = pd.DataFrame({'id': test_data['id'], 'state': predictions.flatten()})

# Save the submission file. Outside Kaggle the target directory may not exist;
# fall back to the working directory rather than losing the training run.
try:
    submission_df.to_csv('/kaggle/working/SUB.csv', index=False)
except OSError:
    submission_df.to_csv('SUB.csv', index=False)



KeyboardInterrupt: 