In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
from tensorflow.python.layers.core import Dropout

In [2]:
# Read 'cleaned_text.csv' (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='cleaned_text.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4


In [18]:
# Inputs come from the 'Text' column, targets from the 'Label' column.
X, y = data['Text'], data['Label']

# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every partition.
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)
print("y_train Shape: ", y_train.shape)
print("y_test Shape: ", y_test.shape)

X_train Shape:  (333447,)
X_test Shape:  (83362,)
y_train Shape:  (333447,)
y_test Shape:  (83362,)


In [19]:
# Make sure every example is a Python string before tokenisation.
# Any missing text (NaN) is replaced with an empty string first: a bare
# astype(str) would turn NaN into the literal word "nan", which the tokenizer
# would then learn as if it were a real token. An empty string instead
# tokenises to an empty sequence. Row counts are unchanged, so the labels
# stay aligned.
X_train = X_train.fillna('').astype(str)
X_test = X_test.fillna('').astype(str)

In [41]:
# Build the word index from the training texts only, so the test split does
# not influence the vocabulary. num_words caps the indices that are emitted
# and words outside that cap (or never seen) map to the 'OOV' token.
tokenizer = Tokenizer(oov_token='OOV', num_words=50000)
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer token ids with the same tokenizer
# (training split first, then test split).
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [43]:
# Length of the longest tokenised training example; later cells use it as
# the common padded length and as the model's input length.
max_len = max(map(len, X_train_sequences))
print("Maximum sequence length (max_len):", max_len)

Maximum sequence length (max_len): 79


In [44]:
# Bring every sequence in both splits to exactly max_len entries, with the
# padding placed after the tokens (padding='post').
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train_sequences, X_test_sequences)
)

In [45]:
# Show the (samples, max_len) shape of each padded array, one item per line.
print(
    "X_train_padded:",
    X_train_padded.shape,
    "\nX_test_padded:",
    X_test_padded.shape,
    sep="\n",
)

X_train_padded:
(333447, 79)

X_test_padded:
(83362, 79)


In [31]:
# This cell previously appended a trailing axis with np.expand_dims, turning
# the padded arrays into (samples, max_len, 1) and adding yet another axis on
# every re-run. The Embedding layer that consumes these arrays takes 2-D
# integer token ids of shape (samples, max_len), and the shapes and model
# summary recorded in this notebook were produced from 2-D inputs, so the
# reshape is dropped. A sanity check takes its place so that a top-to-bottom
# run fails loudly if the arrays ever stop being 2-D.
assert X_train_padded.ndim == 2, (
    f"expected 2-D (samples, max_len) training input, got shape {X_train_padded.shape}"
)
assert X_test_padded.ndim == 2, (
    f"expected 2-D (samples, max_len) test input, got shape {X_test_padded.shape}"
)

In [46]:
# Turn the label Series into plain NumPy arrays before handing them to Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

In [47]:
# Seed TensorFlow's global random generator so weight initialisation and
# batch shuffling start from the same state on every run of this cell.
# NOTE(review): some GPU kernels can still introduce small nondeterminism.
SEED = 42
tf.random.set_seed(SEED)

# Take the embedding vocabulary size from the fitted tokenizer instead of
# repeating the number, so it cannot drift from the tokenizer's `num_words`
# cap. Without a cap, every index in word_index (plus padding id 0) is needed.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# Text CNN: token ids -> 50-d embeddings -> 64 filters over 3-token windows
# -> max over time -> small dense head -> softmax over 6 outputs.
# The input shape is declared with an explicit Input instead of the
# deprecated `input_length` argument of Embedding; the model is still built
# immediately, so model.summary() works before training.
# Assumes the labels are integer class ids 0-5 (one per output unit) — TODO confirm.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(vocab_size, 50),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dense(6, activation='softmax'),
])

# The sparse categorical loss takes integer labels directly (no one-hot encoding).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [48]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 79, 50)            2500000   
                                                                 
 conv1d_5 (Conv1D)           (None, 77, 64)            9664      
                                                                 
 global_max_pooling1d_5 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_9 (Dense)             (None, 32)                2080      
                                                                 
 dense_10 (Dense)            (None, 6)                 198       
                                                                 
Total params: 2,511,942
Trainable params: 2,511,942
Non-trainable params: 0
____________________________________________

In [49]:
# Train for 10 epochs in mini-batches of 32. The held-out test split is
# passed as validation data, so its loss and accuracy are reported after
# every epoch.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e382cbfd90>

In [50]:
# Persist the trained model to one HDF5 file (architecture, weights, and
# optimizer state), then confirm on stdout.
model.save(filepath='my_text_cnn_model.h5')
print("Model saved as 'my_text_cnn_model.h5'")

Model saved as 'my_text_cnn_model.h5'


In [51]:
import pickle

# Serialise the fitted tokenizer next to the model so the same word index can
# be reused when encoding new text.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print("Tokenizer saved as 'tokenizer.pkl'")

Tokenizer saved as 'tokenizer.pkl'
