In [None]:
!pip install gensim

In [None]:
! pip install tensorflow

In [1]:
import nltk
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from gensim.models import KeyedVectors


# **Split the data into Traine - Test set**

In [2]:
# 1. Load the Text File
file_path = 'data.csv'
with open(file_path, 'r', encoding='utf-8') as file:
    news = pd.read_csv(file_path)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features and the target variable
X = news['title']
y = news['label']               

# Split the data into training and testing sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# **1. Preprocess Your Data**

**Tokenization & Padding:**

In [25]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# 1. a. Tokenize the train text
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_train)
sequences_train = tokenizer.texts_to_sequences(x_train)

# 1. b. Tokenize the train text
sequences_test = tokenizer.texts_to_sequences(x_test)

# 2. a. Compute the average train sequence length  
train_sequence_lengths = [len(seq) for seq in sequences_train]
avg_length_train = int(np.mean(train_sequence_lengths))
print("Average sequence length:", avg_length_train)

# 2. b. Compute the average test sequence length
test_sequence_lengths = [len(seq) for seq in sequences_test]
avg_length_test = int(np.mean(test_sequence_lengths))
print("Average sequence length:", avg_length_test)

# 3. Use the average length as the max_sequence_length for padding/truncation
max_sequence_length = avg_length_train
x_train = pad_sequences(sequences_train, maxlen=max_sequence_length)

max_sequence_length = avg_length_test
x_test = pad_sequences(sequences_test, maxlen=max_sequence_length)
word_index = tokenizer.word_index


Average sequence length: 392
Average sequence length: 389


**load word2vec modle**

In [5]:
path = r"../models/GoogleNews-vectors-negative300.bin.gz"
word2vec_model = KeyedVectors.load_word2vec_format(path, binary=True)

# **2. Prepare the Embedding Layer**

**Embedding Matrix from Pre-trained Word2Vec:**


In [26]:
import numpy as np

embedding_dim = 300  # For example, if you're using Google News word2vec
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]


**Create an Embedding layer in Keras**

In [27]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False  # Use False to keep the embeddings fixed
)




# **3. Build the Kim-CNN Architecture**

**Architecture Components:**

**1. Input Layer:** Accepts the padded sequences.

**2. Embedding Layer:** Converts word indices to word vectors.

**3. Convolutional Layers:** Apply several 1D convolution filters with different kernel sizes (e.g., 3, 4, 5) to capture various n-gram features.

**4. Global Max-Pooling:** For each filter, apply max pooling over the time dimension (i.e., across the sentence length) to capture the most significant feature.

**5. Concatenation:** Merge the outputs of the different filters.

**6. Dropout:** Apply dropout for regularization.

**7. Dense Layer:** Final classification layer with a softmax (or sigmoid) activation for prediction.

In [28]:
from keras.models import Model
from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense

# Define hyperparameters
filter_sizes = [3, 4, 5]   # Different filter sizes for n-grams
num_filters = 128          # Number of filters per size
dropout_rate = 0.5
num_classes = 2            # Adjust based on your classification task

# Input layer
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')

# Embedding layer (using the pre-trained word2vec embeddings)
embedded_sequences = embedding_layer(sequence_input)

# Create a convolution + pooling layer for each filter size
conv_layers = []
for filter_size in filter_sizes:
    conv = Conv1D(
        filters=num_filters,
        kernel_size=filter_size,
        activation='relu'
    )(embedded_sequences)
    
    pool = GlobalMaxPooling1D()(conv)
    conv_layers.append(pool)

# Concatenate the pooled features from each filter
if len(conv_layers) > 1:
    merged = Concatenate()(conv_layers)
else:
    merged = conv_layers[0]

# Apply dropout for regularization
drop = Dropout(dropout_rate)(merged)

# Final dense layer for classification
preds = Dense(num_classes, activation='softmax')(drop)

# Define the model
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [30]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(31953, 392) (31953, 2, 2)
(7989, 389) (7989, 2, 2)


In [None]:
import pickle
with open('tokenizer_text.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

**Training and Evaluating the Model**

Convert your labels to categorical format and train the model.

In [31]:
from tensorflow.keras.utils import to_categorical

# Convert binary labels (0 or 1) to one-hot encoded vectors (shape becomes (num_samples, 2))
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [33]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# ModelCheckpoint to save the best model based on validation loss
checkpoint = ModelCheckpoint(
    filepath='kim_model.h5',      # Filepath where the model will be saved
    monitor='val_loss',            # Metric to monitor
    save_best_only=True,           # Only save the model if val_loss improves
    verbose=1                     # Print messages when the model is saved
)

# Train the model with the callbacks
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, checkpoint]
)

Epoch 1/10


ValueError: Input 0 of layer "functional_2" is incompatible with the layer: expected shape=(None, 389), found shape=(None, 392)

In [None]:
loss, accuracy = model.evaluate(x_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Generate predicted probabilities
y_pred_prob = model.predict(x_test)
# Convert predicted probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)
# Convert one-hot encoded y_test back to class labels
y_true = np.argmax(y_test, axis=1)

# Print classification report and confusion matrix
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
