<a href="https://colab.research.google.com/github/MadhuReddy001/Image-Captioning/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install tensorflow==2.8.0



In [None]:
!pip install imbalanced-learn

In [None]:
!pip install protobuf==3.20.*

In [3]:
import glob
import os
import pickle

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from nltk import word_tokenize
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer # This import should work now

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Download the two NLTK resource packages this notebook requests.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

In [None]:
# Read the CSV straight from the shared link and keep only rows that have a label.
# NOTE(review): header=1 makes the *second* line of the file the header row --
# confirm the first line of the CSV really is not the column names.
url = "https://www.dropbox.com/s/AAAw1_JIzpuVvwteJCma0xMla?dl=1"
df = (
    pd.read_csv(url, sep=',', encoding='latin-1', header=1)
    .dropna(subset=['target'])
)

In [None]:
# Features are the raw question strings; labels are the `target` column.
X, Y = df['question_text'], df['target']

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed (same value the
# RandomOverSampler cell uses) so a Restart & Run All reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Integer view of the training labels, computed once and reused below.
y_train_int = y_train.dropna().astype(int)

# 'balanced' weights are inversely proportional to each class's frequency in
# the training labels; the array is ordered like np.unique(y_train_int).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_int),
    y=y_train_int
)

In [None]:
# Map positional class index -> weight (index i is the i-th entry of class_weights).
class_weight_dict = {
    class_index: weight for class_index, weight in enumerate(class_weights)
}

In [None]:
def _lowercase_or_blank(text):
    """Return ``text`` lower-cased, or '' when it is not a string (e.g. NaN)."""
    return text.lower() if isinstance(text, str) else ''


# Normalise the questions. The value is checked *before* any string cast:
# casting first would turn a missing question into the literal token "nan"
# and make the non-string fallback unreachable.
x_train = x_train.apply(_lowercase_or_blank)
x_test = x_test.apply(_lowercase_or_blank)

# Word-level tokenizer; the vocabulary is fitted on the training split only.
tk = Tokenizer(char_level=False, split=' ')
tk.fit_on_texts(x_train)

seq_train = tk.texts_to_sequences(x_train)
seq_test = tk.texts_to_sequences(x_test)
# +1 because the tokenizer's word indices start at 1; row 0 is left for padding.
vocab_size = len(tk.word_index) + 1

# Every sequence is padded/truncated to a fixed length of max_len tokens.
max_len = 44
seq_train_matrix = sequence.pad_sequences(seq_train, maxlen=max_len)
seq_test_matrix = sequence.pad_sequences(seq_test, maxlen=max_len)

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use -- consider caching it.
word2vec_model = api.load("word2vec-google-news-300")

In [None]:
# One row per tokenizer index; rows stay all-zero for words the pretrained
# model does not contain (and for index 0, which no word maps to).
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
pretrained_vocab = word2vec_model.key_to_index
for token, row in tk.word_index.items():
    if token not in pretrained_vocab:
        continue
    embedding_matrix[row] = word2vec_model[token]

In [None]:
# Network: frozen pretrained embedding -> LSTM(512) -> three (Dropout, Dense-ReLU)
# stages of shrinking width -> Dropout -> single sigmoid unit.
inputs = Input(name='text_input', shape=[max_len])

hidden = Embedding(
    vocab_size,
    embedding_dim,
    input_length=max_len,
    mask_zero=True,              # index 0 is padding and is masked out
    weights=[embedding_matrix],
    trainable=False,             # keep the pretrained vectors fixed
)(inputs)
hidden = LSTM(512)(hidden)

for units in (256, 128, 25):
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(units, activation='relu')(hidden)

hidden = Dropout(0.2)(hidden)
output = Dense(1, activation='sigmoid')(hidden)

In [None]:
# Assemble and compile the binary classifier.
model = Model(inputs=inputs, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only training rows whose label is present (features and labels stay aligned).
has_label = y_train.notna()
seq_train_matrix = seq_train_matrix[has_label]
y_train = y_train[has_label]

# Randomly duplicate rows until the classes are balanced in the training data.
ros = RandomOverSampler(random_state=42)
seq_train_matrix_resampled, y_train_resampled = ros.fit_resample(
    seq_train_matrix, y_train
)

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(patience=3, monitor='val_loss', verbose=1)

# Write a checkpoint only when val_loss improves; the epoch number and the
# val_loss value are embedded in the file name.
checkpoint = ModelCheckpoint(
    filepath='/content/weights-{epoch:02d}-{val_loss:.4f}.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# NOTE(review): validation uses the held-out test split (seq_test_matrix, y_test).
training_callbacks = [earlystop, checkpoint]
model.fit(
    x=seq_train_matrix_resampled,
    y=y_train_resampled,
    validation_data=(seq_test_matrix, y_test),
    batch_size=1000,
    epochs=15,
    callbacks=training_callbacks,
)

In [None]:
# Checkpoint file names embed the epoch and val_loss of the run that produced
# them, so a hard-coded name only works for one specific training run.
# Discover the checkpoints on disk instead.
checkpoint_paths = glob.glob('/content/weights-*.keras')
if not checkpoint_paths:
    raise FileNotFoundError(
        "No checkpoint matching '/content/weights-*.keras' was found; "
        "run the training cell first."
    )

# Checkpoints are written with save_best_only=True, i.e. only when val_loss
# improves, so the most recently written file is the best model of the latest run.
best_checkpoint_path = max(checkpoint_paths, key=os.path.getmtime)
print("Loading checkpoint:", best_checkpoint_path)

best_model = load_model(best_checkpoint_path)
best_model_weights = best_model.get_weights()

In [None]:
# Rebuild the training architecture layer by layer so the checkpoint weights
# can be loaded into a fresh graph.
inputs = Input(name='text_input', shape=[max_len])
layer_stack = [
    Embedding(vocab_size, embedding_dim, input_length=max_len, mask_zero=True,
              weights=[embedding_matrix], trainable=False),
    LSTM(512),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]

# Thread the input tensor through every layer in order.
output = inputs
for layer in layer_stack:
    output = layer(output)

recreated_model = Model(inputs=inputs, outputs=output)
recreated_model.set_weights(best_model_weights)

# Score the held-out split with the restored weights.
p = recreated_model.predict(seq_test_matrix)
test_auc = roc_auc_score(y_test, p)
print("ROC AUC Score:", test_auc)

In [None]:
# Persist the fitted tokenizer so inference can reproduce the word -> index mapping.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tk, f)

# Save the best checkpointed model rather than `model`: EarlyStopping is not
# configured to restore the best weights, so `model` holds the weights of the
# last epoch run, whereas `best_model` is the checkpoint whose weights were
# used for the ROC AUC evaluation.
best_model.save('text_classification_model.keras')

In [None]:
# Smoke-test the restored model on one hand-written question, applying the
# same lower-case -> tokenize -> pad steps as the training data.
example_question = "What is the capital of France?".lower()

example_sequence = tk.texts_to_sequences([example_question])
example_padded = sequence.pad_sequences(example_sequence, maxlen=max_len)

prediction = recreated_model.predict(example_padded)

# A sigmoid output above 0.5 is reported as the positive class.
if prediction[0][0] > 0.5:
    example_label = "Positive"
else:
    example_label = "Negative"

print("Prediction:", prediction)
print("Class:", example_label)


In [None]:
def predict_question(model, tokenizer, question, max_len):
    """Classify a single question with a trained model.

    Args:
        model: Object exposing ``predict(batch)``; the value at ``[0][0]`` of
            its result is read as the positive-class probability.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        question: Raw question text.
        max_len: Length every token sequence is padded/truncated to.

    Returns:
        A ``(probability, label)`` tuple where ``label`` is ``"Positive"`` when
        ``probability > 0.5`` and ``"Negative"`` otherwise.
    """
    question = question.lower()

    # Do not call this local `sequence`: that would shadow the imported Keras
    # `sequence` module and make `sequence.pad_sequences` fail on a list.
    token_ids = tokenizer.texts_to_sequences([question])
    padded_sequence = sequence.pad_sequences(token_ids, maxlen=max_len)

    prediction = model.predict(padded_sequence)[0][0]

    return prediction, "Positive" if prediction > 0.5 else "Negative"

# Run the helper on a sample question and report its score and label.
example_question = "What is the capital of France?"
probability, predicted_class = predict_question(
    model=recreated_model,
    tokenizer=tk,
    question=example_question,
    max_len=max_len,
)

print(f"Prediction: {probability}")
print(f"Class: {predicted_class}")