In [2]:
from transformers import BertTokenizer
from transformers import AutoTokenizer 
from transformers import TFBertModel 
import tensorflow as tf
import pandas as pd 
import matplotlib.pyplot as plt 
from transformers import TFBertForSequenceClassification
import seaborn as sns
from sklearn.model_selection import train_test_split
import time
import numpy as np
import warnings  
warnings.filterwarnings('ignore')   

In [3]:
# Read the question-pair CSV into a DataFrame and preview its first rows.
questions_csv_path = r"C:\Users\ASUS\Downloads\archive\questions.csv"
df = pd.read_csv(questions_csv_path)
df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [4]:
# Count the rows in each `is_duplicate` class to see how imbalanced the labels are.
x=df['is_duplicate'].value_counts()
x

is_duplicate
0    255045
1    149306
Name: count, dtype: int64

In [5]:
# Size of the smaller class; used below as the per-class sample size when balancing.
downsample = x.min() 
downsample

149306

In [6]:
# Balance the classes by drawing `downsample` rows from each `is_duplicate` group.
# A fixed random_state makes the balanced frame (and everything derived from it)
# reproducible across kernel restarts; the original draw was unseeded.
df1 = (
    df.groupby(by='is_duplicate')
    .sample(n=int(downsample), random_state=42)
    .reset_index(drop=True)
)
df1.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,246714,485841,485842,What is the best herbal remedy for depression ...,What are great herbal remedies?,0
1,122655,243054,243055,How many Candidates applied for IBPS PO 2016?,Should I prepare for IBPS PO 2016?,0
2,162996,322241,322242,Can one self-induce a coma?,How can I safely induce a coma upon myself?,0


In [7]:
# Column dtypes and non-null counts of the balanced frame.
df1.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298612 entries, 0 to 298611
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            298612 non-null  int64 
 1   qid1          298612 non-null  int64 
 2   qid2          298612 non-null  int64 
 3   question1     298611 non-null  object
 4   question2     298610 non-null  object
 5   is_duplicate  298612 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.7+ MB


In [8]:
# Number of missing values per column.
df1.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [9]:
# Discard every row that has a missing value in any column, then check the new size.
df1 = df1.dropna(axis=0, how='any')
df1.shape

(298609, 6)

In [10]:
# Draw a 60,000-row subsample that keeps the class proportions of df1;
# the remainder returned by train_test_split is discarded.
df_sampled, _ = train_test_split(
    df1,
    train_size=60000,
    stratify=df1['is_duplicate'],
    random_state=42,
)
df_sampled.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
161576,224892,443249,443250,What should we do when we have a headache?,What do you do when you have a headache?,1
175458,392212,766577,766578,Which is the best laptop to buy under INR 45K?,Which is the best laptop to buy under 45k?,1
196723,16386,32704,32705,What is the best idea to start a business?,Which is the best idea to start the business?,1


In [11]:
# Replace the index inherited from df1 with a fresh 0..n-1 range (old index is dropped).
df_sampled=df_sampled.reset_index(drop=True) 

In [12]:
# Label vector as a NumPy array.
y = df_sampled['is_duplicate'].to_numpy()

In [13]:
# Pair up question1/question2 row by row as a list of (question1, question2) tuples
question_columns = df_sampled[['question1', 'question2']]
combined_questions = list(question_columns.itertuples(index=False, name=None))

In [14]:
# Preview the first two question pairs.
combined_questions[:2]

[('What should we do when we have a headache?',
  'What do you do when you have a headache?'),
 ('Which is the best laptop to buy under INR 45K?',
  'Which is the best laptop to buy under 45k?')]

In [15]:
# Hold out 20% of the question pairs (and their labels) for validation
X_train, X_val, y_train, y_val = train_test_split(
    combined_questions,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Preview the first two training pairs.
X_train[:2]

[('Is giveawaylisting.com legit?', 'Is simplyeyeglasses.com legit?'),
 ('Should I be worried about the NWO/One World Government?',
  'My father had 45 and no job, how Can I help him?')]

In [17]:
# Sanity check: number of training pairs and the two questions per pair.
np.array(X_train).shape

(48000, 2)

In [18]:
# Labels that go with the two training pairs previewed above.
y_train[:2]

array([0, 0], dtype=int64)

In [19]:
# Tokenization using cached tokenizer (optional)
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [20]:
def tokenize_pair(question1, question2):
    """Encode a question pair with the notebook-level `tokenizer`.

    Both questions are passed to the tokenizer together, padded/truncated to
    125 tokens, and returned as TensorFlow tensors.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)``.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=125,
        return_tensors='tf',
    )
    encoded = tokenizer(question1, question2, **encoding_options)
    return tuple(
        encoded[field]
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    )

In [21]:
# Four consecutive training pairs (positions 1 through 4) used for manual inspection.
batch_X = X_train[1:5]

In [22]:
# Display the inspection batch.
batch_X

[('Should I be worried about the NWO/One World Government?',
  'My father had 45 and no job, how Can I help him?'),
 ('When a question is marked as "needing improvement" is it visible to others still and can I effectively fix it?',
  'Why are some of my questions being marked as "Needs Improvement" no matter what I do to improve it?'),
 ('What are the best places to visit in Wayanad, Kerala?',
  'What are the best places to visit on a 3 day trip in and around kerala?'),
 ("What's your idea about islam?",
  'What is your idea and thoughts about Islam and Muslims?')]

In [23]:
# Print only the first question of the first pair in the batch.
for first_question, _second_question in batch_X:
    print(first_question)
    break

Should I be worried about the NWO/One World Government?


In [24]:
# Print only the second question of the first pair in the batch.
for _first_question, second_question in batch_X:
    print(second_question)
    break

My father had 45 and no job, how Can I help him?


In [25]:
def create_tf_dataset(X, y, batch_size=16, shuffle=True, shuffle_buffer=10000):
    """Build a batched `tf.data.Dataset` of tokenized question pairs and labels.

    Args:
        X: Iterable of ``(question1, question2)`` string pairs.
        y: Labels aligned with ``X``; converted to an int32 tensor.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples before batching. Defaults to True
            (the original behaviour); pass False for evaluation data.
        shuffle_buffer: Buffer size handed to ``Dataset.shuffle``.

    Returns:
        A dataset yielding ``(features, labels)`` where ``features`` is a dict
        with keys ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    Raises:
        ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
    """
    pairs = list(X)
    if not pairs:
        raise ValueError("X must contain at least one question pair")
    if len(pairs) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(pairs)} and {len(y)}"
        )

    first_questions = [q1 for q1, _ in pairs]
    second_questions = [q2 for _, q2 in pairs]

    # Tokenize everything in one batched call: tokenize_pair forwards its
    # arguments straight to the tokenizer, which also accepts lists of strings.
    # This replaces one tokenizer call and three tiny tensors per pair, followed
    # by a concat, with a single call returning (num_pairs, max_length) tensors.
    input_ids, attention_masks, token_type_ids = tokenize_pair(
        first_questions, second_questions
    )

    labels = tf.convert_to_tensor(y, dtype=tf.int32)

    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': input_ids, 'attention_mask': attention_masks, 'token_type_ids': token_type_ids},
        labels
    ))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [26]:
# Create datasets: tokenize the train split first, then the validation split,
# both with a batch size of 8.
train_dataset, val_dataset = (
    create_tf_dataset(pairs, labels, batch_size=8)
    for pairs, labels in ((X_train, y_train), (X_val, y_val))
)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [27]:
# Show the dataset's element spec (feature names, shapes and dtypes).
train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 125), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 125), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 125), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [28]:
# Pull a single batch from the training pipeline and print it for inspection.
for sample in train_dataset.take(1):
    print(sample) 

({'input_ids': <tf.Tensor: shape=(8, 125), dtype=int32, numpy=
array([[  101,  2339,  2024,  5264,  6962, 15818,  1029,   102,  2024,
         5264,  6962, 15818,  1029,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [29]:
# Pull a single batch from the validation pipeline and print it for inspection.
for sample in val_dataset.take(1):
    print(sample) 

({'input_ids': <tf.Tensor: shape=(8, 125), dtype=int32, numpy=
array([[  101,  2029,  2338,  2003,  2488,  2005,  4083,  4563,  9262,
         1029,   102,  2029,  2003,  2190,  2338,  2005,  9262,  1029,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [30]:
# Load pre-trained BERT model with a single-output classification head
bert_checkpoint = 'bert-base-uncased'
bert_model = TFBertForSequenceClassification.from_pretrained(
    bert_checkpoint,
    num_labels=1,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Freeze initial layers (optional): the first 10 encoder blocks stop receiving
# weight updates; the remaining blocks stay trainable.
frozen_block_count = 10
for encoder_block in bert_model.bert.encoder.layer[:frozen_block_count]:
    encoder_block.trainable = False

In [32]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [33]:
# Adam with gradient-norm clipping for fine-tuning.
optimizer = Adam(
    learning_rate=1e-4,
    epsilon=1e-8,
    clipnorm=1.0,
)

In [34]:
# from_logits=True: the loss expects the model's raw (pre-sigmoid) outputs.
loss_fn = BinaryCrossentropy(from_logits=True) 

In [35]:
# Compile the model
# The loss is configured with from_logits=True, so the model's predictions are
# raw logits rather than probabilities. The metrics therefore have to interpret
# logits as well: the string 'accuracy' and the default Precision()/Recall()
# compare predictions against 0.5, which is only meaningful for probabilities.
class _LogitPrecision(Precision):
    """Precision that applies a sigmoid to logits before thresholding at 0.5."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(
            y_true, tf.sigmoid(y_pred), sample_weight=sample_weight
        )


class _LogitRecall(Recall):
    """Recall that applies a sigmoid to logits before thresholding at 0.5."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(
            y_true, tf.sigmoid(y_pred), sample_weight=sample_weight
        )


bert_model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=[
        # A logit of 0 corresponds to a probability of 0.5.
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
        _LogitPrecision(name='precision'),
        _LogitRecall(name='recall'),
    ],
)

In [36]:
# Layer overview with trainable vs. non-trainable parameter counts.
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 109,483,009
Trainable params: 38,604,289
Non-trainable params: 70,878,720
_________________________________________________________________


In [37]:
# Callbacks, both driven by the validation loss:
# stop after 3 epochs without improvement ...
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
)
# ... and cut the learning rate to a tenth after 2 stagnant epochs (floor of 1e-6).
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

In [38]:
# Train the model
bert_fit_options = dict(
    validation_data=val_dataset,
    epochs=1,  # Adjust based on your needs
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)
history = bert_model.fit(train_dataset, **bert_fit_options)



In [47]:
# Export the fine-tuned model in TensorFlow SavedModel format.
bert_export_dir = r'C:\Users\ASUS\Downloads\bert_model_main_epoch1'
bert_model.save(bert_export_dir, save_format='tf')



INFO:tensorflow:Assets written to: C:\Users\ASUS\Downloads\bert_model_main_epoch1\assets


INFO:tensorflow:Assets written to: C:\Users\ASUS\Downloads\bert_model_main_epoch1\assets


In [17]:
import numpy as np
import tensorflow as tf
# Imported here so the cell runs on a fresh kernel; previously the name was only
# imported by a cell placed further down in the notebook.
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Combine the two columns into a list of tuples
combined_questions = list(zip(df_sampled['question1'], df_sampled['question2']))
y = df_sampled['is_duplicate'].values

# Load SBERT model
# Kept under its own name: `model` is reserved for the Keras classifier below, so
# encode_pairs keeps working after this cell has run.
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: Numpy is not available". That looks like an environment problem
# (installed PyTorch and NumPy builds not matching) rather than a code problem.
# TODO confirm.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode question pairs
def encode_pairs(pairs, encoder=None, batch_size=64):
    """Return one feature vector per pair: embedding(q1) - embedding(q2).

    Args:
        pairs: Iterable of ``(question1, question2)`` strings.
        encoder: Object with a SentenceTransformer-style ``encode`` method;
            defaults to the notebook-level ``sbert_model``.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A NumPy array with one row per pair.
    """
    encoder = sbert_model if encoder is None else encoder
    pairs = list(pairs)
    first_questions = [q1 for q1, _ in pairs]
    second_questions = [q2 for _, q2 in pairs]
    # Encode each side in batches instead of calling encode() twice per pair.
    first_embeddings = encoder.encode(
        first_questions, batch_size=batch_size, convert_to_numpy=True
    )
    second_embeddings = encoder.encode(
        second_questions, batch_size=batch_size, convert_to_numpy=True
    )
    return np.asarray(first_embeddings) - np.asarray(second_embeddings)

# Encode all pairs
X = encode_pairs(combined_questions)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TensorFlow Dataset
# Note: this redefines the tokenizing create_tf_dataset from the BERT section;
# from here on the name refers to this embedding-based version.
def create_tf_dataset(X, y, batch_size=32, shuffle=True):
    """Wrap feature rows and labels in a batched, prefetched `tf.data.Dataset`.

    Args:
        X: Array of feature vectors, one row per example.
        y: Labels aligned with ``X``.
        batch_size: Number of examples per batch.
        shuffle: Shuffle with a 10,000-element buffer before batching
            (default True, the original behaviour).
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Create datasets (validation order does not need to be randomized)
train_dataset = create_tf_dataset(X_train, y_train)
val_dataset = create_tf_dataset(X_val, y_val, shuffle=False)

# Define a simple neural network
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-6, verbose=1)

RuntimeError: Numpy is not available

In [None]:
# Train the model
baseline_fit_options = dict(
    validation_data=val_dataset,
    epochs=5,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)
history = model.fit(train_dataset, **baseline_fit_options)

In [12]:
from sentence_transformers import SentenceTransformer

In [16]:
import numpy as np