In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from keras.callbacks import EarlyStopping

In [None]:
import os
# Print the entries of the current working directory
# (presumably to confirm the dataset CSV is present — verify this is still needed).
print(os.listdir())
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Input,BatchNormalization
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the raw tweet dump. Every column is renamed right after loading, i.e. the
# file carries no meaningful header row, so header=None is required: without it
# pandas would consume the first tweet as column names and silently drop it.
df=pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='latin-1',header=None)

In [None]:
# Name the six columns. Only 'target' and 'txt' are kept for modelling;
# 'id', 'x', 'y' and 'z' are dropped from the working sample.
df.columns=['target','id','x','y','z','txt']

In [None]:
# Draw a fixed-size working subset. The draw is seeded so that the subset, and
# everything derived from it (embeddings, split, scores), is repeatable after a
# kernel restart; the seed matches the one used for the train/test split.
sample=df.sample(n=100000,random_state=42)
# Keep only the label and the tweet text.
sample=sample.drop(columns=['id','x','y','z'])

In [None]:
def clean(x):
  """Normalise one tweet for embedding.

  Steps, in order: lower-case; strip URLs (``http...``), ``@mentions`` and
  ``#`` signs; drop every character that is not a-z, whitespace or an
  apostrophe; squeeze any character repeated three or more times down to two
  (``sooooo`` -> ``soo``); collapse whitespace runs to a single space; trim.

  Parameters
  ----------
  x : object
      Raw tweet text. Anything that is not a ``str`` (e.g. NaN) is treated
      as missing.

  Returns
  -------
  str
      The cleaned text, or ``''`` when ``x`` is not a string or nothing
      survives the cleaning.
  """
  if not isinstance(x,str):
      return ''
  x=x.lower()
  x=re.sub(r'http\S+','',x)
  x=re.sub(r'@\w+','',x)
  x=re.sub(r'#','',x)
  x=re.sub(r'[^a-z\s\']','',x)
  # Squeeze elongated characters. The quantifier must sit directly on the
  # back-reference; a comma between them makes the pattern look for literal
  # commas, which have already been stripped, so it would never match.
  x=re.sub(r'(.)\1{2,}',r'\1\1',x)
  # Collapse whitespace last: removing URLs, mentions, digits and punctuation
  # above can leave several consecutive spaces behind.
  x=re.sub(r'\s+',' ',x)
  return x.strip()

In [None]:
def minimize(x):
  """Collapse a numeric label to binary: values below 2 give 0, everything else gives 1."""
  return 0 if x<2 else 1

In [None]:
# Binarise the labels from the untouched source frame rather than from
# sample['target'] itself. Re-running the cell on already-binarised 0/1 labels
# would send every row to 0 (both values are < 2); reading from df keeps the
# cell idempotent. Assignment aligns on the row index shared with df.
sample['target']=df.loc[sample.index,'target'].apply(minimize)


In [None]:
# Bar chart of the number of rows per label value.
class_counts=sample['target'].value_counts()
plt.bar(class_counts.index,class_counts)
plt.show()

In [None]:
# Clean every tweet, then discard rows whose text came out empty, rows that
# repeat an already-seen text (first occurrence wins), and rows with no text.
sample['txt']=sample['txt'].apply(clean)
sample=(
    sample.loc[sample['txt'].ne('')]
          .drop_duplicates(subset=['txt'])
          .dropna(subset=['txt'])
)

In [None]:
# Spot-check ten randomly chosen cleaned tweets (unseeded, so different on each run)
# and report how many rows/columns remain after filtering.
print(sample['txt'].sample(10).tolist())
print(sample.shape)

In [None]:
# Instantiate the sentence-embedding model identified by 'all-mpnet-base-v2'
# (presumably downloaded on first use — confirm network/cache availability).
st=SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Cleaned tweets as a plain Python list, in the frame's current row order.
txt=sample['txt'].tolist()

In [None]:
# Encode all tweets with the sentence-transformer model, 64 texts per batch,
# showing a progress bar while it runs.
embeddings=st.encode(
    txt,
    batch_size=64,
    show_progress_bar=True,
)

In [None]:
# Report the dimensions of the encoder output.
print(np.asarray(embeddings).shape)

In [None]:
# Hold out 20% of the rows for the final evaluation; the seed pins the split.
xtrain,xtest,ytrain,ytest=train_test_split(
    embeddings,
    sample['target'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Feed-forward classifier on top of the embeddings: three
# Dense(relu) -> BatchNormalization -> Dropout stages of width 512, 256 and 128
# (dropout 0.3, 0.3, 0.2), followed by a single sigmoid output unit.
# Layers are constructed in the same order as they appear in the network.
Bert_model=Sequential([
    Input(shape=(xtrain.shape[1],)),
    *[
        layer
        for units,rate in ((512,0.3),(256,0.3),(128,0.2))
        for layer in (Dense(units,activation='relu'),BatchNormalization(),Dropout(rate))
    ],
    Dense(1,activation='sigmoid'),
])

In [None]:
# Adam optimizer used when compiling the classifier.
# NOTE(review): despite its name, `lrn_rate` holds the optimizer object, not a scalar rate.
# NOTE(review): 2e-5 is a very small step size for a dense network trained from
# scratch with at most 10 epochs — confirm this value is intentional.
lrn_rate=Adam(learning_rate=2e-5)

In [None]:
# Binary classification set-up: cross-entropy loss, accuracy as the tracked metric.
Bert_model.compile(
    loss='binary_crossentropy',
    optimizer=lrn_rate,
    metrics=['accuracy'],
)

In [None]:
# Stop training once validation loss has failed to improve for 2 epochs in a
# row, and roll the model back to the best weights seen.
early=EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=2,
)

In [None]:
# Train for up to 10 epochs in mini-batches of 32, holding back 10% of the
# training rows for validation (this is what the early-stopping callback monitors).
history1=Bert_model.fit(
    x=xtrain,
    y=ytrain,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    callbacks=[early],
)

In [None]:
# Overlay the per-epoch training/validation accuracy and loss curves on one axes.
for metric_key,legend_label in (
    ('accuracy','train'),
    ('val_accuracy','val accuracy'),
    ('loss','loss'),
    ('val_loss','val loss'),
):
    plt.plot(history1.history[metric_key],label=legend_label)
plt.legend()
plt.show()

In [None]:
# Raw model outputs for the held-out rows; the network ends in a sigmoid unit,
# so these are scores between 0 and 1 that still need thresholding.
temp=Bert_model.predict(xtest)

In [None]:
# Threshold the sigmoid scores at 0.5: 1 for scores >= 0.5, otherwise 0.
ypredict=np.where(temp>=0.5,1,0)

In [None]:
# Fraction of held-out rows whose thresholded prediction matches the label.
ac=accuracy_score(y_true=ytest,y_pred=ypredict)
print(ac)