In [None]:
#!pip install -q transformers
#!pip install -q neattext

In [None]:
#!pip install -q protobuf==3.19.3

In [None]:
import os
import pandas as pd
import transformers
import numpy as np
import neattext as nt
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, TFBertForSequenceClassification, BertConfig
import tensorflow as tf


#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#path = r"/content/drive/MyDrive/fake_reviews"
#s=os.chdir(path)

In [None]:
# Read the reviews CSV and discard every row that has a missing value in any column
df = pd.read_csv('amazon_reviews.csv').dropna()
df.head(2)

### **Marking Review as Fake if its Length is 15 Characters or Less, Repeating, and Verified Purchase=0**

In [None]:
def _initial_fake_flag(row):
  """Return 1 when the review text has at most 15 characters or verified_purchase equals 0, else 0."""
  if len(str(row['review_body'])) <= 15:
    return 1
  if row['verified_purchase'] == 0:
    return 1
  return 0

# Initial heuristic label, computed row by row
df['fake_review'] = df.apply(_initial_fake_flag, axis=1)

### **Checking Customer Id in the Review**

In [None]:
def _mentions_customer_id(row):
  """Return 1 when the stringified customer_id occurs inside the stringified review text, else 0."""
  return int(str(row['customer_id']) in str(row['review_body']))

temp = df.apply(_mentions_customer_id, axis=1)
temp.value_counts()

### **Text Preprocessing**

In [None]:
def text_preprocessing(text):
  """
  Run the first cleaning pass over a review text and return the result.

  The neattext helpers are applied in this order: fix contractions (I'm -> I am),
  remove URLs, remove non-ASCII characters, remove numbers, collapse multiple spaces.
  """
  cleaning_steps = (
      nt.fix_contractions,
      nt.remove_urls,
      nt.remove_non_ascii,
      nt.remove_numbers,
      nt.remove_multiple_spaces,
  )
  for step in cleaning_steps:
    text = step(text)
  return text
  

In [None]:
# First cleaning pass: every review body is cast to str and run through text_preprocessing
df['cleaned_review_body'] = df['review_body'].map(lambda body: text_preprocessing(str(body)))

### **Sentiment Analysis**

In [None]:
from textblob import TextBlob   #special package for short sentence sentiment analysis

In [None]:
def sentiment_polarity(text):
  '''
  Return the TextBlob sentiment polarity score of the given text.
  '''
  return TextBlob(text).sentiment.polarity

In [None]:
# Polarity score of each cleaned review
df['sentiment_polarity'] = df['cleaned_review_body'].map(sentiment_polarity)

In [None]:
def sentiment_tag(polarity):
  '''
  Map a polarity score to a label: 'positive' when it is above zero,
  'negative' when it is below zero, and 'neutral' in every other case.
  '''
  if polarity > 0:
    return 'positive'
  return 'negative' if polarity < 0 else 'neutral'

In [None]:
# Label each review's polarity as positive / negative / neutral
df['sentiment_tag'] = df['sentiment_polarity'].map(sentiment_tag)

### **Checking Sentiment and Review Rating Clash**

In [None]:
#Marking Reviews as Fake if they have a 5 star rating but a negative sentiment

In [None]:
def _rating_sentiment_clash(row):
  """Return 1 when the row is tagged 'negative' yet carries a star_rating of 5, else 0."""
  return int(row['sentiment_tag'] == 'negative' and row['star_rating'] == 5)

temp = df.apply(_rating_sentiment_clash, axis=1)

In [None]:
# Show how many rows got each flag value in `temp`
temp.value_counts()

In [None]:
# Index labels of the rows tagged 'negative' that nevertheless have a star_rating of 5
clash_mask = df['sentiment_tag'].eq('negative') & df['star_rating'].eq(5)
indices = df.loc[clash_mask].index

In [None]:
# Set fake_review to 1 for every row whose index label is in `indices`
df.loc[indices,'fake_review']=1

In [None]:
# Class balance of the fake_review label after the update above
df['fake_review'].value_counts()

### **Is product ID mentioned in Review text**

In [None]:
# 1 where the review text contains the product's id, 0 otherwise.
# product_id is wrapped in str() (just like customer_id in the earlier check): the `in`
# operator on a str raises TypeError when its left operand is not a string, which would
# happen for any id that pandas parsed as a number.
temp = df.apply(lambda x: 1 if str(x['product_id']) in str(x['review_body']) else 0, axis=1)

In [None]:
# Show how many rows got each flag value in `temp`
temp.value_counts()

In [None]:
def postprocessing(text):
  """
  Run the second cleaning pass over a text and return the result.

  Applies, in order, neattext's remove_stopwords, remove_special_characters and
  remove_emojis, then remove_shortwords with a length argument of 3.
  """
  for cleaner in (nt.remove_stopwords, nt.remove_special_characters, nt.remove_emojis):
    text = cleaner(text)
  return nt.remove_shortwords(text, 3)

In [None]:
# Second cleaning pass, applied on top of the already cleaned text
df['cleaned_review_body'] = df['cleaned_review_body'].map(lambda body: postprocessing(str(body)))

In [None]:
# All rows currently labelled as fake
fake_df = df.loc[df["fake_review"].eq(1)]

In [None]:
# The first rows labelled as legit, capped at the number of fake rows
legit_df = df.loc[df["fake_review"].eq(0)].head(fake_df.shape[0])

In [None]:
# Stack the fake rows and the selected legit rows into one frame (fake rows first)
combine_df=pd.concat([fake_df,legit_df])

In [None]:
# Check the class balance of the combined frame
combine_df["fake_review"].value_counts()

In [None]:
# Preprocess the text using HuggingFace's BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Split the dataset into training and validation sets (80/20, fixed random_state)
train_df, val_df = train_test_split(combine_df, test_size=0.2, random_state=42)


def _tokenize_reviews(frame):
  """Tokenize the `review_body` column of `frame`, padded and truncated to at most 512 tokens, as TF tensors."""
  # Cast to str first: the tokenizer only accepts strings, and the rest of the notebook
  # likewise wraps review_body in str() before using it.
  texts = frame['review_body'].astype(str).to_list()
  return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='tf')


# Tokenize the datasets
# NOTE(review): the raw review_body is tokenized here, not cleaned_review_body - confirm this is intended.
train_encodings = _tokenize_reviews(train_df)
val_encodings = _tokenize_reviews(val_df)


In [None]:
# Prepare the TensorFlow Dataset
def _as_tf_dataset(encodings, labels):
  """Pair the tokenizer output (converted to a plain dict) with its labels in a tf.data.Dataset."""
  return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

train_labels = train_df['fake_review'].tolist()
val_labels = val_df['fake_review'].tolist()
train_dataset = _as_tf_dataset(train_encodings, train_labels)
val_dataset = _as_tf_dataset(val_encodings, val_labels)


In [None]:
# Load the pre-trained BERT model for sequence classification.
# The config is loaded from the checkpoint itself (with num_labels overridden) rather than
# built from BertConfig's constructor defaults, so it is guaranteed to match the weights.
configuration = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=configuration)


In [None]:
# Compile the model
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# from_logits=True: the loss is configured to receive logits
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=adam_optimizer,
    loss=sparse_ce_loss,
    metrics=['accuracy'],
)


In [None]:
# Train the model.
# Batching is done on the tf.data.Dataset itself (.batch(16)). Keras documents that
# batch_size must not be passed to fit() for dataset inputs, so it is not given here.
model.fit(train_dataset.batch(16), epochs=3, validation_data=val_dataset.batch(16))


In [None]:
# Evaluate the model on the validation split.
# The tokenizer output is converted to a plain dict first (as is done when the tf.data
# datasets are built), because Keras may not accept a BatchEncoding object as input.
val_predictions = model.predict(dict(val_encodings))
# Predicted class = index of the largest logit for each example
val_preds = np.argmax(val_predictions.logits, axis=-1)
accuracy = accuracy_score(val_labels, val_preds)
print("Accuracy:", accuracy)