---
# **Installing Dependencies**
---

In [None]:
!pip install -q neattext

---
# **Importing Necessary Dependencies**
---

In [9]:
import pandas as pd                   #package for data analysis
import numpy as np                    #package for fast data manipulations
import neattext as nt                 #package for text cleaning

In [10]:
# Load the Amazon review dataset from the working directory.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column, which looks
# like a previously saved index — confirm before relying on or dropping it.
df=pd.read_csv('amazon_reviews.csv')

In [11]:
# List the column names of the loaded dataset.
df.columns

Index(['Unnamed: 0', 'marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

### **Marking a Review as Fake if its Length is 15 Characters or Less, Repeating, or Verified Purchase = 0**

In [12]:
# Initial fake-review flag: 1 when the review body is very short OR the
# purchase is unverified, 0 otherwise.
MAX_SUSPICIOUS_LENGTH = 15  # bodies with this many characters or fewer are flagged

# len(str(...)) is kept on purpose: a missing (NaN) body stringifies to 'nan'
# (3 characters) and is therefore flagged as short.
body_length = df['review_body'].map(lambda body: len(str(body)))

# NOTE(review): assumes verified_purchase is encoded as 0/1 (not 'Y'/'N') — confirm.
is_unverified = df['verified_purchase'] == 0

# Column-wise comparison instead of a row-wise DataFrame.apply: same flags,
# without building a Series object for every row.
df['fake_review'] = ((body_length <= MAX_SUSPICIOUS_LENGTH) | is_unverified).astype(int)

In [13]:
# Distribution of the initial fake (1) vs. legit (0) flags.
df['fake_review'].value_counts()

0    100322
1     29678
Name: fake_review, dtype: int64

### **Checking Customer Id in the Review**

In [14]:
# Per-row flag: 1 when the customer_id (as text) occurs inside the review
# body (as text), 0 otherwise.
temp = pd.Series(
    [
        int(str(customer_id) in str(body))
        for customer_id, body in zip(df['customer_id'], df['review_body'])
    ],
    index=df.index,
    dtype='int64',
)

In [15]:
temp.value_counts()  # no review contains its customer_id

0    130000
dtype: int64

### **Text Preprocessing**

In [16]:
def text_preprocessing(text):
    """Clean a piece of text with a fixed sequence of neattext helpers.

    The helpers run in this order: expand contractions, remove URLs, remove
    non-ASCII characters, remove user handles, remove hashtags, and finally
    remove multiple spaces.

    Args:
        text: The raw text to clean.

    Returns:
        The text after every cleaning step has been applied.
    """
    cleaning_steps = (
        nt.fix_contractions,        # I'm -> I am
        nt.remove_urls,
        nt.remove_non_ascii,
        nt.remove_userhandles,
        nt.remove_hashtags,
        nt.remove_multiple_spaces,  # runs last
    )
    for step in cleaning_steps:
        text = step(text)
    return text
  

In [17]:
# Cast every review body to str, then run it through text_preprocessing.
df['cleaned_review_body'] = df['review_body'].map(str).map(text_preprocessing)

### **Sentiment Analysis**

In [18]:
from textblob import TextBlob   #special package for short sentence sentiment analysis

In [19]:
def sentiment_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The text to score.

    Returns:
        The ``sentiment.polarity`` value TextBlob reports for the text.
    """
    return TextBlob(text).sentiment.polarity

In [20]:
# Score every cleaned review with sentiment_polarity.
df['sentiment_polarity'] = df['cleaned_review_body'].map(sentiment_polarity)

In [21]:
def sentiment_tag(polarity):
    """Map a polarity score to a sentiment label.

    Args:
        polarity: Numeric polarity score.

    Returns:
        'positive' for a score above zero, 'negative' for a score below
        zero, and 'neutral' for anything else.
    """
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [22]:
# Convert each polarity score into its sentiment label.
df['sentiment_tag'] = df['sentiment_polarity'].map(sentiment_tag)

### **Checking Sentiment and Review Rating Clash**

In [23]:
#Marking a review as fake if it has a 5-star rating but negative sentiment

In [24]:
# 1 where a 5-star review was tagged with negative sentiment, 0 elsewhere.
# Uses a column-wise boolean mask instead of a row-wise DataFrame.apply.
temp = ((df['sentiment_tag'] == 'negative') & (df['star_rating'] == 5)).astype(int)

In [25]:
# Number of 5-star reviews with negative sentiment (1) vs. all other reviews (0).
temp.value_counts()

0    126739
1      3261
dtype: int64

In [26]:
# Index labels of the 5-star reviews whose text was tagged as negative.
rating_sentiment_clash = (df['star_rating'] == 5) & (df['sentiment_tag'] == 'negative')
indices = df.loc[rating_sentiment_clash].index

In [27]:
# Flag every row selected in `indices` as fake.
df.loc[indices,'fake_review']=1

In [28]:
# Fake (1) vs. legit (0) counts after flagging the rating/sentiment clashes.
df['fake_review'].value_counts()

0    97511
1    32489
Name: fake_review, dtype: int64

In [29]:
# Persist the labelled dataset to the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed leading column — pass index=False if no consumer needs it.
df.to_csv('fake_reviews.csv')

### **Is the Product ID Mentioned in the Review Text?**

In [30]:
# Per-row flag: 1 when the review body contains the product_id, 0 otherwise.
# product_id is cast to str before the substring test; a missing product_id
# simply never matches instead of raising TypeError inside `in`.
temp = pd.Series(
    [
        int(pd.notna(product_id) and str(product_id) in str(body))
        for product_id, body in zip(df['product_id'], df['review_body'])
    ],
    index=df.index,
    dtype='int64',
)

In [31]:
# Number of reviews that contain (1) / do not contain (0) their product_id.
temp.value_counts()

0    129890
1       110
dtype: int64

In [32]:
#only 110 reviews mention the product ID, so it is not a good criterion for marking a review as fake or legit

In [33]:
# The product_id check did not modify fake_review, so these are the final
# fake (1) vs. legit (0) counts.
df['fake_review'].value_counts()

0    97511
1    32489
Name: fake_review, dtype: int64