In [1]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.stats import ks_2samp
from wordcloud import WordCloud

# Fetch the NLTK resources this notebook relies on (tokenizer data, the
# stopword corpus, WordNet and the VADER lexicon).
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# English stopwords; the cells below use the set for membership tests.
stopwords = set(stopwords.words('english'))
sns.set_theme()

[nltk_data] Downloading package punkt to /home/vdubey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vdubey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vdubey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/vdubey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Load the review data and normalise its schema.
reviews_df = pd.read_csv('../data/fake_reviews.csv')
reviews_df = reviews_df.rename(columns={'text_': 'text'})
# Drop the trailing two characters of each category and turn underscores into
# spaces (literal replacement, not a regex).
reviews_df['category'] = reviews_df['category'].str[:-2].str.replace('_', ' ', regex=False)
reviews_df['rating'] = reviews_df['rating'].astype(int)

# Lower-cased, tokenized text with English stopwords filtered out.
reviews_df['text_no_stop'] = reviews_df['text'].apply(
    lambda s: ' '.join(token for token in word_tokenize(s.lower()) if token not in stopwords)
)

# Lower-cased text with every character of `string.punctuation` removed.
# The translation table is loop-invariant, so build it once rather than once
# per row.
punctuation_table = str.maketrans('', '', string.punctuation)
reviews_df['text_no_punct'] = reviews_df['text'].str.lower().str.translate(punctuation_table)

In [None]:
def preprocess_text(s):
    """Lower-case, tokenize, stopword-filter and lemmatize a string.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        The lemmatized tokens that are not in the module-level ``stopwords``
        set, joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    kept_tokens = (tok for tok in word_tokenize(s.lower()) if tok not in stopwords)
    return ' '.join(wordnet_lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [None]:
# Stopword-filter and lemmatize the punctuation-free text of every review.
reviews_df['lemma_text'] = reviews_df['text_no_punct'].map(preprocess_text)

In [None]:
# VADER is a rule-based scorer whose heuristics use negations ("not", "no"),
# intensifiers ("very", "most"), the conjunction "but", punctuation and
# capitalisation. `lemma_text` is lower-cased, has its punctuation stripped
# and its stopwords removed -- and NLTK's English stopword list contains those
# negation/intensifier words -- so scoring it discards the very cues VADER
# relies on (e.g. "not good" would be scored as "good"). Score the original
# review text instead.
sent_analyzer = SentimentIntensityAnalyzer()
reviews_df['sentiment'] = reviews_df['text'].apply(sent_analyzer.polarity_scores)

# Unpack each review's score dict into one numeric column per VADER key.
sentiment_columns = {
    'neg': 'neg_sentiment',
    'pos': 'pos_sentiment',
    'neu': 'neu_sentiment',
    'compound': 'comp_sentiment',
}
for score_key, column_name in sentiment_columns.items():
    # `key=score_key` binds the current key at definition time.
    reviews_df[column_name] = reviews_df['sentiment'].apply(
        lambda scores, key=score_key: scores[key]
    )

reviews_df.head()

In [None]:
# Count whitespace-separated tokens per review. `split()` without an argument
# collapses runs of whitespace and ignores leading/trailing whitespace, so the
# gaps left behind by punctuation removal (and empty reviews) are not counted
# as words -- `split(' ')` counted each of those as an extra "word".
reviews_df['num_words'] = reviews_df['text_no_punct'].str.split().str.len()

In [None]:
def plot_word_cloud(words_ser):
    """Show a word cloud built from all strings in ``words_ser``.

    Parameters
    ----------
    words_ser : iterable of str
        Documents to combine; they are joined with single spaces before the
        cloud is generated. Words in the module-level ``stopwords`` set are
        passed to ``WordCloud`` as its stopword list.
    """
    # Join every document, append a trailing space, then replace each double
    # space with a single one (one pass).
    corpus = " ".join(words_ser) + ' '
    corpus = corpus.replace('  ', ' ')

    cloud_settings = dict(
        width=800,
        height=800,
        background_color='white',
        stopwords=stopwords,
        min_font_size=10,
    )
    wordcloud = WordCloud(**cloud_settings).generate(corpus)

    # Square figure with the axes hidden and no padding around the image.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()

In [None]:
# Partition the reviews by their `label` value: 'CG' vs 'OR'.
is_cg_label = reviews_df['label'] == 'CG'
is_or_label = reviews_df['label'] == 'OR'
cg_reviews = reviews_df[is_cg_label]
or_reviews = reviews_df[is_or_label]

In [None]:
# Word cloud over the punctuation-free text of every review.
plot_word_cloud(reviews_df['text_no_punct'])

In [None]:
# Word cloud restricted to the 'CG'-labelled reviews.
fake_reviews = cg_reviews['text_no_punct']
plot_word_cloud(words_ser=fake_reviews)

In [None]:
# Word cloud restricted to the 'OR'-labelled reviews.
real_reviews = or_reviews['text_no_punct']
plot_word_cloud(words_ser=real_reviews)

In [None]:
# Overlaid, density-normalised word-count histograms for the two review
# classes, drawn on shared 25-word-wide bins with edges from 0 to 350.
bins = np.arange(0, 375, 25)
plt.figure(figsize=(12, 8))

word_count_groups = (
    (cg_reviews.num_words, 'computer generated reviews'),
    (or_reviews.num_words, 'original reviews'),
)
for word_counts, legend_label in word_count_groups:
    plt.hist(word_counts, label=legend_label, alpha=0.5, bins=bins, density=True)

plt.legend()
plt.xlabel('number of words')
plt.ylabel('density')
plt.show()

In [None]:
# `ks_2samp` is imported in the notebook's top import cell so that the later
# KS-test cells do not depend on this cell having been run first.
print(f'Median num words (CG): {cg_reviews.num_words.median()}')
print(f'Median num words (OR): {or_reviews.num_words.median()}')
print()

# Testing if original reviews have more words than computer generated reviews.
# With alternative='greater' the alternative hypothesis is that the CDF of the
# first sample (CG) lies above that of the second (OR) somewhere, i.e. CG word
# counts tend to be smaller.
res = ks_2samp(cg_reviews.num_words, or_reviews.num_words, alternative='greater')

print(f'Test Stat: {res.statistic}')
print(f'P-value: {res.pvalue}')

In [None]:
# 2x2 grid of overlaid, density-normalised histograms: one panel per sentiment
# score column, each comparing the CG and OR review subsets.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

hist_panels = (
    ('comp_sentiment', 'sentiment'),
    ('pos_sentiment', 'positive sentiment'),
    ('neu_sentiment', 'neutral sentiment'),
    ('neg_sentiment', 'negative sentiment'),
)
# `axes.flat` walks the grid row by row, which matches the panel order above.
for panel_ax, (score_column, x_label) in zip(axes.flat, hist_panels):
    panel_ax.hist(cg_reviews[score_column], alpha=0.5, label='computer generated reviews', density=True)
    panel_ax.hist(or_reviews[score_column], alpha=0.5, label='original reviews', density=True)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('density')
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
cg_comp_median = cg_reviews['comp_sentiment'].median()
or_comp_median = or_reviews['comp_sentiment'].median()
print(f'Median compound sentiment (CG): {cg_comp_median}')
print(f'Median compound sentiment (OR): {or_comp_median}')
print()

# Testing if computer-generated reviews have more compound sentiment than
# original reviews (one-sided two-sample KS test, alternative='less').
res = ks_2samp(
    cg_reviews['comp_sentiment'],
    or_reviews['comp_sentiment'],
    alternative='less',
)

print(f'Test Stat: {res.statistic}')
print(f'P-value: {res.pvalue}')

In [None]:
# The medians printed here are of the negative-sentiment score, so label them
# as such (the labels previously said "compound sentiment").
print(f'Median negative sentiment (CG): {cg_reviews.neg_sentiment.median()}')
print(f'Median negative sentiment (OR): {or_reviews.neg_sentiment.median()}')
print()

# Testing if computer-generated reviews have less negative sentiment than
# original reviews (one-sided two-sample KS test, alternative='greater').
res = ks_2samp(cg_reviews.neg_sentiment, or_reviews.neg_sentiment, alternative='greater')

print(f'Test Stat: {res.statistic}')
print(f'P-value: {res.pvalue}')

In [None]:
def plot_sentiment_by_rating(emotion):
    """Draw side-by-side box plots of one sentiment score grouped by rating.

    The left panel is drawn from the module-level ``or_reviews`` frame and the
    right panel from ``cg_reviews``.

    Parameters
    ----------
    emotion : str
        Prefix of the score column to plot; the column read from each frame is
        ``f'{emotion}_sentiment'``.
    """
    score_column = f'{emotion}_sentiment'
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    panels = ((or_reviews, 'OR Reviews'), (cg_reviews, 'CG Reviews'))
    for panel_ax, (frame, panel_title) in zip(axes, panels):
        sns.boxplot(ax=panel_ax, x='rating', y=score_column, data=frame)
        panel_ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

In [None]:
# Box plots of the `comp_sentiment` column by rating.
plot_sentiment_by_rating(emotion='comp')

In [None]:
# Box plots of the `pos_sentiment` column by rating.
plot_sentiment_by_rating(emotion='pos')

In [None]:
# Box plots of the `neu_sentiment` column by rating.
plot_sentiment_by_rating(emotion='neu')

In [None]:
# Box plots of the `neg_sentiment` column by rating.
plot_sentiment_by_rating(emotion='neg')