In [1]:
import os
import json
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
import string
import editdistance
from textstat import flesch_reading_ease
import nltk
import MoreThanSentiments as mts
from nltk.util import pairwise 
from vaderSentiment.vaderSentiment import NEGATE, BOOSTER_DICT
import math
import re
from itertools import product
import nltk.data
from vaderSentiment.vaderSentiment import NEGATE, BOOSTER_DICT

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Set the working directory
os.chdir('C:/Users/jesse/OneDrive/Documenten/Thesis/amazon_code/dataframes_done')

In [3]:
# Load the reviews from the JSON file
with open('output.json', 'r') as f:
    reviews = json.load(f)

# DataFrame
reviews_df = pd.DataFrame(reviews)

In [5]:
# Quantity
result_quantity = []

for review in reviews:
    text = review['Review_Text']
    # Number of words
    words = text.split()
    num_words = len(words)
    # Number of sentences
    sentences = text.split('.')
    num_sentences = len(sentences)
    # Number of caps
    num_caps = sum(1 for c in text if c.isupper())
    # Number of punctuation
    num_punctuation = sum(text.count(p) for p in string.punctuation)
     # Part of speech
    pos_tags = nltk.pos_tag(words)
    noun_count = len([word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']])
    verb_count = len([word for word, pos in pos_tags if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']])
    adj_count = len([word for word, pos in pos_tags if pos in ['JJ', 'JJR', 'JJS']])
    adv_count = len([word for word, pos in pos_tags if pos in ['RB', 'RBR', 'RBS']])


    # linguistic features results
    result_quantity.append({
        'Number_of_words': num_words,
        'Number_of_sentences': num_sentences,
        'Number_of_caps': num_caps,
        'Number_of_punctuation': num_punctuation,
        'Number_of_nouns': noun_count,
        'Number_of_verbs': verb_count,
        'Number_of_adjectives': adj_count,
        'Number_of_adverbs': adv_count
    })

# Create a result dataframe
df_qua_VADER = pd.DataFrame(result_quantity)

# Dataframe to Json file
df_qua_VADER.to_json('df_qua_VADER.json', orient='records')

In [11]:
# Complexity

# Redundancy function
def calculate_redundancy(text):
    words = text.split()
    
    # Sum of Levenshtein distances between all pairs of words
    total_distance = sum(editdistance.eval(w1, w2) for i, w1 in enumerate(words) for j, w2 in enumerate(words) if i < j)
    
    # Average Levenshtein distance
    n = len(words)
    if n > 1:
        average_distance = total_distance / (n * (n - 1) / 2)
    else:
        average_distance = 0
    
    # Return the redundancy
    return 1 - average_distance / len(max(words, key=len))

results_complexity = []

for review in reviews:
    text = review['Review_Text']
    # Number of words
    words = text.split()
    num_words = len(words)
    # Number of sentences
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    # Average word length
    total_word_length = sum(len(word) for word in words)
    avg_word_length = total_word_length / num_words
    # Average sentence length
    total_sentence_length = sum(len(sent) for sent in sentences)
    avg_sentence_length = total_sentence_length / num_sentences
    # Redundance score
    redundancy = calculate_redundancy(text)
    # Readability score
    readability_score = flesch_reading_ease(text)

    # linguistic features results
    results_complexity.append({
        'Average_word_length': avg_word_length,
        'Average_sentence_length': avg_sentence_length,
        'Redundancy_score': redundancy,
        'Readability_score': readability_score,
})

# Result dataframe
df_com_VADER = pd.DataFrame(results_complexity)

# Dataframe to Json file
df_com_VADER.to_json('df_com_VADER.json', orient='records')

KeyboardInterrupt: 

In [16]:
# Diversity
results_diversity = []

for review in reviews:
    text = review['lemmatized']
    words = text
    num_words = len(words)
    # Lexical diversity
    unique_words = set(words)
    lexical_diversity = len(unique_words) / num_words

    # Linguistic features results
    results_diversity.append({
        'Lexical_diversity': lexical_diversity
    })


# Results dataframe
df_div_VADER = pd.DataFrame(results_diversity)

# Dataframe to Json file
df_div_VADER.to_json('df_div_VADER.json', orient='records')

Unnamed: 0,Lexical_diversity
0,0.851852
1,0.727273
2,0.584615
3,0.764706
4,0.545455


In [23]:
# Emotion

# VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Sentiment score function
def get_sentiment_scores(text):
    scores = sia.polarity_scores(text)
    return scores['compound'], scores['pos'], scores['neg'], scores['neu']

# Polarity function
def get_polarity_categories(text):
    words = text.split()
    polarities = {'pos': 0, 'neg': 0, 'neu': 0}
    for word in words:
        scores = sia.polarity_scores(word)
        for key in polarities.keys():
            if scores[key] > 0:
                polarities[key] += 1
    return polarities

# Polarity shifters function
def count_polarity_shifters(text):
    shifters = ['but', 'however', 'although', 'yet', 'nevertheless']
    count = 0
    for word in text:
        if word.lower() in shifters:
            count += 1
    return count

# Intensity modifiers function
intensity_modifier_words = BOOSTER_DICT
def count_intensity_modifiers(text):
    modifiers = intensity_modifier_words
    count = 0
    for word in text:
        if word.lower() in modifiers:
            count += 1
    return count

# Negations function
negation_words = NEGATE
def count_negations(text):
    negations = negation_words 
    count = 0
    for word in text:
        if word.lower() in negations:
            count += 1
    return count

def count_emoticons(text):
    sid = SentimentIntensityAnalyzer()
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    scores = sid.polarity_scores(text)
    num_emoticons = len(emoticons)
    return num_emoticons

# Empty dataframe to store results
columns = ['sentiment_score', 'positive_score', 'negative_score', 'neutral_score',            
           'positive_words', 'negative_words', 'neutral_words',           
           'polarity_shifters', 'intensity_modifiers', 'negations', 'emoticons']
df_emo_VADER = pd.DataFrame(columns=columns)

# Loop over reviews to extract features
for index, row in reviews_df.iterrows():
    text = row['lemmatized']
    text2 = row['Review_Text']
    sentiment_score, positive_score, negative_score, neutral_score = get_sentiment_scores(text2)
    polarities = get_polarity_categories(text2)
    polarity_shifters = count_polarity_shifters(text)
    intensity_modifiers = count_intensity_modifiers(text)
    negations = count_negations(text)
    emoticons = count_emoticons(text2)
    row_results = [sentiment_score, positive_score, negative_score, neutral_score, 
                   polarities['pos'], polarities['neg'], polarities['neu'],
                   polarity_shifters, intensity_modifiers, negations, emoticons]
    df_emo_VADER.loc[index] = row_results

# write the dataframe to a Json file
df_emo_VADER.to_json('df_emo_VADER.json', orient='records')

Unnamed: 0,sentiment_score,positive_score,negative_score,neutral_score,positive_words,negative_words,neutral_words,polarity_shifters,intensity_modifiers,negations,emoticons
0,0.7269,0.217,0.0,0.783,2.0,0.0,22.0,0.0,1.0,0.0,0.0
1,0.8019,0.137,0.06,0.803,2.0,3.0,65.0,1.0,1.0,2.0,0.0
2,0.7557,0.165,0.053,0.782,6.0,0.0,45.0,1.0,3.0,3.0,0.0
3,0.6588,0.099,0.0,0.901,1.0,0.0,40.0,0.0,3.0,1.0,0.0
4,0.7964,0.116,0.0,0.884,3.0,0.0,62.0,0.0,1.0,0.0,0.0


In [None]:
# Set the working directory
os.chdir('C:/Users/jesse/OneDrive/Documenten/Thesis/amazon_code/dataframes')

# Load dataframes
df_qua_VADER = pd.read_json('df_qua_VADER.json')
df_com_VADER = pd.read_json('df_com_VADER.json')
df_div_VADER = pd.read_json('df_div_VADER.json')
df_emo_VADER = pd.read_json('df_emo_VADER.json')

# Concatenate dataframes
VADER_df = pd.concat([reviews_df[['Label']], df_qua_VADER, df_com_VADER, df_div_VADER, df_emo_VADER], axis=1)

# Dataframe to Json file
VADER_df.to_json('VADER_df.json', orient='records')

VADER_df.head()

In [None]:
# Rename the labels
VADER_df['Label'] = VADER_df['Label'].replace({'__label1__': 'fake', '__label2__': 'real'})

# Group the data by label and calculate statistics
statistics_VADER = VADER_df.groupby('Label').describe().transpose()

# Print statistics
print(statistics_VADER)

statistics_VADER.to_excel('statistics_VADER.xlsx')

Label                    fake          real
DOC_ID    count  10500.000000  10500.000000
          mean    5250.500000  15750.500000
          std     3031.233247   3031.233247
          min        1.000000  10501.000000
          25%     2625.750000  13125.750000
...                       ...           ...
emoticons min        0.000000      0.000000
          25%        0.000000      0.000000
          50%        0.000000      0.000000
          75%        0.000000      0.000000
          max        4.000000      6.000000

[208 rows x 2 columns]
