In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import download
import re

# Fetch the NLTK data packages this notebook relies on: the stop-word lists,
# the Punkt tokenizer models and the WordNet data for the lemmatizer.
download('stopwords')
download('punkt')
# NLTK >= 3.8.2 loads 'punkt_tab' instead of 'punkt' inside word_tokenize();
# without it tokenization raises LookupError. 'punkt' is kept for older releases.
download('punkt_tab')
download('wordnet')
download('omw-1.4')

In [None]:
# Only the first N_REVIEWS reviews of the large dataset are used, so let
# read_csv stop after that many rows instead of parsing the whole file and
# slicing the result afterwards.
N_REVIEWS = 10
df = pd.read_csv('20191226-reviews.csv', usecols=['body'], nrows=N_REVIEWS)

# Shared objects read by text_prep()
lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')

df.head()

In [5]:
def text_prep(text):
    """Turn one review into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase the text, replace every run of characters outside a-z
    with a single space, tokenize with ``word_tokenize``, drop tokens found
    in the module-level ``stop_words`` and lemmatize the rest with the
    module-level ``lemma``.

    Args:
        text: The raw review. Non-string values are converted with ``str()``.
            ``None`` and float NaN (e.g. an empty CSV cell) are treated as a
            missing review.

    Returns:
        list: The lemmatized tokens in their original order; an empty list
        for a missing review.
    """
    # Without this guard str() would turn a missing value into the literal
    # text "none"/"nan", which would then be processed like a real review.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Lowercase first so the a-z character class below also keeps letters
    # that were originally uppercase.
    corpus = str(text).lower()

    # Digits, punctuation and any other non a-z character become spaces;
    # then trim the leading/trailing space this can leave behind.
    corpus = re.sub('[^a-z]+', ' ', corpus).strip()

    # Tokenize, filter stop words and lemmatize in a single pass.
    return [
        lemma.lemmatize(token)
        for token in word_tokenize(corpus)
        if token not in stop_words
    ]

In [None]:
# Run text_prep over every review body and store the resulting token lists
# in a new 'lemmatized_text' column.

lemmatized_reviews = [text_prep(review) for review in df['body']]
df['lemmatized_text'] = lemmatized_reviews

df.head()

In [16]:
# Load the collections of negative and positive words. Each file is read in
# full and split on whitespace, giving one list entry per token.
# The `with` blocks close each file handle as soon as it has been read,
# instead of leaving it open for the life of the kernel.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how these two files are encoded.

with open('negative-words.txt', 'r') as neg_file:
    neg_words = neg_file.read().split()

with open('positive-words.txt', 'r') as pos_file:
    pos_words = pos_file.read().split()


In [None]:
# Count the positive and negative words in each review and derive a sentiment
# score: (pos - neg) / (pos + neg), rounded to 2 decimals. The score is
# +1 when only positive words matched and -1 when only negative words matched.

# Sets make each membership test constant-time; the original word lists are
# left untouched.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)

pos_counts = []
neg_counts = []
sentiments = []

for text in df['lemmatized_text']:
    pos_count = 0
    neg_count = 0

    for word in text:
        # A word that appears in both lists is counted as positive only.
        if word in pos_lookup:
            pos_count += 1
        elif word in neg_lookup:
            neg_count += 1

    pos_counts.append(pos_count)
    neg_counts.append(neg_count)

    # A review without any lexicon word would divide by zero; score it as
    # neutral (0.0) instead of raising ZeroDivisionError.
    matched = pos_count + neg_count
    sentiments.append(round((pos_count - neg_count) / matched, 2) if matched else 0.0)


df['pos_count'] = pos_counts
df['neg_count'] = neg_counts
df['sentiment'] = sentiments

df.head()