In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import download
import re

# Fetch the NLTK data packages used by this notebook. When a package is
# already present locally the call only logs that it is up to date.
# NOTE(review): presumably 'stopwords' backs stopwords.words(), 'punkt' backs
# word_tokenize() and 'wordnet'/'omw-1.4' back WordNetLemmatizer — confirm
# against the installed NLTK version.
download('stopwords')
download('punkt')
download('wordnet')
# Last expression of the cell: its return value is what the cell displays.
download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Number of reviews to analyse. The full dataset is large, so only the
# leading rows are loaded.
N_REVIEWS = 10

# Load only the review text column and let the parser stop after the first
# N_REVIEWS rows, instead of reading the whole file and slicing afterwards.
# This also produces an independent frame rather than a slice of a larger
# one, so adding columns to it later cannot trigger SettingWithCopyWarning.
df = pd.read_csv('20191226-reviews.csv', usecols=['body'], nrows=N_REVIEWS)

# Shared NLP helpers used by text_prep().
lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')

df.head()

Unnamed: 0,body
0,I had the Samsung A600 for awhile which is abs...
1,Due to a software issue between Nokia and Spri...
2,"This is a great, reliable phone. I also purcha..."
3,"I love the phone and all, because I really did..."
4,The phone has been great for every purpose it ...


In [3]:
def text_prep(text):
    """Convert one raw review into a list of lemmatized, stop-word-free tokens.

    The text is lowercased, every run of characters outside a-z is collapsed
    to a single space, the result is tokenized, stop words are dropped and
    the remaining tokens are lemmatized.

    Args:
        text: The raw review. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        list[str]: The lemmatized tokens in their original order; an empty
        list when the input is missing or contains no alphabetic characters.
    """
    # A missing review would otherwise be stringified to "none"/"nan" and
    # leak into the output as a bogus token. `text != text` is only true
    # for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Lowercase first so the a-z character class below covers every letter.
    corpus = str(text).lower()

    # Replace each run of non-alphabetic characters (digits, punctuation,
    # whitespace, ...) with one space, then trim the edges.
    corpus = re.sub('[^a-z]+', ' ', corpus).strip()

    # Nothing alphabetic left: no need to invoke the tokenizer.
    if not corpus:
        return []

    tokens = word_tokenize(corpus)

    # Stop words are filtered before lemmatization, i.e. the check is made
    # on the raw token, not on its lemma.
    return [lemma.lemmatize(t) for t in tokens if t not in stop_words]

In [4]:
# Preprocess every review in the dataset: run each body through text_prep()
# and store the resulting token lists next to the original text.

lemmatized_reviews = [text_prep(review) for review in df['body']]

df['lemmatized_text'] = lemmatized_reviews

df.head()

Unnamed: 0,body,lemmatized_text
0,I had the Samsung A600 for awhile which is abs...,"[samsung, awhile, absolute, doo, doo, read, re..."
1,Due to a software issue between Nokia and Spri...,"[due, software, issue, nokia, sprint, phone, t..."
2,"This is a great, reliable phone. I also purcha...","[great, reliable, phone, also, purchased, phon..."
3,"I love the phone and all, because I really did...","[love, phone, really, need, one, expect, price..."
4,The phone has been great for every purpose it ...,"[phone, great, every, purpose, offer, except, ..."


In [5]:
# Load the collections of negative and positive words.
# Each file is read in full and split on whitespace; the `with` blocks make
# sure the file handles are closed once the contents have been read.
# NOTE(review): split() keeps every whitespace-separated token, so any
# header or comment lines in these files would be loaded as words too —
# confirm the files contain only the word lists.

with open('negative-words.txt', 'r') as file:
    neg_words = file.read().split()

with open('positive-words.txt', 'r') as file:
    pos_words = file.read().split()


In [6]:
# Count the positive and negative words in each review and derive a
# sentiment score in [-1, 1]:
#     (pos_count - neg_count) / (pos_count + neg_count)
# rounded to two decimals.

# Set copies give constant-time membership tests; the original lists are
# left untouched.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)

pos_counts = []
neg_counts = []
sentiments = []

for text in df['lemmatized_text']:
    pos_count = 0
    neg_count = 0

    for word in text:
        # A word present in both lexicons is counted as positive only.
        if word in pos_lookup:
            pos_count += 1
        elif word in neg_lookup:
            neg_count += 1

    pos_counts.append(pos_count)
    neg_counts.append(neg_count)

    total = pos_count + neg_count
    if total:
        sentiments.append(round((pos_count - neg_count) / total, 2))
    else:
        # No sentiment-bearing words at all: score the review as neutral
        # instead of dividing by zero.
        sentiments.append(0.0)


df['pos_count'] = pos_counts
df['neg_count'] = neg_counts
df['sentiment'] = sentiments

df.head()

Unnamed: 0,body,lemmatized_text,pos_count,neg_count,sentiment
0,I had the Samsung A600 for awhile which is abs...,"[samsung, awhile, absolute, doo, doo, read, re...",18,18,0.0
1,Due to a software issue between Nokia and Spri...,"[due, software, issue, nokia, sprint, phone, t...",8,3,0.45
2,"This is a great, reliable phone. I also purcha...","[great, reliable, phone, also, purchased, phon...",10,4,0.43
3,"I love the phone and all, because I really did...","[love, phone, really, need, one, expect, price...",3,0,1.0
4,The phone has been great for every purpose it ...,"[phone, great, every, purpose, offer, except, ...",5,3,0.25
