In [40]:
import pandas as pd
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scotjaco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scotjaco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\scotjaco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\scotjaco\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [41]:
# Load the reviews from the project-relative CSV file into a DataFrame.
data = pd.read_csv("data/reviews.csv")

In [42]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


In [43]:
# Drop the 'Id' column, which none of the later cells read.
# errors='ignore' keeps this cell idempotent: re-running it once 'Id' is
# already gone is a no-op instead of raising a KeyError.
data = data.drop(columns='Id', errors='ignore')

In [44]:
# strip everything except ASCII letters from a piece of text
def clean(text):
    """Return `text` with every run of non-letter characters replaced by one space.

    Only the ASCII letters A-Z and a-z are kept. Any run of other characters
    (digits, punctuation, whitespace, ...) collapses into a single space, so
    the result may begin or end with a space.
    """
    non_letters = re.compile('[^A-Za-z]+')
    return non_letters.sub(' ', text)

# Store a letters-only version of each review next to the original text,
# then preview the result.
data['Cleaned Reviews'] = data['Review'].apply(clean)
data.head()

In [49]:
import nltk
# Each nltk.download() call fetches a resource used by this notebook; the
# logged output reports "already up-to-date" when it is installed already.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

# POS tagger dictionary: maps the first letter of a tag produced by pos_tag
# to the matching WordNet part-of-speech constant (adjective, verb, noun,
# adverb). Any other first letter is absent, so .get() returns None for it.
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

# English stopword set; built on first use and then shared by every call
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the English stopword set, loading it from NLTK only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


# function to tokenize, remove stopwords, and add part of speech data
def token_stop_pos(text):
    """Tokenize `text`, POS-tag it, and drop English stopwords.

    Tagging runs on the full token sequence *before* stopwords are removed,
    so the tagger still sees each word in its original context.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    list of tuple
        One ``(word, pos)`` pair per kept token, in order. ``word`` keeps its
        original casing; ``pos`` is the ``pos_dict`` entry for the first
        letter of the token's tag, or ``None`` when that letter has no entry.
    """
    # Previously the stopword set was rebuilt for every single token; it is
    # loop-invariant, so fetch the cached set once per call instead.
    stop_words = _english_stopwords()
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
        if word.lower() not in stop_words  # stopword match is case-insensitive
    ]

# Tokenize and POS-tag every cleaned review; each 'POS tagged' cell holds the
# list of (word, pos) tuples returned by token_stop_pos.
data['POS tagged'] = data['Cleaned Reviews'].apply(token_stop_pos)
data.head()

Unnamed: 0,Review,Label,Cleaned Reviews,POS tagged
0,good and interesting,5,good and interesting,"[(good, a), (interesting, a)]"
1,"This class is very helpful to me. Currently, I...",5,This class is very helpful to me Currently I m...,"[(class, n), (helpful, a), (Currently, n), (st..."
2,like!Prof and TAs are helpful and the discussi...,5,like Prof and TAs are helpful and the discussi...,"[(like, None), (Prof, n), (TAs, n), (helpful, ..."
3,Easy to follow and includes a lot basic and im...,5,Easy to follow and includes a lot basic and im...,"[(Easy, n), (follow, v), (includes, v), (lot, ..."
4,Really nice teacher!I could got the point eazl...,4,Really nice teacher I could got the point eazl...,"[(Really, r), (nice, a), (teacher, n), (could,..."


In [53]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, reused by lemmatize() for every review.
wordnet_lemmatizer = WordNetLemmatizer()

# turn POS-tagged tokens into one string of lemmas
def lemmatize(pos_data):
    """Join the lemmas of POS-tagged tokens into a single string.

    Parameters
    ----------
    pos_data : iterable of (word, pos)
        Pairs as produced by token_stop_pos. A falsy ``pos`` (e.g. ``None``)
        means the word is kept unchanged; otherwise the word is lemmatized
        with ``wordnet_lemmatizer`` using that part of speech.

    Returns
    -------
    str
        A single space followed by each lemma prefixed with a space, so a
        non-empty result starts with two spaces and an empty input gives " ".
    """
    lemmas = [
        wordnet_lemmatizer.lemmatize(token, pos=tag) if tag else token
        for token, tag in pos_data
    ]
    # One-space seed plus a space before every lemma.
    return " " + "".join(" " + lemma for lemma in lemmas)

# Build the lemmatized text for every review from its POS-tagged tokens,
# then preview the result.
data['Lemma'] = data['POS tagged'].apply(lemmatize)
data.head()

Unnamed: 0,Review,Label,Cleaned Reviews,POS tagged,Lemma
0,good and interesting,5,good and interesting,"[(good, a), (interesting, a)]",good interesting
1,"This class is very helpful to me. Currently, I...",5,This class is very helpful to me Currently I m...,"[(class, n), (helpful, a), (Currently, n), (st...",class helpful Currently still learn class ma...
2,like!Prof and TAs are helpful and the discussi...,5,like Prof and TAs are helpful and the discussi...,"[(like, None), (Prof, n), (TAs, n), (helpful, ...",like Prof TAs helpful discussion among stude...
3,Easy to follow and includes a lot basic and im...,5,Easy to follow and includes a lot basic and im...,"[(Easy, n), (follow, v), (includes, v), (lot, ...",Easy follow include lot basic important tech...
4,Really nice teacher!I could got the point eazl...,4,Really nice teacher I could got the point eazl...,"[(Really, r), (nice, a), (teacher, n), (could,...",Really nice teacher could get point eazliy v


In [55]:
# Compare each original review with its lemmatized form side by side.
data[['Review', 'Lemma']]

Unnamed: 0,Review,Lemma
0,good and interesting,good interesting
1,"This class is very helpful to me. Currently, I...",class helpful Currently still learn class ma...
2,like!Prof and TAs are helpful and the discussi...,like Prof TAs helpful discussion among stude...
3,Easy to follow and includes a lot basic and im...,Easy follow include lot basic important tech...
4,Really nice teacher!I could got the point eazl...,Really nice teacher could get point eazliy v
...,...,...
107013,Trendy topic with talks from expertises in the...,Trendy topic talk expertise field Covered ar...
107014,"Wonderful! Simple and clear language, good ins...",Wonderful Simple clear language good instruc...
107015,an interesting and fun course. thanks. dr quincy,interesting fun course thanks dr quincy
107016,"very broad perspective, up to date information...",broad perspective date information useful li...


In [57]:
from textblob import TextBlob

# function to calculate subjectivity
def getSubjectivity(review):
    """Return the subjectivity score TextBlob assigns to `review`."""
    sentiment = TextBlob(review).sentiment
    return sentiment.subjectivity


# function to calculate polarity
def getPolarity(review):
    """Return the polarity score TextBlob assigns to `review`."""
    sentiment = TextBlob(review).sentiment
    return sentiment.polarity
    
# map a polarity score to a sentiment label
def analysis(score):
    """Classify a polarity score as 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are 'Negative', a score equal to zero is 'Neutral',
    and every other value is 'Positive'.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

In [62]:
# Build the results table from the original review and its lemmatized text.
final_data = pd.DataFrame(data[['Review', 'Lemma']])
# Score the lemmatized text (not the raw review) with TextBlob.
final_data['Subjectivity'] = final_data['Lemma'].apply(getSubjectivity)
final_data['Polarity'] = final_data['Lemma'].apply(getPolarity) 
# Turn the numeric polarity into a Negative / Neutral / Positive label.
final_data['Analysis'] = final_data['Polarity'].apply(analysis)

In [66]:
# Show the reviews ordered from lowest to highest polarity. The sorted frame
# is only displayed; final_data itself is left unchanged.
final_data.sort_values(by=['Polarity'], ascending=True)

Unnamed: 0,Review,Lemma,Subjectivity,Polarity,Analysis
40890,Course content sometimes too boring,Course content sometimes boring,1.0,-1.0,Negative
59150,Insane knowledge in this course for anyone who...,Insane knowledge course anyone want get touc...,1.0,-1.0,Negative
49761,Boring,Boring,1.0,-1.0,Negative
5566,"Boring, didn't have strength to finish",Boring strength finish,1.0,-1.0,Negative
32674,Horrible class. Shame it's in the series.,Horrible class Shame series,1.0,-1.0,Negative
...,...,...,...,...,...
26455,wonderful course. Thank you.,wonderful course Thank,1.0,1.0,Positive
34133,One of the best course through Coursera,One best course Coursera,0.3,1.0,Positive
34136,An excellent course it helped me to think and ...,excellent course help think review,1.0,1.0,Positive
96071,An excellent resource for violin teachers!!!,excellent resource violin teacher,1.0,1.0,Positive


In [None]:
# Display the final table: review, lemmatized text and the sentiment columns.
final_data