In [13]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.read_table('../../data/labeledTrainDataNLP.tsv')
df.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [14]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5705 entries, 0 to 5704
Data columns (total 3 columns):
id           5705 non-null object
sentiment    5705 non-null int64
review       5705 non-null object
dtypes: int64(1), object(2)
memory usage: 133.8+ KB
None


### 1.CLEANING DATA BY REMOVING PUNCTUATION EXCEPT FULLSTOPS

In [15]:
import nltk
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
def rempunc(text):
    text_nopunc="".join([char for char in text if char not in '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'])
    return (text_nopunc)

In [20]:
df['review_clean']=df['review'].apply(lambda x: rempunc(x))
df.head()

Unnamed: 0,id,sentiment,review,review_clean
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War of the Worlds by Timothy Hines...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts with a manager Nicholas Bell g...
3,3630_4,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...


### 2.SENTENCE TOKENIZING AND SAVING SENTENCES IN SEPERATE JSON FILE

In [21]:
from nltk import sent_tokenize

In [22]:
def div2sent(text):
    sent_text=sent_tokenize(text)
    return (sent_text)

In [23]:
df['review_sent']=df['review_clean'].apply(lambda x: div2sent(x))
df.head()

Unnamed: 0,id,sentiment,review,review_clean,review_sent
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,[With all this stuff going down at the moment ...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War of the Worlds by Timothy Hines...,[The Classic War of the Worlds by Timothy Hine...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts with a manager Nicholas Bell g...,[The film starts with a manager Nicholas Bell ...
3,3630_4,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,[It must be assumed that those who praised thi...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,[Superbly trashy and wondrously unpretentious ...


In [25]:
data_sentences=df['review_sent'].loc[0:10]
data_sentences.to_json('SENTENCES.json')

### 3.WORD TOKENIZING AND SAVING NOUNS IN SEPERATE JSON FILE

In [26]:
from nltk import word_tokenize

In [27]:
def div2word(text):
    word_text=nltk.word_tokenize(str(text))
    return (word_text)

In [28]:
is_noun = lambda pos: pos[:2] == 'NN'

In [29]:
def findnouns(text):
    word_text=div2word(text)
    nouns = [word for (word, pos) in nltk.pos_tag(word_text) if is_noun(pos)]
    return (nouns)

In [30]:
data_nouns=df['review_sent'].loc[0:10].apply(lambda x: findnouns(x))
data_nouns.to_json('NOUNS.json')

### 4.SAVING VERBS IN A SEPERATE FILE

In [31]:
is_verb = lambda pos: pos[:2] == 'VB'

In [32]:
def findverbs(text):
    word_text=div2word(text)
    verbs = [word for (word, pos) in nltk.pos_tag(word_text) if is_verb(pos)]
    return (verbs)

In [33]:
data_verbs=df['review_sent'].loc[0:10].apply(lambda x: findverbs(x))
data_verbs.to_json('VERBS.json')

### 5. REMOVING STOPWORDS AND SAVING ADJECTIVES IN A SEPERATE JSON FILE

In [77]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['review_tokenized'] = df['review_clean'].apply(lambda x: tokenize(x))

stopwords= nltk.corpus.stopwords.words('english')

def rmvstpwrds(text):
    text_without_stopwords= [word for word in text if word not in stopwords]
    return (text_without_stopwords)


In [55]:
df['review_without_stopwords']=df['review_tokenized'].apply(lambda x: rmvstpwrds(x))
df.head(10)

Unnamed: 0,id,sentiment,review,review_clean,review_sent,review_without_stopwords,review_tokenized
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,[With all this stuff going down at the moment ...,"[With, stuff, going, moment, MJ, ive, started,...","[With, all, this, stuff, going, down, at, the,..."
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War of the Worlds by Timothy Hines...,[The Classic War of the Worlds by Timothy Hine...,"[The, Classic, War, Worlds, Timothy, Hines, en...","[The, Classic, War, of, the, Worlds, by, Timot..."
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts with a manager Nicholas Bell g...,[The film starts with a manager Nicholas Bell ...,"[The, film, starts, manager, Nicholas, Bell, g...","[The, film, starts, with, a, manager, Nicholas..."
3,3630_4,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,[It must be assumed that those who praised thi...,"[It, must, assumed, praised, film, greatest, f...","[It, must, be, assumed, that, those, who, prai..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,[Superbly trashy and wondrously unpretentious ...,"[Superbly, trashy, wondrously, unpretentious, ...","[Superbly, trashy, and, wondrously, unpretenti..."
5,8196_8,1,I dont know why people think this is such a ba...,I dont know why people think this is such a ba...,[I dont know why people think this is such a b...,"[I, dont, know, people, think, bad, movie, Its...","[I, dont, know, why, people, think, this, is, ..."
6,7166_2,0,This movie could have been very good but comes...,This movie could have been very good but comes...,[This movie could have been very good but come...,"[This, movie, could, good, comes, way, short, ...","[This, movie, could, have, been, very, good, b..."
7,10633_1,0,I watched this video at a friends house. Im gl...,I watched this video at a friends house. Im gl...,"[I watched this video at a friends house., Im ...","[I, watched, video, friends, house, Im, glad, ...","[I, watched, this, video, at, a, friends, hous..."
8,319_1,0,A friend of mine bought this film for £1 and e...,A friend of mine bought this film for £1 and e...,[A friend of mine bought this film for £1 and ...,"[A, friend, mine, bought, film, 1, even, gross...","[A, friend, of, mine, bought, this, film, for,..."
9,8713_10,1,br br This movie is full of references. Like M...,br br This movie is full of references. Like M...,"[br br This movie is full of references., Like...","[br, br, This, movie, full, references, Like, ...","[br, br, This, movie, is, full, of, references..."


In [56]:
is_adjec = lambda pos: pos == 'JJ'
def findadjecs(text):
    word_text=div2word(text)
    adjecs = [word for (word, pos) in nltk.pos_tag(word_text) if is_adjec(pos)]
    return (adjecs)

In [60]:
data_adjectives=df['review_without_stopwords'].loc[0:100].apply(lambda x: findadjecs(x))
data_adjectives.to_json('ADJECTIVES.json')

### 6.STEMMING AND SAVING IN A SEPERATE JSON FILE

In [61]:
PS=nltk.PorterStemmer()

def stem_text(text):
    stemmedtext=[PS.stem(word) for word in text]
    return stemmedtext

In [63]:
data_stemmed=df['review_tokenized'].loc[0:100].apply(lambda x: stem_text(x))
data_stemmed.to_json('STEMMED_WORDS.json')

### 7.LEMMATIZING AND SAVING IN A SEPERATE JSON FILE

In [64]:
wnl=nltk.WordNetLemmatizer()

def lemma_text(text):
    lemmatizedtext=[wnl.lemmatize(word) for word in text]
    return lemmatizedtext

In [65]:
data_lemma=df['review_tokenized'].loc[0:100].apply(lambda x: lemma_text(x))
data_lemma.to_json('LEMMATIZED_WORDS.json')

In [66]:
df['review_LEMMATIZED']=df['review_tokenized'].apply(lambda x: lemma_text(x))

In [68]:
df['review_STEMMED']=df['review_tokenized'].apply(lambda x: stem_text(x))
df.head()

Unnamed: 0,id,sentiment,review,review_clean,review_sent,review_without_stopwords,review_tokenized,review_LEMMATIZED,review_STEMMED
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,[With all this stuff going down at the moment ...,"[With, stuff, going, moment, MJ, ive, started,...","[With, all, this, stuff, going, down, at, the,...","[With, all, this, stuff, going, down, at, the,...","[with, all, thi, stuff, go, down, at, the, mom..."
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War of the Worlds by Timothy Hines...,[The Classic War of the Worlds by Timothy Hine...,"[The, Classic, War, Worlds, Timothy, Hines, en...","[The, Classic, War, of, the, Worlds, by, Timot...","[The, Classic, War, of, the, Worlds, by, Timot...","[the, classic, war, of, the, world, by, timoth..."
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts with a manager Nicholas Bell g...,[The film starts with a manager Nicholas Bell ...,"[The, film, starts, manager, Nicholas, Bell, g...","[The, film, starts, with, a, manager, Nicholas...","[The, film, start, with, a, manager, Nicholas,...","[the, film, start, with, a, manag, nichola, be..."
3,3630_4,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,[It must be assumed that those who praised thi...,"[It, must, assumed, praised, film, greatest, f...","[It, must, be, assumed, that, those, who, prai...","[It, must, be, assumed, that, those, who, prai...","[It, must, be, assum, that, those, who, prais,..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,[Superbly trashy and wondrously unpretentious ...,"[Superbly, trashy, wondrously, unpretentious, ...","[Superbly, trashy, and, wondrously, unpretenti...","[Superbly, trashy, and, wondrously, unpretenti...","[superbl, trashi, and, wondrous, unpretenti, 8..."


### 8. SUMMARIZING EACH REVIEW AND SAVING IN A SEPERATE JSON FILE

In [71]:
pip install summa

Note: you may need to restart the kernel to use updated packages.


In [72]:
from summa.summarizer import summarize

In [74]:
df['review_summary']=df['review_clean'].apply(lambda x: summarize(x))

In [75]:
data_summary=df['review_summary'].loc[0:100]
data_summary.to_json('REVIEW_SUMMARY.json')

In [76]:
df.head()

Unnamed: 0,id,sentiment,review,review_clean,review_sent,review_without_stopwords,review_tokenized,review_LEMMATIZED,review_STEMMED,review_summary
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,[With all this stuff going down at the moment ...,"[With, stuff, going, moment, MJ, ive, started,...","[With, all, this, stuff, going, down, at, the,...","[With, all, this, stuff, going, down, at, the,...","[with, all, thi, stuff, go, down, at, the, mom...",Some of it has subtle messages about MJs feeli...
1,2381_9,1,The Classic War of the Worlds by Timothy Hines...,The Classic War of the Worlds by Timothy Hines...,[The Classic War of the Worlds by Timothy Hine...,"[The, Classic, War, Worlds, Timothy, Hines, en...","[The, Classic, War, of, the, Worlds, by, Timot...","[The, Classic, War, of, the, Worlds, by, Timot...","[the, classic, war, of, the, world, by, timoth...",We enjoyed the effort Mr. Hines put into being...
2,7759_3,0,The film starts with a manager Nicholas Bell g...,The film starts with a manager Nicholas Bell g...,[The film starts with a manager Nicholas Bell ...,"[The, film, starts, manager, Nicholas, Bell, g...","[The, film, starts, with, a, manager, Nicholas...","[The, film, start, with, a, manager, Nicholas,...","[the, film, start, with, a, manag, nichola, be...",A secret project mutating a primal animal usin...
3,3630_4,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,[It must be assumed that those who praised thi...,"[It, must, assumed, praised, film, greatest, f...","[It, must, be, assumed, that, those, who, prai...","[It, must, be, assumed, that, those, who, prai...","[It, must, be, assum, that, those, who, prais,...",
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,[Superbly trashy and wondrously unpretentious ...,"[Superbly, trashy, wondrously, unpretentious, ...","[Superbly, trashy, and, wondrously, unpretenti...","[Superbly, trashy, and, wondrously, unpretenti...","[superbl, trashi, and, wondrous, unpretenti, 8...",Superbly trashy and wondrously unpretentious 8...
