In [30]:
import nltk
# Fetch the NLTK data packages this notebook relies on; packages that are
# already present are only reported as up-to-date.
# English stop-word lists, read later through stopwords.words('english').
nltk.download('stopwords') 
# WordNet data; not referenced by the cells visible in this part of the notebook.
nltk.download('wordnet')
# Labelled movie review corpus that the `sample` frame is built from.
nltk.download('movie_reviews')
# Punkt tokenizer models; presumably required by word_tokenize -- confirm against the NLTK docs.
nltk.download('punkt') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
import pandas as pd
from nltk.corpus import movie_reviews, stopwords
# Build one (label, raw text) record per corpus file. File ids have the form
# "<label>/<file name>", so the text before the slash is the label. A
# comprehension avoids the unused `filename` value and keeps loop variables
# out of the notebook namespace.
reviews = [
    (fileid.split('/', 1)[0], movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
]
sample = pd.DataFrame(reviews, columns=['target', 'document'])
print(f'Dimensions: {sample.shape}')
sample.head()

Dimensions: (2000, 2)


Unnamed: 0,target,document
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [3]:
# Show the last rows of the frame to check the end of the corpus as well as the start.
sample.tail()

Unnamed: 0,target,document
1995,pos,wow ! what a movie . \nit's everything a movie...
1996,pos,"richard gere can be a commanding actor , but h..."
1997,pos,"glory--starring matthew broderick , denzel was..."
1998,pos,steven spielberg's second epic film on world w...
1999,pos,"truman ( "" true-man "" ) burbank is the perfect..."


In [4]:
# Count the reviews per label to check the class balance.
sample['target'].value_counts()

pos    1000
neg    1000
Name: target, dtype: int64

# ข้อ 1.
จงเขียนโปรแกรมเพื่อทำการตัดคำ ด้วย word_tokenize โดยสร้างเป็น column ใหม่ชื่อ word_token 

In [58]:
from nltk.tokenize import word_tokenize
# Tokenise every review with word_tokenize, then attach the per-review token
# lists to the frame as the new `word_token` column.
word_token = sample['document'].apply(word_tokenize)
sample = sample.assign(word_token=word_token.values)
sample

Unnamed: 0,target,document,word_token
0,neg,"plot : two teen couples go to a church party ,...","[plot, :, two, teen, couples, go, to, a, churc..."
1,neg,the happy bastard's quick movie review \ndamn ...,"[the, happy, bastard, 's, quick, movie, review..."
2,neg,it is movies like these that make a jaded movi...,"[it, is, movies, like, these, that, make, a, j..."
3,neg,""" quest for camelot "" is warner bros . ' firs...","[``, quest, for, camelot, ``, is, warner, bros..."
4,neg,synopsis : a mentally unstable man undergoing ...,"[synopsis, :, a, mentally, unstable, man, unde..."
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,"[wow, !, what, a, movie, ., it, 's, everything..."
1996,pos,"richard gere can be a commanding actor , but h...","[richard, gere, can, be, a, commanding, actor,..."
1997,pos,"glory--starring matthew broderick , denzel was...","[glory, --, starring, matthew, broderick, ,, d..."
1998,pos,steven spielberg's second epic film on world w...,"[steven, spielberg, 's, second, epic, film, on..."


# ข้อ 2
จงเขียนโปรแกรมเพื่อทำ stemming จากคำที่ได้จาก column word_token โดยสร้างเป็น column ใหม่ชื่อ stemmed 

In [59]:
from nltk.stem import PorterStemmer 
def stemming(sent):
  """Stem every token of one tokenised document.

  Args:
    sent: iterable of token strings (one row of the `word_token` column).

  Returns:
    A new list holding the Porter stem of each token, in the original order.
  """
  stem_token = PorterStemmer().stem
  return list(map(stem_token, sent))
# Stem each review's token list and store the result in a new column.
# NOTE(review): the task statement asks for a column named `stemmed`; the code
# uses `Stemmed`, and the stop-word cell reads it by that name, so it is kept.
sample = sample.assign(Stemmed=sample['word_token'].apply(stemming))
sample

Unnamed: 0,target,document,word_token,Stemmed
0,neg,"plot : two teen couples go to a church party ,...","[plot, :, two, teen, couples, go, to, a, churc...","[plot, :, two, teen, coupl, go, to, a, church,..."
1,neg,the happy bastard's quick movie review \ndamn ...,"[the, happy, bastard, 's, quick, movie, review...","[the, happi, bastard, 's, quick, movi, review,..."
2,neg,it is movies like these that make a jaded movi...,"[it, is, movies, like, these, that, make, a, j...","[it, is, movi, like, these, that, make, a, jad..."
3,neg,""" quest for camelot "" is warner bros . ' firs...","[``, quest, for, camelot, ``, is, warner, bros...","[``, quest, for, camelot, ``, is, warner, bro,..."
4,neg,synopsis : a mentally unstable man undergoing ...,"[synopsis, :, a, mentally, unstable, man, unde...","[synopsi, :, a, mental, unstabl, man, undergo,..."
...,...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,"[wow, !, what, a, movie, ., it, 's, everything...","[wow, !, what, a, movi, ., it, 's, everyth, a,..."
1996,pos,"richard gere can be a commanding actor , but h...","[richard, gere, can, be, a, commanding, actor,...","[richard, gere, can, be, a, command, actor, ,,..."
1997,pos,"glory--starring matthew broderick , denzel was...","[glory, --, starring, matthew, broderick, ,, d...","[glori, --, star, matthew, broderick, ,, denze..."
1998,pos,steven spielberg's second epic film on world w...,"[steven, spielberg, 's, second, epic, film, on...","[steven, spielberg, 's, second, epic, film, on..."


# ข้อ 3
จงเขียนโปรแกรมเพื่อตัด stopword จากคำที่ได้จาก column stemmed โดยสร้างเป็น column ใหม่ชื่อ stopword 

In [37]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [63]:
def getstop(Name, stop_words=None):
  """Drop stop words from one tokenised document.

  Args:
    Name: iterable of token strings (e.g. one row of the `Stemmed` column).
    stop_words: optional collection of tokens to remove. When omitted, the
      NLTK English stop-word list is used, exactly as before. Passing a
      precomputed (or pre-stemmed) collection avoids re-reading the corpus
      on every call and lets the caller control what is filtered.

  Returns:
    A new list with the tokens of `Name` that are not stop words, in their
    original order.
  """
  if stop_words is None:
    stop_words = stopwords.words('english')
  # Matching is by exact string equality, so stemmed tokens may not match an
  # unstemmed stop list. A set keeps each membership check cheap.
  excluded = set(stop_words)
  return [w for w in Name if w not in excluded]

# Filter stop words out of each stemmed token list and store the result in a new column.
# NOTE(review): the task statement asks for a column named `stopword`; the code
# names it `stop_words`.
sample = sample.assign(stop_words=sample['Stemmed'].apply(getstop))
sample

Unnamed: 0,target,document,word_token,Stemmed,stop_words
0,neg,"plot : two teen couples go to a church party ,...","[plot, :, two, teen, couples, go, to, a, churc...","[plot, :, two, teen, coupl, go, to, a, church,...","[plot, :, two, teen, coupl, go, church, parti,..."
1,neg,the happy bastard's quick movie review \ndamn ...,"[the, happy, bastard, 's, quick, movie, review...","[the, happi, bastard, 's, quick, movi, review,...","[happi, bastard, 's, quick, movi, review, damn..."
2,neg,it is movies like these that make a jaded movi...,"[it, is, movies, like, these, that, make, a, j...","[it, is, movi, like, these, that, make, a, jad...","[movi, like, make, jade, movi, viewer, thank, ..."
3,neg,""" quest for camelot "" is warner bros . ' firs...","[``, quest, for, camelot, ``, is, warner, bros...","[``, quest, for, camelot, ``, is, warner, bro,...","[``, quest, camelot, ``, warner, bro, ., ', fi..."
4,neg,synopsis : a mentally unstable man undergoing ...,"[synopsis, :, a, mentally, unstable, man, unde...","[synopsi, :, a, mental, unstabl, man, undergo,...","[synopsi, :, mental, unstabl, man, undergo, ps..."
...,...,...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,"[wow, !, what, a, movie, ., it, 's, everything...","[wow, !, what, a, movi, ., it, 's, everyth, a,...","[wow, !, movi, ., 's, everyth, movi, :, funni,..."
1996,pos,"richard gere can be a commanding actor , but h...","[richard, gere, can, be, a, commanding, actor,...","[richard, gere, can, be, a, command, actor, ,,...","[richard, gere, command, actor, ,, 's, alway, ..."
1997,pos,"glory--starring matthew broderick , denzel was...","[glory, --, starring, matthew, broderick, ,, d...","[glori, --, star, matthew, broderick, ,, denze...","[glori, --, star, matthew, broderick, ,, denze..."
1998,pos,steven spielberg's second epic film on world w...,"[steven, spielberg, 's, second, epic, film, on...","[steven, spielberg, 's, second, epic, film, on...","[steven, spielberg, 's, second, epic, film, wo..."
