In [123]:
# Import Files
import pandas as pd
import numpy as np
import nltk
import string

# Download files
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [143]:
# Read the data (Pandas)

reviews = pd.read_csv('/project/weaklabel_sampledata.csv', header=0, encoding='unicode_escape', nrows=1000)
reviews.head()

Unnamed: 0,Freshness,Review
0,0,Parental Content Review
1,1,Director Wayne Wang proves with Maid in Manha...
2,0,the lack of dramatic development doesn't leav...
3,1,A riveting documentary that explains educatio...
4,0,"The problem with The Informant!, aside from t..."


In [145]:
# (Pandas)

# Turn 'Review' column into list
reviews_list = reviews['Review'].astype(str).tolist()

# Clean the data and tokenize it
reviews_string = ''
reviews_string = reviews_string.join(reviews_list).lower()
reviews_string = reviews_string.replace('"', '').replace("'", '').replace('\n','').replace(',','').replace('[','').replace(']','')
tokens = reviews_string.split()

# Check how many words we have
len(tokens)

107726

In [146]:
from nltk.corpus import stopwords

# Remove punctuation from each token
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]

# Remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

# Filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]

# Filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

len(tokens)

60915

In [147]:
from operator import itemgetter
from collections import Counter

# Count how many times each word appears
count = Counter(tokens).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

In [148]:
from nltk import LancasterStemmer

# initialize Lancaster Stemmer
LS = LancasterStemmer()
lemmatized = []
for l in tokens: lemmatized.append(LS.stem(l))

# Count how many times each word appears
count = Counter(lemmatized).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

# Select 5000 most frequent words
top5000 = [i[0] for i in sorted_count[:5000]]

In [149]:
from nltk.tokenize import word_tokenize

review = []

for sentence in reviews_list :
    sentence = sentence.lower()
    sentence = sentence.replace('.', '').replace("'", '').replace('\n','').replace(',','')
    token_sentence = word_tokenize(sentence)
    
    token_words = []
    for token_word in token_sentence:
        token_word = LS.stem(token_word)
        token_words.append(token_word)
    review.append(token_words)
len(review)

5000

In [150]:
# (Pandas)

word_matrix = []

for i in review: word_matrix.append([1 if j in i else 0 for j in top5000])
features = pd.DataFrame(word_matrix, columns = top5000, index = reviews.index)
features['freshness']=reviews['Freshness']

# Sanity Check
features

Unnamed: 0,film,movy,lik,on,mak,act,ev,story,charact,feel,...,showbo,scrappy,withinsweaty,minorkey,glasss,medy,yel,denzel,behemo,freshness
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [151]:

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

train, test = train_test_split(features, test_size = 0.1)

cols = train.columns[:-1]

lr = LogisticRegression()
gnb = MultinomialNB()

models = [lr,gnb]

for model in models:
    model.fit(train[cols], train['freshness'])
    y_pred = model.predict(test[cols])

    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(test.shape[0], (test["freshness"] != y_pred).sum(),
                  100*(1-(test["freshness"] != y_pred).sum()/test.shape[0]))
         )

Number of mislabeled points out of a total 500 points : 172, performance 65.60%
Number of mislabeled points out of a total 500 points : 156, performance 68.80%


In [144]:
# Read the data (Pandas)
reviews = pd.read_csv('/project/weaklabel_sampledata.csv', header=0, encoding='unicode_escape', nrows=5000)

# Turn 'Review' column into list
reviews_list = reviews['Review'].astype(str).tolist()

# Clean the data and tokenize it
reviews_string = ''
reviews_string = reviews_string.join(reviews_list).lower()
reviews_string = reviews_string.replace('"', '').replace("'", '').replace('\n','').replace(',','').replace('[','').replace(']','')
tokens = reviews_string.split()

# Remove punctuation from each token
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]

# Remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

# Filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]

# Filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

# Count how many times each word appears
count = Counter(tokens).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

lemmatized = []
for l in tokens: lemmatized.append(LS.stem(l))

# Count how many times each word appears
count = Counter(lemmatized).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

# Select 5000 most frequent words
top5000 = [i[0] for i in sorted_count[:5000]]

review = []

for sentence in reviews_list :
    sentence = sentence.lower()
    sentence = sentence.replace('.', '').replace("'", '').replace('\n','').replace(',','')
    token_sentence = word_tokenize(sentence)
    
    token_words = []
    for token_word in token_sentence:
        token_word = LS.stem(token_word)
        token_words.append(token_word)
    review.append(token_words)

word_matrix = []

for i in review: word_matrix.append([1 if j in i else 0 for j in top5000])
features = pd.DataFrame(word_matrix, columns = top5000, index = reviews.index)
features['freshness']=reviews['Freshness']    

train, test = train_test_split(features, test_size = 0.1)

cols = train.columns[:-1]

lr = LogisticRegression()
gnb = MultinomialNB()

models = [lr,gnb]

for model in models:
    model.fit(train[cols], train['freshness'])
    y_pred = model.predict(test[cols])

    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(test.shape[0], (test["freshness"] != y_pred).sum(),
                  100*(1-(test["freshness"] != y_pred).sum()/test.shape[0])))

Number of mislabeled points out of a total 500 points : 153, performance 69.40%
Number of mislabeled points out of a total 500 points : 148, performance 70.40%
