# Kaggle: Bag of Words Meets Bags of Popcorn

In [1]:
import os

import graphlab as gl
from IPython.display import display
from IPython.display import Image

# Send GraphLab Canvas output (used by the `.show()` calls in this notebook)
# to the 'ipynb' target - presumably inline rendering in the notebook rather
# than a separate browser tab; confirm against the GraphLab Canvas docs.
gl.canvas.set_target('ipynb')

# Load Data

In [2]:
# Folder holding the Kaggle "Bags of Popcorn" TSV files. Set the
# BAGS_OF_POPCORN_DIR environment variable to point somewhere else instead of
# editing the notebook; the fallback is the original author's local folder.
DATA_DIR = os.environ.get(
    "BAGS_OF_POPCORN_DIR",
    "/Users/marvinbertin/graphlab_data/Bags-of-Popcorn/LabeledData")

# Labelled reviews (id, sentiment, review) and unlabelled reviews (id, review).
traindata_path = os.path.join(DATA_DIR, "labeledTrainData.tsv")
testdata_path = os.path.join(DATA_DIR, "testData.tsv")

In [5]:
# Parse the labelled training TSV into an SFrame: the first row is the header,
# fields are tab-separated and double quotes delimit quoted fields.
movie_review = gl.SFrame.read_csv(
    traindata_path,
    delimiter='\t',
    header=True,
    quote_char='"',
    column_type_hints={'id': str, 'sentiment': int, 'review': str})

PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/LabeledData/labeledTrainData.tsv
PROGRESS: Parsing completed. Parsed 100 lines in 0.499896 secs.
PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/LabeledData/labeledTrainData.tsv
PROGRESS: Parsing completed. Parsed 25000 lines in 0.889023 secs.


In [6]:
# Bare expression: the notebook shows the SFrame's default preview.
movie_review

id,sentiment,review
5814_8,1,With all this stuff going down at the moment with ...
2381_9,1,"""The Classic War of the Worlds"" by Timothy Hines ..."
7759_3,0,The film starts with a manager (Nicholas Bell) ...
3630_4,0,It must be assumed that those who praised this ...
9495_8,1,Superbly trashy and wondrously unpretentious ...
8196_8,1,I dont know why people think this is such a bad ...
7166_2,0,"This movie could have been very good, but c ..."
10633_1,0,I watched this video at a friend's house. I'm glad ...
319_1,0,"A friend of mine bought this film for £1, and ..."
8713_10,1,<br /><br />This movie is full of references. Like ...


# Data Exploration

In [7]:
# Open the GraphLab Canvas view of the whole SFrame (the rendering target was
# chosen with gl.canvas.set_target in the first cell).
movie_review.show()

# Example of a Movie Review

In [8]:
# Integer indexing returns that row as a dict (column name -> value), which
# lets us read the first review in full.
movie_review[0]

{'id': '5814_8',
 'review': "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature 

# Feature Engineering

In [9]:
# Group rows by 'id' and gather the review text of each group into a single
# list-valued column named 'reviews'.
reviewPermovie = movie_review.groupby(
    'id',
    {'reviews': gl.aggregate.CONCAT('review')})

In [10]:
# Count how many reviews were collected for each id. `item_length()` is the
# built-in per-element length for list-typed SArrays, so no Python callback
# (`apply(len)`) has to be run for every row; the resulting values are the same.
# The call is the cell's last expression, so the updated frame is displayed.
reviewPermovie.add_column(reviewPermovie['reviews'].item_length(), name='num_reviews')

id,reviews,num_reviews
1241_7,"[This movie is very entertaining, and any ...",1
8892_2,"[Have to admit, this version disgraces ...",1
9090_1,"[A propaganda film for the Palestinian ""cause"". ...",1
1849_7,"[""Antwone Fisher"" tells of a young black U.S. ...",1
8504_8,"[You play as B.J. Blazkowicz, a US secret ...",1
12033_8,"[When I was flicking through the TV Guide, ...",1
7733_9,[Foley's noir quality in this saturated and ...,1
1641_1,[This film is not funny. It is not entertaining. ...,1
9234_4,"[Remade today, this film would be a very creepy, ...",1
5586_4,[Man To Man tries hard to be a good movie: it has ...,1


In [30]:
# Turns out there is just one review per ID: the largest per-ID review count
# is 1. This checks every ID at once instead of a single hand-picked one.
# Relies on the 'num_reviews' column added to reviewPermovie in the cell above.
reviewPermovie['num_reviews'].max()

1

## Unigram

In [13]:
# Bag-of-words features: for each review, a dict mapping every 1-gram (single
# word) to the number of times it occurs in that review.
movie_review['1gram'] = gl.text_analytics.count_ngrams(movie_review['review'], n=1)

In [14]:
# 24,932 words were reduced to 76,058 unigrams
# NOTE(review): these figures come from the author's run and cannot be checked
# from the notebook itself; "reduced" reads oddly because the second number is
# larger - presumably 24,932 unique reviews yield 76,058 distinct unigrams.
# Confirm against the Canvas summary before relying on them.
movie_review[['review', '1gram']].show()

# Split Data into Train and Test Sets

In [33]:
# Hold out roughly 20% of the reviews for testing; the fixed seed keeps the
# split the same on every run.
train_set, test_set = movie_review.random_split(fraction=0.8, seed=5)

# Unigram Sentiment Classifier

In [34]:
# Train a sentiment classifier on the unigram counts only. As the log below
# shows, GraphLab holds out 5% of train_set for validation and returns
# whichever of LogisticClassifier / SVMClassifier validates best.
# NOTE(review): no validation_set or seed is passed here, so the 5% hold-out
# presumably differs between runs - confirm if exact reproducibility matters.
model_1 = gl.classifier.create(train_set, target='sentiment', features=['1gram'])

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: LogisticClassifier, SVMClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.
PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19074
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 1
PROGRESS: Number of unpacked features : 68091
PROGRESS: Number of coefficients    : 68092
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+-------------------+---------------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
PROGRESS: +-----

In [35]:
# Score the unigram model on the held-out test_set.
result1 = model_1.evaluate(test_set)

In [37]:
# Scores are already pretty good with a simple unigram model
# (about 0.87 accuracy and F1 on the held-out test set).
result1

{'accuracy': 0.8725091500610004, 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  378  |
 |      1       |        0        |  249  |
 |      1       |        1        |  2144 |
 |      0       |        0        |  2147 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns], 'f1_score': 0.8724313326551374, 'precision': 0.8501189532117367, 'recall': 0.8959465106560802}

# Bigram

In [40]:
# Same idea as the '1gram' column, but counting pairs of adjacent words (n=2).
movie_review['2gram'] = gl.text_analytics.count_ngrams(movie_review['review'], n=2)

In [46]:
# Peek at the first row to confirm both n-gram columns are present.
movie_review.head(1)

id,sentiment,review,1gram,2gram
5814_8,1,With all this stuff going down at the moment with ...,"{'all': 4, 'moonwalker': 2, 'just': 3, 'dance' ...","{'kiddy bad': 1, 'true is': 1, 'started ..."


# Unigram & Bigram Sentiment Classifier

In [47]:
# Re-split with the same fraction and seed so that the '2gram' column is
# present in both partitions (the earlier split was taken before that column
# was added to movie_review).
train_set, test_set = movie_review.random_split(fraction=0.8, seed=5)

# Train on unigram and bigram counts together, then score on the held-out set.
model_2 = gl.classifier.create(
    train_set,
    target='sentiment',
    features=["1gram", "2gram"])
result2 = model_2.evaluate(test_set)

# Last expression of the cell: display the metrics dict.
result2

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: LogisticClassifier, SVMClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.
PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19039
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 2
PROGRESS: Number of unpacked features : 1206795
PROGRESS: Number of coefficients    : 1206796
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+-------------------+---------------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
PROGRESS: +-

{'accuracy': 0.8802358682391216,
 'auc': 0.9394204217747356,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  390  |
 |      1       |        0        |  199  |
 |      1       |        1        |  2194 |
 |      0       |        0        |  2135 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.881655615832831,
 'log_loss': 0.6130621038247112,
 'precision': 0.8490712074303406,
 'recall': 0.9168407856247388,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+------+------+
 | threshold |      fpr       |      tpr       |  p   |  n   |
 +-----------+----------------+----------------+------+------+
 |    0.0    |      1.0       |     

In [32]:
# Build the final training frame from all 25,000 labelled reviews
# (no train/test split this time).
train_data = gl.SFrame.read_csv(
    traindata_path,
    delimiter='\t',
    header=True,
    quote_char='"',
    column_type_hints={'id': str, 'sentiment': int, 'review': str})

# Unigram and bigram count dictionaries, one per review.
train_data['1grams features'] = gl.text_analytics.count_ngrams(train_data['review'], n=1)
train_data['2grams features'] = gl.text_analytics.count_ngrams(train_data['review'], n=2)

PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/labeledTrainData.tsv
PROGRESS: Parsing completed. Parsed 100 lines in 0.548694 secs.
PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/labeledTrainData.tsv
PROGRESS: Parsing completed. Parsed 25000 lines in 0.949808 secs.


In [33]:
# Fit the submission classifier on the full labelled set, using both n-gram
# feature columns.
cls = gl.classifier.create(
    train_data,
    target='sentiment',
    features=['1grams features', '2grams features'])

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: LogisticClassifier, SVMClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.
PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 23766
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 2
PROGRESS: Number of unpacked features : 1405231
PROGRESS: Number of coefficients    : 1405232
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+-------------------+---------------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
PROGRESS: +-

In [34]:
# Load the unlabelled Kaggle test reviews (columns: id, review).
test_data = gl.SFrame.read_csv(
    testdata_path,
    delimiter='\t',
    header=True,
    quote_char='"',
    column_type_hints={'id': str, 'review': str})

# Feature columns must carry the same names the classifier was trained on.
test_data['1grams features'] = gl.text_analytics.count_ngrams(test_data['review'], n=1)
test_data['2grams features'] = gl.text_analytics.count_ngrams(test_data['review'], n=2)

PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/testData.tsv
PROGRESS: Parsing completed. Parsed 100 lines in 0.50204 secs.
PROGRESS: Finished parsing file /Users/marvinbertin/graphlab_data/Bags-of-Popcorn/testData.tsv
PROGRESS: Parsing completed. Parsed 25000 lines in 0.927375 secs.


In [37]:
# Predict the sentiment of each review in the test dataset: take the 'class'
# column of the classify() result, cast it to int and store it as 'sentiment'.
test_data['sentiment'] = cls.classify(test_data)['class'].astype(int)
# Display the frame with the new prediction column.
test_data

id,review,1grams features,2grams features,sentiment
12311_10,Naturally in a film who's main themes are of ...,"{'show': 4, 'themes': 1, 'fear': 1, 'perfect': 1, ...","{'is rated': 1, 'ones however': 1, 'and ...",1
8348_2,This movie is a disaster within a disaster film. ...,"{'all': 2, 'just': 1, 'tv': 1, 'actually': 2, ...","{'if you': 1, 'all sense': 1, 'meaningful ...",0
5828_4,"All in all, this is a movie for kids. We sa ...","{'all': 2, 'just': 1, 'wishing': 1, 'go': 1, ...","{'was twice': 1, 'was so': 1, 'great that': 1, ...",1
7186_2,Afraid of the Dark left me with the impression ...,"{'and': 4, 'impression': 1, 'all': 1, ...","{'i wanted': 1, 'mediocrity had': 1, ...",0
12128_7,A very accurate depiction of small time mob life ...,"{'being': 1, 'accurate': 1, 'years': 1, 'mob': 1, ...","{'if you': 1, 'depiction of': 1, 'but these': 1, ...",1
2913_8,"...as valuable as King Tut's tomb! (OK, maybe ...","{'less': 1, 'being': 1, 'valuable': 2, 'years': ...","{'if you': 1, 'a film': 1, 'but it': 1, 'at t ...",1
4396_1,This has to be one of the biggest misfires ...,"{'and': 5, 'would': 2, 'spoil': 1, 'just': 1, ...","{'by the': 1, 'if i': 1, 'movie a': 1, 'it wou ...",0
395_2,"This is one of those movies I watched, and ...","{'just': 1, 'being': 1, 'over': 1, 'dollar': 1, ...","{'job she': 1, 'nor have': 1, 'll screw': 1, ...",0
10616_1,The worst movie i've seen in years (and i've se ...,"{'and': 1, 'this': 2, 'disgrace': 1, 've': 2, ...","{'movie i': 1, 'they recommended': 1, 'piece ...",0
9074_9,"Five medical students (Kevin Bacon, David ...","{'all': 2, 'sci': 1, 'they': 5, 'being': 2, ...","{'that director': 1, 'actors make': 1, 'the ...",1


In [39]:
# Keep only the id and predicted sentiment columns and write them out as a
# CSV file for submission.
test_data[['id', 'sentiment']].save(
    "/Users/marvinbertin/graphlab_data/Bags-of-Popcorn/pred1.csv",
    format="csv")