In [151]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score
import nltk
from nltk.corpus import stopwords
import re

In [None]:
# Read the three tab-separated review files; row 0 of each file is the header.
# NOTE(review): the labeled and test files are parsed with quoting=2 while the
# unlabeled file uses quoting=3 -- confirm that this difference is intentional.
labeled = pd.read_csv(
    "labeledTrainData.tsv",
    delimiter="\t",
    header=0,
    quoting=2,
)
test = pd.read_csv(
    "testData.tsv",
    delimiter="\t",
    header=0,
    quoting=2,
)
unlabeled = pd.read_csv(
    "unlabeledTrainData.tsv",
    delimiter="\t",
    header=0,
    quoting=3,
)

In [112]:
from nltk.stem import WordNetLemmatizer

# Despite its name, `stemmer` holds a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()


def review_to_words(raw_review):
    """Turn one raw review into a single space-separated string of tokens.

    Steps, in order:
      1. Parse the review with BeautifulSoup (lxml parser) and keep only
         the text content.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Pass each token through ``stemmer.lemmatize``.

    Parameters
    ----------
    raw_review : str
        The review text, possibly containing markup.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    plain_text = BeautifulSoup(raw_review, "lxml").get_text()
    alpha_only = re.sub("[^a-zA-Z]", " ", plain_text)
    tokens = alpha_only.lower().split()
    # No stop-word filtering happens here; every token is kept.
    lemmas = (stemmer.lemmatize(token) for token in tokens)
    return " ".join(lemmas)

In [113]:
# Clean every labeled review with review_to_words, keeping the row order.
labeled_reviews = [review_to_words(review) for review in labeled['review']]

In [7]:
# Clean every unlabeled review with review_to_words, keeping the row order.
unlabeled_reviews = [review_to_words(review) for review in unlabeled['review']]

In [115]:
# Clean every test review with review_to_words, keeping the row order.
test_reviews = [review_to_words(review) for review in test['review']]

In [144]:
# Bag-of-words counts for the cleaned labeled reviews, with the vocabulary
# capped at 5000 features; the sparse result is expanded to a dense array.
vectorizer = CountVectorizer(max_features=5000)
labeled_reviews_features = (
    vectorizer
    .fit_transform(labeled_reviews)
    .toarray()
)

In [146]:
# Encode the test reviews with the vectorizer that was already fitted on the
# labeled reviews. Fitting a fresh CountVectorizer on the test set would learn
# a different vocabulary, so column i of the test matrix would no longer refer
# to the same word as column i of the training matrix and any model trained on
# the labeled features would be scoring misaligned inputs.
test_reviews_features = vectorizer.transform(test_reviews).toarray()

In [148]:
# Train a logistic regression with default settings on the labeled
# bag-of-words features, then predict a label for every test review.
lr_model = LogisticRegression()
lr_model.fit(
    labeled_reviews_features,
    labeled["sentiment"],
)
pred = lr_model.predict(test_reviews_features)

In [149]:
# Pair each test id with its predicted label and write the result to
# 'submit-log.csv' without the DataFrame index.
result_data = zip(test['id'], pred)
submission = pd.DataFrame(result_data, columns=['id', 'sentiment'])
submission.to_csv('submit-log.csv', index=False)

In [1]:
import pip

# Record the environment: print a sorted list of "name==version" strings,
# one per distribution reported by pip.
# NOTE(review): pip.get_installed_distributions is a pip-internal API --
# verify that it exists in the pip version installed in this environment.
installed_packages = pip.get_installed_distributions()
installed_packages_list = sorted(
    "%s==%s" % (dist.key, dist.version) for dist in installed_packages
)
print(installed_packages_list)


['abstract-rendering==0.5.1', 'alabaster==0.7.7', 'anaconda-client==1.2.2', 'appnope==0.1.0', 'appscript==1.0.1', 'argcomplete==1.0.0', 'astropy==1.1.1', 'babel==2.2.0', 'backports-abc==0.4', 'backports.ssl-match-hostname==3.4.0.2', 'beautifulsoup4==4.4.1', 'bitarray==0.8.1', 'blaze==0.9.0', 'bokeh==0.11.0', 'boto==2.39.0', 'bottleneck==1.0.0', 'bz2file==0.98', 'cdecimal==2.3', 'cffi==1.2.1', 'clyent==1.2.0', 'colorama==0.3.6', 'conda-build==1.19.0', 'conda-env==2.4.5', 'conda==4.0.5', 'configobj==5.0.6', 'cryptography==1.0.2', 'cycler==0.9.0', 'cython==0.23.4', 'cytoolz==0.7.5', 'datashape==0.5.0', 'dawg-python==0.7.2', 'decorator==4.0.9', 'docopt==0.6.2', 'docutils==0.12', 'dynd==f641248', 'enum34==1.1.2', 'et-xmlfile==1.0.1', 'fastcache==1.0.2', 'flask==0.10.1', 'funcsigs==0.4', 'futures==3.0.3', 'gensim==0.12.4', 'gevent-websocket==0.9.5', 'gevent==1.0.2', 'greenlet==0.4.9', 'grin==1.2.1', 'h5py==2.5.0', 'happybase==0.9', 'httpretty==0.8.14', 'idna==2.0', 'ipaddress==1.0.14', 'ipyk

In [45]:
# Random forests are stochastic (bootstrap samples and feature subsets), so
# pin the seed: without it every run produces a different model and a
# different submission file.
FOREST_SEED = 42

# Train a 100-tree forest on the labeled bag-of-words features and predict
# a label for every test review.
forest = RandomForestClassifier(n_estimators=100, random_state=FOREST_SEED)
forest = forest.fit(labeled_reviews_features, labeled["sentiment"])
result_forest = forest.predict(test_reviews_features)

In [81]:
# Pair each test id with the forest's prediction. The pairs are stored as a
# list because this variable is consumed by a later cell: on Python 3 a bare
# zip() is a one-shot iterator and would be empty if that cell were re-run.
result_data = list(zip(test['id'], result_forest))

In [None]:
# Write the (id, sentiment) pairs held in result_data to 'submit_forest.csv'
# without the DataFrame index.
submission_forest = pd.DataFrame(result_data, columns=['id', 'sentiment'])
submission_forest.to_csv('submit_forest.csv', index=False)

In [46]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the labeled bag-of-words features
# and predict a label for every test review.
lr = LogisticRegression()
lr.fit(
    labeled_reviews_features,
    labeled["sentiment"],
)
result_lr = lr.predict(test_reviews_features)

# Write the (id, sentiment) pairs to 'submit_lr.csv' without the index.
result_data = zip(test['id'], result_lr)
submission_lr = pd.DataFrame(result_data, columns=['id', 'sentiment'])
submission_lr.to_csv('submit_lr.csv', index=False)

In [47]:
# Word 1- to 3-gram counts computed directly on the raw (uncleaned) review
# text, with the vocabulary capped at 10000 features.
# ngram_range is given as a tuple, which is the documented form.
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=10000)

# Learn the vocabulary from the labeled reviews only ...
labeled_reviews_dirty = vectorizer.fit_transform(labeled['review']).toarray()

# ... and encode the test reviews with that same vocabulary. Calling
# fit_transform here would re-learn the vocabulary from the test set, so the
# test columns would not line up with the columns the model is trained on.
test_reviews_dirty = vectorizer.transform(test['review']).toarray()

In [48]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the features built from the raw
# review text and predict a label for every test review.
lr = LogisticRegression()
lr.fit(
    labeled_reviews_dirty,
    labeled["sentiment"],
)
result_lr = lr.predict(test_reviews_dirty)

# Write the (id, sentiment) pairs to 'submit-lr-dirty.csv' without the index.
result_data = zip(test['id'], result_lr)
submission_lr_dirty = pd.DataFrame(result_data, columns=['id', 'sentiment'])
submission_lr_dirty.to_csv('submit-lr-dirty.csv', index=False)

In [49]:
# Load the English Punkt tokenizer resource through NLTK's data loader.
# presumably this is the object passed as the `tokenizer` argument of
# review_to_sentences below -- confirm against the callers.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Split a review into sentences, each represented as a list of words
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Break a review into sentences and convert each one to a word list.

    Parameters
    ----------
    review : str
        The review text; surrounding whitespace is stripped before it is
        handed to the tokenizer.
    tokenizer : object
        Anything exposing ``tokenize(text)`` that returns the sentences of
        ``text`` as strings.
    remove_stopwords : bool, optional
        Forwarded unchanged to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list
        One entry per non-empty sentence, each entry being whatever
        ``review_to_wordlist`` returns for that sentence.
    """
    # NOTE(review): review_to_wordlist is not defined in the visible cells of
    # this notebook -- confirm that it is defined before this function is called.
    raw_sentences = tokenizer.tokenize(review.strip())

    # Empty sentences are dropped; every other one is converted in order.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]


In [50]:
# Sanity check: display the (rows, columns) shape of the test frame.
test.shape

(25000, 2)