In [43]:
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

train = pd.read_csv('./labeledTrainData.tsv', delimiter='\t', header=0, quoting=3)

In [44]:
def raw_review_to_words(text):
    """
    Function to convert a raw review to a list of words,
    handling punctuation, numbers, HTML tags and markup, and
    stop words.
    """
    review_text = BeautifulSoup(text).get_text()
    numbers_replaced = re.sub("[0-9]", "NUM", review_text)
    punctuation_replaced = re.sub("[^a-zA-Z]", " ", numbers_replaced)
    lower_case = punctuation_replaced.lower()
    words = lower_case.split()
    stop_words = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stop_words]
    return ' '.join(meaningful_words)

def train_model(X, y):
    """
    Train (Random Forest) model
    """
    model = RandomForestClassifier(n_estimators=100)
    model = model.fit(X, y)    
    return model

def generate_submission(filename):
    """
    Generate Kaggle submission file in *.csv format
    """
    train['review_clean'] = train['review'].apply(raw_review_to_words)

    vectorizer = CountVectorizer(max_features=5000)
    train_data_features = vectorizer.fit_transform(train['review_clean'])
    train_data_features = train_data_features.toarray() 
    
    model = train_model(train_data_features, train['sentiment'])
    
    test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
    test['review_clean'] = test['review'].apply(raw_review_to_words)
    
    test_data_features = vectorizer.transform(test['review_clean'])
    test_data_features = test_data_features.toarray()
    
    results = model.predict(test_data_features)
    submission = pd.DataFrame({'id': test['id'], 'sentiment': results})
    submission.to_csv(filename, index=False, quoting=3)
    
    return submission

generate_submission('submission.csv')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1
5,"""2913_8""",0
6,"""4396_1""",0
7,"""395_2""",1
8,"""10616_1""",0
9,"""9074_9""",1
