### 1. load Dataset(train, test)

In [1]:
train = pd.read_csv("../labeledTrainData.tsv", delimiter='\t', quoting=3)
train.tail()

Unnamed: 0,id,sentiment,review
24995,"""3453_3""",0,"""It seems like more consideration has gone int..."
24996,"""5064_1""",0,"""I don't believe they made this film. Complete..."
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui..."
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the..."
24999,"""8478_8""",1,"""I saw this movie as a child and it broke my h..."


In [2]:
test = pd.read_csv("../testData.tsv", delimiter="\t", quoting=3)
test.tail()

Unnamed: 0,id,review
24995,"""2155_10""","""Sony Pictures Classics, I'm looking at you! S..."
24996,"""59_10""","""I always felt that Ms. Merkerson had never go..."
24997,"""2531_1""","""I was so disappointed in this movie. I am ver..."
24998,"""7772_8""","""From the opening sequence, filled with black ..."
24999,"""11465_10""","""This is a great horror film for people who do..."


In [24]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

### 2. Data cleaning and Text preprocessing

In [3]:
# html 태그 제거 by BeautifulSoup
from bs4 import BeautifulSoup

# 특수문자 제거 by 정규표현식(re)
import re

# 어간추출, 형태소 분석 by SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer

# 불용어 제거 by stopwords
from nltk.corpus import stopwords

In [4]:
# Preprocessing function 만들기
def review_to_words(raw_review):
    # remove html tag
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    
    # remove special symbols(특수문자 제거)
    letters_only = re.sub('[^a-zA-Z]', " ", review_text)
    
    # 소문자 변환, 토큰화
    words = letters_only.lower().split()
    
    # remove stopwords, list-->set(for speed up)
    stopword = set(stopwords.words('english'))
    meaningful_words = [w for w in words if not w in stopword]
    
    # 어간 추출
    stemmer = PorterStemmer()
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    
    return " ".join(stemming_words)

In [5]:
train['review_cleaning'] = train['review'].apply(review_to_words)
test['review_cleaning'] = test['review'].apply(review_to_words)

### 3. Vectorize, Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline # for speed up

In [28]:
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor = None,
                             stop_words = None,
                             min_df = 2,
                             ngram_range=(1,3),
                             max_features = 20000)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [29]:
pipeline = Pipeline([('vect', vectorizer),
                    ])

In [30]:
train_data_feature = pipeline.fit_transform(train['review_cleaning'])
train_data_feature.shape

(25000, 20000)

In [31]:
train_data_feature = train_data_feature.toarray()

In [37]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

20000


['aag',
 'aaron',
 'ab',
 'abandon',
 'abbey',
 'abbi',
 'abbot',
 'abbott',
 'abc',
 'abduct']

In [35]:
dist = np.sum(train_data_feature, axis=0)
dist

array([26, 48, 22, ..., 59, 40, 23], dtype=int64)

In [43]:
dist.shape

(20000,)

In [42]:
pd.DataFrame(dist.reshape(1,len(dist)), columns=vocab)

Unnamed: 0,aag,aaron,ab,abandon,abbey,abbi,abbot,abbott,abc,abduct,...,zombi bloodbath,zombi film,zombi flick,zombi movi,zone,zoo,zoom,zorro,zu,zucker
0,26,48,22,288,24,30,29,30,125,55,...,23,52,37,89,161,31,71,59,40,23


### 4. Training by RandomForest

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [134]:
forest = RandomForestClassifier(n_estimators=100 ,n_jobs=3, random_state=0)
forest.fit(train_data_feature, train['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=10000, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=3,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [96]:
# cross validation
score = np.mean(cross_val_score(forest, train_data_feature, train['sentiment'], cv=10, scoring='roc_auc'))

In [97]:
score

0.849846464

In [46]:
test['review_cleaning'][0]

'natur film main theme mortal nostalgia loss innoc perhap surpris rate highli older viewer younger one howev craftsmanship complet film anyon enjoy pace steadi constant charact full engag relationship interact natur show need flood tear show emot scream show fear shout show disput violenc show anger natur joyc short stori lend film readi made structur perfect polish diamond small chang huston make inclus poem fit neatli truli masterpiec tact subtleti overwhelm beauti'

In [47]:
# test 데이터에도 적용
test_data_feature = pipeline.fit_transform(test['review_cleaning'])
test_data_feature = test_data_feature.toarray()

In [49]:
test_data_feature

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [48]:
test_data_feature.shape

(25000, 20000)

In [135]:
result = forest.predict(test_data_feature)
result[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [136]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':result})
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",1
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1


In [137]:
output.to_csv("tutorial_1_BOW_n_estimators=500_max_depth=5_max_features=10000.csv", index=False, quoting=3)

In [138]:
output_sentiment = output['sentiment'].value_counts()
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

-23958


1    24479
0      521
Name: sentiment, dtype: int64

성적이...똥망진창...망할