##### tf-idf를 통해 벡터화해보고, k-fold를 사용해서 cross_validation을 진행해보자

In [1]:
train = pd.read_csv("../../labeledTrainData.tsv", delimiter='\t', quoting=3)
test = pd.read_csv("../../testData.tsv", delimiter='\t', quoting=3)

In [2]:
# 데이터 전처리는 html코드 제거까지만
# 여기에 병렬처리하도록 멀티프로세싱 코드 추가
from KaggleWord2VecUtility import KaggleWord2VecUtility
from bs4 import BeautifulSoup

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    return review_text

In [3]:
train['review_clean'] = train['review'].map(review_to_words)

In [4]:
test['review_clean'] = test['review'].map(review_to_words)

In [5]:
x_train = train['review_clean']
x_test = test['review_clean']

In [8]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Juno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import words

vectorizer = CountVectorizer(analyzer='word',
                             lowercase= True,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             min_df = 2,
                             ngram_range = (1,3),
                             vocabulary = set(words.words()),
                             max_features = 90000)

In [10]:
pipeline = Pipeline([('vect', vectorizer),('tfidf', TfidfTransformer(smooth_idf= False))])
pipeline

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=90000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
       ...('tfidf', TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True))])

In [11]:
x_train_tfidf_vector = pipeline.fit_transform(x_train)

  idf = np.log(float(n_samples) / df) + 1.0


In [12]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

235892


['A',
 'Aani',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'Ababdeh']

In [13]:
x_test_tfidf_vector = pipeline.fit_transform(x_test)

  idf = np.log(float(n_samples) / df) + 1.0


In [14]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
forest = RandomForestClassifier(n_estimators=100, random_state=2018)
forest.fit(x_train_tfidf_vector, train['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [19]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=5, shuffle=True, random_state=2018)
score = np.mean(cross_val_score(forest, x_train_tfidf_vector, train['sentiment'], cv=k_fold, scoring='roc_auc', n_jobs=-1))

In [20]:
score

0.9206811206356388

In [21]:
result = forest.predict(x_test_tfidf_vector)

In [22]:
output = pd.DataFrame(data={'id':test['id'], "sentiment":result})
output.tail()

Unnamed: 0,id,sentiment
24995,"""2155_10""",1
24996,"""59_10""",1
24997,"""2531_1""",1
24998,"""7772_8""",1
24999,"""11465_10""",1


In [23]:
output.to_csv("tutorial4_tfidf_{0:.5f}".format(score), index=False, quoting=3)