In [1]:
import os

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Root folder of the raw datasets. It is resolved against the working directory
# at the moment this cell runs, so the notebook must be started from the
# directory that contains 'data/raw'. load_data() reads the 'aclimdb'
# sub-folder underneath it.
DATA_DIR = os.path.join(os.getcwd(), 'data/raw')

In [4]:
def load_data(data_dir=None):
    """Read the review files from disk into train/test text and label lists.

    The expected layout is ``<data_dir>/aclimdb/<split>/<label>/<file>`` with
    ``split`` in {'train', 'test'} and ``label`` in {'neg', 'pos'}; every file
    in a label folder is read as one document.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains the ``aclimdb`` folder. When omitted, the
        module-level ``DATA_DIR`` is used.

    Returns
    -------
    X_train, X_test : list of str
        Raw document texts of the train and test split.
    y_train, y_test : list of str
        The matching labels ('neg' or 'pos'), aligned index-by-index with
        ``X_train`` / ``X_test``.

    Raises
    ------
    OSError
        If one of the split/label folders is missing or a file is unreadable.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print('Now Loading...')
    res = {'train': {'label': [], 'data': []},
           'test': {'label': [], 'data': []}}
    for test_or_train in ['test', 'train']:
        for label in ['neg', 'pos']:
            dir_name = os.path.join(data_dir, 'aclimdb', test_or_train, label)
            # os.listdir() returns entries in arbitrary order; sorting makes
            # the sample order identical from run to run and across machines.
            for file_name in sorted(os.listdir(dir_name)):
                # Decode explicitly instead of relying on the platform's
                # locale encoding (assumes the corpus is UTF-8 - TODO confirm).
                with open(os.path.join(dir_name, file_name),
                          encoding='utf-8') as f:
                    item = f.read()
                res[test_or_train]['label'].append(label)
                res[test_or_train]['data'].append(item)
    X_train, X_test = res['train']['data'], res['test']['data']
    y_train, y_test = res['train']['label'], res['test']['label']
    print('Complete Loading')
    return X_train, X_test, y_train, y_test

In [3]:
def build_pipeline(random_state=None):
    """Build the bag-of-words -> TF-IDF -> tuned random forest pipeline.

    The final step is a ``GridSearchCV`` over ``RandomForestClassifier``
    (2-fold CV, scored by accuracy, all cores), so fitting the pipeline also
    runs the hyper-parameter search. The search only sees the features
    produced by the two preceding steps; the vectorizer and TF-IDF settings
    themselves are not tuned.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Passed to ``RandomForestClassifier``. The default ``None`` leaves
        the forest unseeded; pass an int to make the fitted forests
        repeatable.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline with steps 'vect', 'tfidf' and 'clf'.
    """
    param_grid = {
        'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
        # 'auto' is intentionally not listed: for RandomForestClassifier it
        # was only an alias of 'sqrt' (so it duplicated that candidate), it
        # was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
        'max_features': ['sqrt', 'log2', None],
    }
    forest_search = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_grid,
        cv=2,
        scoring='accuracy',
        n_jobs=-1,
    )
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', forest_search),
                         ])
    return text_clf

In [5]:
# Load both splits, then fit the full pipeline on the training documents.
# Pipeline.fit returns the pipeline itself, so the call can be chained.
X_train, X_test, y_train, y_test = load_data()
text_clf = build_pipeline().fit(X_train, y_train)

# Predict the held-out test documents with the fitted pipeline.
y_pred = text_clf.predict(X_test)

# Evaluation: overall accuracy followed by the per-class report.
print('Accuracy: {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print(classification_report(y_true=y_test, y_pred=y_pred))

Now Loading...


Complete Loading


Accuracy: 0.84252
             precision    recall  f1-score   support

        neg       0.84      0.85      0.84     12500
        pos       0.85      0.83      0.84     12500

avg / total       0.84      0.84      0.84     25000

