In [6]:
import nltk
import numpy as np
import os
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import accuracy_score

stemmer = SnowballStemmer('english')
def tokenize_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # print(tokens)
    final_tokens = []
    for token in tokens:
#         if re.search('[a-zA-Z]', token):
            # print(token)
        final_tokens.append(token)
    stems = [stemmer.stem(t) for t in final_tokens]

    return final_tokens #stems

path="../dataset/poster_txt"
adventure_list = os.listdir(path + "/Adventure")
documentary_list = os.listdir(path + "/Documentary")
horror_list = os.listdir(path + "/Horror")
romance_list = os.listdir(path + "/Romance")

size = min(len(adventure_list), len(documentary_list), len(horror_list), len(romance_list))
lines = []
label = []
for i in range(size):
    with open(path + "/Adventure/" + adventure_list[i], "r", encoding="utf-8") as fin:
        lines.append(fin.readline())
        label.append('Adventure')
    with open(path + "/Documentary/" + documentary_list[i], "r", encoding="utf-8") as fin:
        lines.append(fin.readline())
        label.append('Documentary')
    with open(path + "/Horror/" + horror_list[i], "r", encoding="utf-8") as fin:
        lines.append(fin.readline())
        label.append('Horror')
    with open(path + "/Romance/" + romance_list[i], "r", encoding="utf-8") as fin:
        lines.append(fin.readline())
        label.append('Romance')






In [7]:
for maxdf in [1.0, 0.9, 0.8]:
    for mindf in [0.3, 0.2, 0.1, 0.0]:
        tfidf_vectorizer = TfidfVectorizer(max_features=20000,  max_df=maxdf, min_df=mindf, tokenizer=tokenize_stem)
        tfidf_matrix = tfidf_vectorizer.fit_transform(lines)

        X_train, X_test, Y_train, Y_test = train_test_split(tfidf_matrix, label, test_size = 0.2)

        mnb= MultinomialNB()
        mnb.fit(X_train, Y_train)

        pred = mnb.predict(X_test)
        accuracy = accuracy_score(Y_test, pred)
        print("Max_df : {:.1f}, Min_df: {:.1f}, Accuracy: {:.4f}".format(maxdf, mindf, accuracy))

Max_df : 1.0, Min_df: 0.3, Accuracy: 0.4467
Max_df : 1.0, Min_df: 0.2, Accuracy: 0.5067
Max_df : 1.0, Min_df: 0.1, Accuracy: 0.5558
Max_df : 1.0, Min_df: 0.0, Accuracy: 0.8208
Max_df : 0.9, Min_df: 0.3, Accuracy: 0.4533
Max_df : 0.9, Min_df: 0.2, Accuracy: 0.5017
Max_df : 0.9, Min_df: 0.1, Accuracy: 0.5350
Max_df : 0.9, Min_df: 0.0, Accuracy: 0.8250
Max_df : 0.8, Min_df: 0.3, Accuracy: 0.4200
Max_df : 0.8, Min_df: 0.2, Accuracy: 0.4733
Max_df : 0.8, Min_df: 0.1, Accuracy: 0.5342
Max_df : 0.8, Min_df: 0.0, Accuracy: 0.8308


In [3]:


for maxdf in [1.0, 0.9, 0.8]:
    for mindf in [0.3, 0.2, 0.1, 0.0]:
        count_vectorizer = CountVectorizer(max_features=50000,  max_df = maxdf, min_df = mindf, stop_words='english', tokenizer=tokenize_stem)
        count_matrix = count_vectorizer.fit_transform(lines)

        X_train, X_test, Y_train, Y_test = train_test_split(count_matrix, label, test_size = 0.3)

        mnb= MultinomialNB()
        mnb.fit(X_train, Y_train)

        pred = mnb.predict(X_test)
        accuracy = accuracy_score(Y_test, pred)
        print("Max_df : {:.1f}, Min_df: {:.1f}, Accuracy: {:.4f}".format(maxdf, mindf, accuracy))


Max_df : 1.0, Min_df: 0.3, Accuracy: 0.2472
Max_df : 1.0, Min_df: 0.2, Accuracy: 0.2383
Max_df : 1.0, Min_df: 0.1, Accuracy: 0.4328
Max_df : 1.0, Min_df: 0.0, Accuracy: 0.8261
Max_df : 0.9, Min_df: 0.3, Accuracy: 0.2472
Max_df : 0.9, Min_df: 0.2, Accuracy: 0.2444
Max_df : 0.9, Min_df: 0.1, Accuracy: 0.4017
Max_df : 0.9, Min_df: 0.0, Accuracy: 0.8167
Max_df : 0.8, Min_df: 0.3, Accuracy: 0.2433
Max_df : 0.8, Min_df: 0.2, Accuracy: 0.2389
Max_df : 0.8, Min_df: 0.1, Accuracy: 0.4367
Max_df : 0.8, Min_df: 0.0, Accuracy: 0.8183


In [None]:
mnb= MultinomialNB()
mnb.fit(X_train, Y_train)

In [None]:
pred = mnb.predict(X_test)

accuracy = accuracy_score(Y_test, pred)
print("%.6f"%accuracy)