In [1]:
# 使用朴素贝叶斯算法， MapReduce 框架
# 预测博客博主的性别

import os
import re
import numpy as np
from collections import defaultdict
from operator import itemgetter

In [2]:
word_search_re = re.compile(r"[\w']+")

In [10]:
words = word_search_re.findall("haha ijij 1234123j 123kj1k23j kang ksjdflskd ksdjflksdjfk ewrkje mike 1 2 3 a b c A B C 1 2")
print(type(words))
for word in set(words):
    print(word)

<class 'list'>
ijij
3
2
B
haha
C
kang
mike
1234123j
1
ksjdflskd
b
A
a
123kj1k23j
ewrkje
c
ksdjflksdjfk


In [3]:
def load_model(model_filename):
    model = defaultdict(lambda: defaultdict(float))
    with open(model_filename) as inf:
        for line in inf:
            word, values = line.split(maxsplit=1)
            word = eval(word)
            values = eval(values)
            model[word] = values
    return model

In [4]:
model_filename = os.path.join(os.path.expanduser("~"), "models", "part-00000")
model = load_model(model_filename)

In [5]:
model["i"]["male"], model["i"]["female"]

(409.7987003114851, 513.3231594734408)

In [6]:
def nb_predict(model, document):
    words = word_search_re.findall(document)
    probabilities = defaultdict(lambda : 0)
    for word in set(words):
        probabilities["male"] += np.log(model[word].get("male", 1e-5))
        probabilities["female"] += np.log(model[word].get("female", 1e-5))
    # Now find the most likely gender
    most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)
    return most_likely_genders[0][0]

In [7]:
new_post = """ Every day should be a half day.  Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too.  Remember that business with my car dealership this winter?  Well, consider this the epilogue.  The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out.  Looks like I'm going to need a magnet.   Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu....   Today I let my boss know that I've submitted my Corps application.  The news has been greeted by everyone in the company with a level of enthusiasm that really floors me.     The back deck has finally been cleared off by the construction company working on the place.  This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers.  Construction of my deck began around the time Nixon was getting out of office.
"""

In [8]:
nb_predict(model, new_post)

'male'

In [9]:
testing_folder = os.path.join(os.path.expanduser("~"), "Data", "blogposts_testing")
testing_filenames = []
for filename in os.listdir(testing_folder):
    testing_filenames.append(os.path.join(testing_folder, filename))

In [10]:
def nb_predict_many(model, input_filename):
    with open(input_filename) as inf:
        # remove leading and trailing whitespace
        for line in inf:
            tokens = line.split()
            actual_gender = eval(tokens[0])
            blog_post = eval(" ".join(tokens[1:]))
            yield actual_gender, nb_predict(model, blog_post)

In [11]:
def nb_predict(model, document):
    words = word_search_re.findall(document)
    probabilities = defaultdict(lambda : 1)
    for word in set(words):
        probabilities["male"] += np.log(model[word].get("male", 1e-15))
        probabilities["female"] += np.log(model[word].get("female", 1e-15))
    # Now find the most likely gender
    most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)
    return most_likely_genders

In [13]:
y_true = []
y_pred = []
for testing_filename in testing_filenames:
    for actual_gender, ratios in nb_predict_many(model, testing_filename):
        predicted_gender = ratios[0][0]
        y_true.append(actual_gender == "female")
        y_pred.append(predicted_gender == "female")
y_true = np.array(y_true, dtype='int')
y_pred = np.array(y_pred, dtype='int')

In [14]:
from sklearn.metrics import f1_score
print("f1={:.4f}".format(f1_score(y_true, y_pred, pos_label=None)))
print("acc={:.4f}".format(np.mean(y_true == y_pred)))

f1=0.5540
acc=0.5765


In [15]:
aws_model_filename = os.path.join(os.path.expanduser("~"), "models", "model_aws")
aws_model = load_model(aws_model_filename)

In [16]:
y_true = []
y_pred = []
for testing_filename in testing_filenames:
    for actual_gender, predicted_gender in nb_predict_many(aws_model, testing_filename):
        predicted_gender = ratios[0][0]
        y_true.append(actual_gender == "female")
        y_pred.append(predicted_gender == "female")
        #print("Actual: {0}\tPredicted: {1}".format(actual_gender, predicted_gender))
        if len(y_true) > 500:
            break
y_true = np.array(y_true, dtype='int')
y_pred = np.array(y_pred, dtype='int')

In [17]:
print("f1={:.4f}".format(f1_score(y_true, y_pred, pos_label=None)))
print("acc={:.4f}".format(np.mean(y_true == y_pred)))

f1=0.8144
acc=0.8734


  'precision', 'predicted', average, warn_for)


In [18]:
print(list(zip(y_true, y_pred))[:10])

[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
confusion_matrix(y_true, y_pred)

array([[614,   0],
       [ 89,   0]])