In [29]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import re
from string import punctuation
from datetime import datetime

# Directory, relative to the notebook, that is scanned for blog files.
DIRNAME = 'blogs/'

# Tokens to discard from the corpus: NLTK's English stopword list, every
# single punctuation character, and a few extra multi-character tokens.
english_stopwords = (
    set(stopwords.words('english'))
    | set(punctuation)
    | {'...', '..', '....', '``', "''", '/n'}
)

In [30]:
def clean_wordlist(filelist):
    '''
    Build a cleaned word list from a collection of blog XML files.

    Every file is parsed as XML, the text of each <post> element is
    lower-cased, and all posts are tokenized together. Tokens found in
    english_stopwords are dropped; the remaining tokens are lemmatized.

    Parameters
    ----------
    filelist : iterable of str
        Paths of the XML files to read.

    Returns
    -------
    list of str
        Lemmatized tokens in document order. The list is empty when
        filelist is empty or none of the files contains a <post> element.
    '''
    lemmatizer = WordNetLemmatizer()
    posts = []
    for eachfile in filelist:
        # The context manager closes the handle as soon as the file is read
        # instead of leaving it open until garbage collection.
        with open(eachfile, encoding='utf8', errors='replace') as xmlfile:
            soup = BeautifulSoup(xmlfile.read(), features='xml')
        posts.extend(post.text.lower() for post in soup.find_all('post'))
    # Join with a real newline so that neighbouring posts -- including the
    # last post of one file and the first post of the next -- are always
    # separated by whitespace and can never merge into a single token.
    corpus = '\n'.join(posts)
    # Stopwords are filtered on the raw token, before lemmatization.
    return [lemmatizer.lemmatize(token) for token in word_tokenize(corpus)
            if token not in english_stopwords]

In [31]:
# Relative paths of every entry in DIRNAME, skipping hidden entries
# (names that begin with a dot).
blogfiles = [
    f'{DIRNAME}{entry}'
    for entry in os.listdir(DIRNAME)
    if entry[:1] != '.'
]

In [32]:
# Label per file: 1 when the path contains a dot-delimited number from
# 13 to 19 (e.g. '.14.'), otherwise 0.
blogteens = [
    int(re.search(r'\.1[3-9]\.', path) is not None)
    for path in blogfiles
]

In [11]:
# Spot-check the first 25 (path, label) pairs.
list(zip(blogfiles[:25], blogteens[:25]))

[('blogs/3699726.female.14.indUnk.Aries.xml', 1),
 ('blogs/3362256.male.15.indUnk.Scorpio.xml', 1),
 ('blogs/4270277.male.23.indUnk.Aquarius.xml', 0),
 ('blogs/3473863.female.26.Internet.Capricorn.xml', 0),
 ('blogs/2889926.female.25.Telecommunications.Cancer.xml', 0),
 ('blogs/3897591.male.15.Student.Libra.xml', 1),
 ('blogs/4216005.male.23.Consulting.Taurus.xml', 0),
 ('blogs/4251559.female.23.Non-Profit.Gemini.xml', 0),
 ('blogs/3584627.female.17.Student.Scorpio.xml', 1),
 ('blogs/4292654.female.33.Banking.Libra.xml', 0),
 ('blogs/4268277.female.14.indUnk.Leo.xml', 1),
 ('blogs/4176187.female.13.indUnk.Aquarius.xml', 1),
 ('blogs/3837608.male.24.Communications-Media.Sagittarius.xml', 0),
 ('blogs/3818017.male.15.indUnk.Aries.xml', 1),
 ('blogs/3476502.female.16.Education.Aries.xml', 1),
 ('blogs/3840468.male.23.indUnk.Gemini.xml', 0),
 ('blogs/3321827.male.38.Technology.Sagittarius.xml', 0),
 ('blogs/3877768.male.25.Technology.Libra.xml', 0),
 ('blogs/4298893.female.25.Consulting.Li

In [33]:
# Print a wall-clock timestamp before and after preprocessing so the
# elapsed time of this cell is visible in its output.
print(f'{datetime.now():%H:%M:%S}')
# One space-joined string of cleaned tokens per blog file.
allblogs = [' '.join(clean_wordlist([blogpath])) for blogpath in blogfiles]
print(f'{datetime.now():%H:%M:%S}')

13:21:44
13:30:00


In [34]:
# One row per blog file: its path, the teen label, and the cleaned text.
df = pd.DataFrame(dict(
    filename=blogfiles,
    teen=blogteens,
    text=allblogs,
))

In [35]:
# Split text (features) and teen label (target) into train and test sets.
# No split size is passed, so scikit-learn's default applies; the fixed
# random_state keeps the shuffle repeatable for a given row order.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['teen'],
    random_state=1,
)

Here X is the blog text and Y is whether the blog was written by a teen.
Can X be used to predict Y?

In [36]:
# Learn the TF-IDF vocabulary and weights from the training text only,
# then apply the already-fitted vectorizer to the test text.
tfidf_vectorize = TfidfVectorizer(use_idf=True)
X_train_tf, X_test_tf = (
    tfidf_vectorize.fit_transform(X_train),  # evaluated first: fits the vectorizer
    tfidf_vectorize.transform(X_test),       # evaluated second: reuses the fit
)

In [37]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit returns the estimator itself) and predict labels for the test matrix.
naive_bayes = MultinomialNB().fit(X_train_tf, Y_train)
predictions = naive_bayes.predict(X_test_tf)

In [38]:
# Fraction of test blogs whose predicted label matches the true label.
print('Accuracy: ', accuracy_score(y_true=Y_test, y_pred=predictions))

Accuracy:  0.6575569358178054


Accuracy still around 65% when using the entire dataset.