In [None]:
from urllib import request
from eventlet import GreenPool
import os
import pandas as pd
import regex
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [None]:
# Training corpus: author name -> list of plain-text book URLs.
# Every URL shares the same prefix and the ".txt" suffix, so only the
# per-book slug is listed and the full URL is assembled below.
book_files = {
    author: [
        "https://wolnelektury.pl/media/book/txt/" + slug + ".txt"
        for slug in slugs
    ]
    for author, slugs in (
        ("Mickiewicz", (
            "pan-tadeusz",
            "dziady-dziady-widowisko-czesc-i",
            "dziady-dziadow-czesci-iii-ustep-do-przyjaciol-moskali",
            "ballady-i-romanse-pani-twardowska",
            "ballady-i-romanse-powrot-taty",
            "ballady-i-romanse-switez",
            "dziady-dziady-poema-dziady-czesc-iv",
        )),
        ("Sienkiewicz", (
            "quo-vadis",
            "sienkiewicz-we-mgle",
            "potop-tom-pierwszy",
            "potop-tom-drugi",
            "potop-tom-trzeci",
        )),
        ("Orzeszkowa", (
            "orzeszkowa-kto-winien",
            "nad-niemnem-tom-pierwszy",
            "nad-niemnem-tom-drugi",
            "nad-niemnem-tom-trzeci",
            "gloria-victis-dziwna-historia",
            "z-pozogi",
            "pani-dudkowa",
            "dymy",
            "syn-stolarza",
            "dobra-pani",
            "cnotliwi",
            "kilka-slow-o-kobietach",
            "patryotyzm-i-kosmopolityzm",
            "julianka",
        )),
        ("Prus", (
            "lalka-tom-drugi",
            "lalka-tom-pierwszy",
            "antek",
            "katarynka",
            "prus-anielka",
            "prus-placowka",
        )),
        ("Reymont", (
            "ziemia-obiecana-tom-pierwszy",
            "chlopi-czesc-pierwsza-jesien",
            "reymont-chlopi-zima",
            "chlopi-czesc-trzecia-wiosna",
            "chlopi-czesc-czwarta-lato",
        )),
    )
}

In [None]:
def fetch(url):
    """Download ``url`` unless a local copy already exists under ``./data/``.

    The local file name is the basename of ``url``. This function only
    downloads; it does not write anything to disk -- the caller is expected
    to store the returned bytes at the returned path.

    Args:
        url: Address of the file to download.

    Returns:
        tuple: ``(file_path, data)`` with the target path and the downloaded
        bytes, or ``(None, None)`` when ``file_path`` already exists (in
        which case nothing is downloaded).
    """
    file_path = os.path.join("./data/", os.path.basename(url))
    if os.path.exists(file_path):
        return None, None
    # Close the response deterministically instead of leaving the open
    # connection to the garbage collector.
    with request.urlopen(url) as response:
        data = response.read()
    return file_path, data

# Create the download directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError as soon as the directory exists,
# which would make the "skip files already on disk" logic in fetch() useless.
os.makedirs('data', exist_ok=True)

for author in book_files:
    pool = GreenPool()

    # fetch() returns (None, None) for files that are already on disk;
    # only freshly downloaded content is written.
    for file_path, data in pool.imap(fetch, book_files[author]):
        if file_path:
            with open(file_path, mode="wb") as f:
                f.write(data)
print("DONE")




In [None]:

def preprocess_file(file_path=None, file_url=None):
    """Read a downloaded book and split it into lower-cased text fragments.

    Args:
        file_path: Path of the UTF-8 text file to read. Used as-is when given.
        file_url: URL the file was downloaded from. Only consulted when
            ``file_path`` is empty; the file is then looked up as
            ``data/<basename of file_url>``.

    Returns:
        list of str: The cleaned text split on ``.``, ``?`` and ``!``, with a
        single leading space removed from each fragment. The list may contain
        empty strings (e.g. between consecutive delimiters).
    """
    if not file_path and file_url:
        file_path = os.path.join("data", os.path.basename(file_url))

    # Read through a context manager so the file handle is always closed.
    with open(file_path, 'rb') as book_file:
        text = book_file.read().decode("utf-8").lower()

    # Raw strings: "\p", "\-", "\." and "\?" are not valid Python string
    # escapes and trigger warnings in non-raw literals. The patterns seen by
    # the regex engine are unchanged ("\n" is resolved by the engine instead).

    # Keep only spaces, newlines, Latin-script letters and - ' . ? !
    # Everything else (digits, commas, colons, ...) becomes a space.
    text = regex.sub(r"[^ \n\p{Latin}\-'.?!]", " ", text)
    # Collapse runs of spaces/newlines into a single space.
    text = regex.sub(r"[ \n]+", " ", text)
    # Drop everything from the "----- ta lektura" marker onwards; with the
    # newlines already removed, ".*" extends to the end of the text.
    text = regex.sub(r"----- ta lektura.*", "", text)

    # NOTE(review): "," and ":" were already replaced by spaces above, so
    # those two alternatives can never match here; kept for fidelity.
    fragments = regex.split(r"\.|,|\?|!|:", text)
    return [regex.sub(r"^ ", "", fragment) for fragment in fragments]


def get_book_df(document, author):
    """Wrap the fragments of one book in a two-column DataFrame.

    Args:
        document: Sequence of text fragments.
        author: Label repeated for every fragment.

    Returns:
        pandas.DataFrame: One row per fragment, with the columns ``author``
        (the label) and ``txt`` (the fragment).
    """
    labels = pd.Series([author] * len(document))
    fragments = pd.Series(document)
    return pd.DataFrame({'author': labels, 'txt': fragments})
    
# Build one frame with a row per text fragment, labelled with its author.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it every per-book frame contributes its own 0-based index, so the same
# label would refer to rows from many different books.
book_lines_df = pd.concat(
    [
        get_book_df(preprocess_file(file_url=url), author=author)
        for author in book_files
        for url in book_files[author]
    ],
    ignore_index=True,
)

book_lines_df.head()

In [None]:
# Non-null entries per column for each author, i.e. how many text fragments
# each author contributes to the corpus.
book_lines_df.groupby('author').count()

In [None]:
# Add a 'words' column: number of whitespace-separated tokens in each fragment.
book_lines_df['words'] = book_lines_df['txt'].map(
    lambda fragment: len(fragment.split())
)

# Summary statistics of the fragment length, per author.
book_lines_df.groupby('author')['words'].describe()

In [None]:
# 98th percentile of the fragment length (in words) for each author.
book_lines_df.groupby('author')['words'].quantile(0.98)

In [None]:
# Hold out 10% of the fragments for testing; stratifying on 'author' keeps
# the author proportions the same in both parts. The fixed random_state makes
# the split -- and therefore the reported metrics -- reproducible across
# kernel restarts.
train_df, test_df = model_selection.train_test_split(
    book_lines_df,
    test_size=0.1,
    stratify=book_lines_df['author'],
    random_state=42,
)

In [None]:
# Learn the bag-of-words vocabulary from the training fragments only.
vect = CountVectorizer().fit(train_df['txt'])

# Sanity check: vectorise a single training fragment; the cell displays the
# resulting sparse count row.
sample_sentence = train_df['txt'].iloc[2]
vect.transform([sample_sentence])

In [None]:
# Turn both splits into token-count matrices using the fitted vocabulary.
X_train = vect.transform(train_df['txt'])
X_test = vect.transform(test_df['txt'])

# The dual formulation is only implemented for the liblinear solver (with the
# l2 penalty). liblinear used to be the default solver; since the default
# became lbfgs, dual=True without an explicit solver raises a ValueError, so
# the solver is pinned here.
# class_weight='balanced' compensates for the unequal number of fragments
# per author.
model = LogisticRegression(
    class_weight='balanced',
    dual=True,
    solver='liblinear',
)
model.fit(X_train, train_df['author'])

In [None]:
# Mean accuracy of the classifier on the held-out test fragments.
model.score(X_test, test_df['author'])

In [None]:
# Compare the true authors of the test fragments with the model's predictions
# and print the per-author classification report.
target = test_df['author']
predicted = model.predict(X_test)
print(metrics.classification_report(target, predicted))