In [42]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import linear_model
from yellowbrick.target import FeatureCorrelation

Najprej uvozimo podatke, ki jih preberemo iz dveh .csv datotek in shranimo v pandas dataframe. Poimenujemo ga kar df.

In [43]:
# Read the two audiobook exports. Plain relative paths are used instead of
# ".\\name.csv": the backslash form only resolves on Windows, while a bare
# file name resolves relative to the working directory on every platform.
df_1 = pd.read_csv("audiobooks.csv")
df_2 = pd.read_csv("audiobooks_2.csv")

# Join both tables on audiobook_id and keep only the two columns the model
# uses. Selecting the columns explicitly already discards merge leftovers such
# as "Unnamed: 0_x", so no separate drop() (which would raise KeyError when
# that column is absent) is needed. Rows missing either value are removed.
df = (
    df_1.merge(df_2, on='audiobook_id')
    .set_index('audiobook_id')
    .loc[:, ['categories', 'summary']]
    .dropna()
)

df.head()

Unnamed: 0_level_0,categories,summary
audiobook_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B09RQ4HRHN,Arts & Entertainment,The beloved star of Friends takes us behind th...
B09SBP7MWN,Romance,"Lily and her ex-husband, Ryle, have just settl..."
B09R62PV4B,"Mystery, Thriller & Suspense",Charlie Reade looks like a regular high school...
B09VHWHJS2,Biographies & Memoirs,Jennette McCurdy was six years old when she ha...
1524779261,Business & Careers,"No matter your goals, Atomic Habits offers a p..."


In [44]:
def root(word):
    """Reduce a word to a crude lexical root.

    All non-alphabetic characters are discarded first. If nothing is left the
    placeholder '$' is returned. Otherwise a single trailing 'd' or 's' is
    removed (if present) and then every trailing lowercase vowel is stripped.
    The result can be an empty string (e.g. for 'a' or 's').
    """
    letters = ''.join(filter(str.isalpha, word))
    if letters == '':
        return '$'
    # Drop at most one final 'd'/'s' before trimming the vowel tail.
    stem = letters[:-1] if letters[-1] in 'ds' else letters
    return stem.rstrip('aeiou')

def lexical_roots(string):
    """Return the sorted, de-duplicated roots of all words in `string`.

    The text has leading/trailing apostrophes, periods and commas stripped,
    hyphens turned into spaces, and is lower-cased before being split on
    whitespace. Each token is passed through `root`; the distinct results are
    returned as a pandas Series in sorted order.
    """
    cleaned = string.strip("'.,").replace('-', ' ').lower()
    # split() never yields empty tokens, so no extra filtering is required.
    unique_roots = {root(token) for token in cleaned.split()}
    return pd.Series(sorted(unique_roots))

In [45]:
# Training split: the first 535 rows of df (rows from position 535 onward are
# used further down as the held-out set).
cat_df = df[['categories', 'summary']][:535].reset_index()

# Number of training audiobooks per category; reused for the prior and for the
# per-category root frequencies below.
category_sizes = cat_df.groupby('categories').size()

# Prior of each category. The denominator is the size of the training split
# (not of the whole df), so the values sum to 1 over the training categories.
category_prob = category_sizes / len(cat_df)

# One row per (audiobook, root): lexical_roots yields a Series per summary,
# apply() lays those out as columns, and stack() folds them back into rows.
book_roots = df.summary.apply(
    lexical_roots
).stack(
).reset_index(
    level='audiobook_id'
).rename(columns={0: 'root'})

# Inner join on audiobook_id keeps only roots of audiobooks in the training
# split and attaches their category.
cat_roots = pd.merge(
    book_roots,
    cat_df,
    on='audiobook_id',
)[['root', 'categories']]

# Rows: roots, columns: categories, values: number of training audiobooks of
# that category whose summary contains the root.
roots_df = pd.crosstab(cat_roots.root, cat_roots.categories)

# Share of each category's audiobooks containing the root, shifted by 0.001 so
# every entry is strictly positive.
cat_root_prob_df = roots_df / category_sizes + 0.001

In [46]:
def predict_categories(summary):
    """Score every category for a summary and return the scores, best first.

    The score of a category is its prior (`category_prob`) times the product of
    the `cat_root_prob_df` entries for every known root found in `summary`.
    The computation is done in log space: multiplying many small probabilities
    directly can underflow to 0 for all categories, which would turn the
    normalisation into 0/0.

    Parameters
    ----------
    summary : str
        Text of the audiobook summary.

    Returns
    -------
    pandas.Series
        Indexed by category, scaled so the best category has score 1.0 and
        sorted in descending order.
    """
    # Rows of the probability table for the roots that occur in this summary.
    matched = cat_root_prob_df[
        cat_root_prob_df.index.isin(lexical_roots(summary))
    ]
    # Sum of logs == log of the product; an empty selection sums to 0 (factor 1).
    log_scores = np.log(category_prob) + np.log(matched).sum()
    # Subtracting the maximum before exponentiating makes the top score exactly 1.
    factors = np.exp(log_scores - log_scores.max())
    return factors.sort_values(ascending=False)

Če si pogledamo, kaj nam vrne funkcija predict_categories:

In [58]:
# Demo: score a sample language-course summary against every category.
predict_categories("No grammar tests. No memory drills. No chance of failure. Welcome to Learn with Paul Noble – a unique, tried and tested language learning method that has been used by almost a million people to speak fluently and confidently in no time at all. This course covers European and Latin American Spanish. Take a simple, relaxed approach to learning a language that has been proven to succeed every time. Unlike more traditional language learning courses, Paul Noble’s unique method has no grammar tests, no memory drills and no chance of failure.")

categories
Science Fiction & Fantasy                          1.000000e+00
Business & Careers                                 8.325207e-01
Literature & Fiction                               2.468731e-01
Health & Wellness                                  4.866677e-05
Biographies & Memoirs                              1.996715e-07
Romance                                            8.844512e-08
Mystery, Thriller & Suspense                       1.843687e-09
Teen & Young Adult                                 1.550756e-11
Religion & Spirituality                            3.156123e-13
Relationships, Parenting & Personal Development    2.097141e-14
History                                            5.857204e-15
Money & Finance                                    3.512636e-21
Politics & Social Sciences                         4.315861e-22
LGBTQ+                                             3.705254e-23
Children\'s Audiobooks                             2.862805e-25
Arts & Entertainment         

Sestavimo nov dataframe poimenovan predictions_df. Predstavlja preostalih 30% podatkov, ki jih še nismo uporabili. Za vsako knjigo podamo kategorijo, kateri pripada, nato pa še dve kategoriji, ki jih naš algoritem izbere za najbolj verjetni.
Že na prvi pogled izgleda, da naš algoritem ni najbolj natančen. Poleg tega, da smo ga res zelo poenostavili, k temu prispeva tudi tako majhna količina podatkov, na katerih smo model sestavili. Če bi imeli na razpolago več avdioknjig, bi bil tudi model boljši.
Natančnost še zares izračunajmo. Za vsako avdioknjigo izračunajmo uspeh, torej verjetnost, da pripada pravi kategoriji, delimo z vsoto vrednosti vseh kategorij. Nato pa vzamemo še povprečje izračunov od vseh testiranih avdioknjig.

In [52]:
# Held-out rows: everything from position 535 onward, with audiobook_id moved
# back from the index into a regular column.
predictions_df = df.iloc[535:][['categories', 'summary']].reset_index()
def prediction_score(summary, category):
    """Return the share of the total score that falls on `category`.

    The summary is scored with `predict_categories`; the score of `category`
    is divided by the sum of the scores of all categories. If `category` does
    not appear in the score table, 0 is returned.
    """
    scores = predict_categories(summary)
    if category in scores.index:
        return scores[category] / sum(scores)
    return 0

# Score each held-out audiobook against its true category, row by row.
predictions_df['prediction'] = predictions_df[['summary', 'categories']].apply(
    lambda row: prediction_score(*row),
    axis=1,
)
predictions_df

Unnamed: 0,audiobook_id,categories,summary,prediction
0,B07MFZRNGL,Teen & Young Adult,Nikolai Lantsov has always had a gift for the ...,5.168930e-21
1,0593349016,Biographies & Memoirs,An enthralling account of a modern voyage of d...,1.406146e-06
2,B085VCZ9PZ,"Mystery, Thriller & Suspense",A centuries-old skeleton is discovered in a lo...,1.601232e-02
3,B01JKDTP26,Literature & Fiction,Ruth Jefferson is a labor and delivery nurse a...,9.999997e-01
4,B0B2HTH27Y,"Relationships, Parenting & Personal Development",We’re never going to be able to prevent people...,3.338379e-06
...,...,...,...,...
225,B0BK2RGQBT,Biographies & Memoirs,When Michael Cohen's secret payoff to porn sta...,8.117024e-01
226,B09V1X64YV,Science Fiction & Fantasy,"For years, the ancient alien AI known as Skipp...",9.999430e-01
227,0062963708,Literature & Fiction,"At the end of the Second World War, Cyril Conr...",9.978186e-01
228,1250752337,"Mystery, Thriller & Suspense","One night, Molly Clarke walked away from her l...",2.554186e-07


In [57]:
def model_score(df):
    """Return the sum of the 'prediction' column divided by the row count."""
    total = df['prediction'].sum()
    row_count = len(df)
    return total / row_count
# Average per-audiobook score over the held-out set.
model_score(predictions_df)

0.4604782962731438