In [24]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA

In [27]:
from sklearn.decomposition import NMF
from rake_nltk import Rake

# df = pd.read_csv("English_fiction_pre_PCA_3.csv")
# df = df.iloc[:300]
# train_index = list(range(0, 241))
# test_index = list(range(241, 301))

# Main table the rest of the notebook works on.
df = pd.read_csv("English_fiction_pre_PCA_3.csv")

# Train split: values of the "index" column of the stored split file, as a plain list.
train_df = pd.read_csv("original_data/train_indices.csv")
train_index = train_df["index"].tolist()

# Test split, read the same way.
test_df = pd.read_csv("original_data/test_indices.csv")
test_index = test_df["index"].tolist()

In [28]:
def concat_nouns(text):
    """
    Collapse every proper noun in ``text`` (e.g. a name, publisher or title) into a
    single lowercase token.

    ``text`` is either one noun or a list-like string such as "['First Last', 'Other Name']".
    Entries are separated on ", "; inside each entry all spaces are removed and the
    letters are lowercased. The resulting tokens are returned as one string, separated
    by single spaces.
    """
    # Drop surrounding square brackets (if any), then every single-quote character.
    unwrapped = text.strip("[]").replace("'", "")

    # One token per ", "-separated entry: squeeze out the spaces and lowercase it.
    tokens = []
    for noun in unwrapped.split(", "):
        tokens.append(noun.replace(" ", "").lower())

    return " ".join(tokens)

def add_tokens_to_description(df):
    """
    Append author, publisher and title tokens to the "description" column of ``df``.

    The frame is modified in place and nothing is returned. Every call appends the
    tokens again, so calling it twice on the same frame duplicates them.
    """
    # Authors and publishers are collapsed to one lowercase token each via concat_nouns.
    author_tokens = df["authors"].apply(concat_nouns)
    df["description"] = df["description"] + " " + author_tokens

    publisher_tokens = df["publisher"].apply(concat_nouns)
    df["description"] = df["description"] + " " + publisher_tokens

    # Titles are only lowercased; their words stay separate.
    lowercase_title = df["Title"].str.lower()
    df["description"] = df["description"] + " " + lowercase_title

def calculate_ngrams_RAKE(text: str):
    """
    Run RAKE keyword extraction on ``text`` and return the words it scored.

    The return value is a single string: the keys of the extractor's word-degree
    mapping joined by spaces, in the mapping's iteration order. The degree values
    themselves are discarded.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)

    # Iterating the mapping yields its keys (presumably single words — confirm
    # against the rake_nltk documentation).
    return " ".join(extractor.get_word_degrees())

def create_tokens(df, input_column: str, output_column: str):
    """
    Tokenise a text column with RAKE and store the result in another column.

    Applies calculate_ngrams_RAKE to every value of ``df[input_column]`` and writes the
    returned strings to ``df[output_column]``. ``df`` is modified in place and nothing
    is returned.
    """
    # Read from the caller-supplied column; previously "description" was hard-coded
    # here and the input_column argument was silently ignored.
    df[output_column] = df[input_column].apply(calculate_ngrams_RAKE)

# Calculate tfidf matrix:
def calculate_TFIDF(df, BOW_column: str, train_index, test_index):
    """
    Build dense TF-IDF features for the train and test rows of ``df``.

    Rows are assigned to a split by matching the values of df["index"] against
    ``train_index`` and ``test_index``. The vectorizer is fitted on the training text
    only and then applied unchanged to the test text.

    Returns a tuple ``(train_frame, test_frame, column_names)``: two DataFrames with
    one column per vocabulary term and a fresh default row index, plus the list of
    vocabulary terms in column order.
    """
    # Select the text of each split based on the list of train and test indices provided.
    in_train = df["index"].isin(train_index)
    in_test = df["index"].isin(test_index)
    train_text = df.loc[in_train, BOW_column]
    test_text = df.loc[in_test, BOW_column]

    # Fit on the training text, reuse the fitted vocabulary for the test text, and
    # densify both results.
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(train_text).toarray()
    test_matrix = vectorizer.transform(test_text).toarray()

    # vocabulary_ maps term -> column position; ordering the terms by that position
    # gives the column labels of the dense matrices.
    vocabulary = vectorizer.vocabulary_
    column_names = sorted(vocabulary, key=vocabulary.get)

    train_frame = pd.DataFrame(train_matrix, columns=column_names)
    test_frame = pd.DataFrame(test_matrix, columns=column_names)

    return train_frame, test_frame, column_names

In [29]:
# Enrich each description in place with author, publisher and title tokens.
add_tokens_to_description(df)

# Derive RAKE tokens from the enriched descriptions and store them in the "tokens" column.
create_tokens(df, input_column="description", output_column="tokens")

# TF-IDF frames for both splits, plus the vocabulary terms in column order.
X_train_tfidf, X_test_tfidf, column_names = calculate_TFIDF(
    df,
    BOW_column="tokens",
    train_index=train_index,
    test_index=test_index,
)

In [147]:
# Dimensions of the training TF-IDF frame returned by calculate_TFIDF.
X_train_tfidf.shape

(21419, 80652)

In [30]:
# Pin the random state so the factorisation, and with it the order of the components
# that the hand-written topic labels refer to, is reproducible across kernel restarts.
NMF_RANDOM_STATE = 42
NMF_model = NMF(n_components=20, random_state=NMF_RANDOM_STATE)

# Fit the model to the training tfidf matrix
NMF_model.fit(X_train_tfidf)

# Transform the training books: nmf_features.
# transform() is used (rather than fit_transform()) so that the train features are
# produced by the same procedure that is applied to the test rows.
nmf_features = NMF_model.transform(X_train_tfidf)

# Print the NMF features
print(nmf_features.round(2))

[[0.01 0.   0.   ... 0.   0.   0.  ]
 [0.02 0.   0.02 ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.03]
 ...
 [0.   0.   0.   ... 0.01 0.   0.03]
 [0.   0.   0.   ... 0.   0.01 0.03]
 [0.01 0.   0.   ... 0.   0.   0.  ]]


In [165]:
# Project the test TF-IDF rows onto the components of the already-fitted model.
nmf_features_test = NMF_model.transform(X_test_tfidf)

In [166]:
# Only the last bare expression of a cell is displayed, so print every value we want
# to see: the split size next to the shape of its NMF feature matrix.
print(len(train_index), nmf_features.shape)
print(len(test_index), nmf_features_test.shape)

21419
5355


(5355, 20)

In [31]:
# One row per NMF component, with the columns labelled by the vocabulary terms in column_names.
components_df = pd.DataFrame(NMF_model.components_, columns=column_names)

In [198]:
# Print the shape of the DataFrame
print(components_df.shape)

# Select the component at row position 16
component = components_df.iloc[16]

# Print the 20 terms with the largest weights in that component
print(component.nlargest(20))

(20, 80652)
world        0.492523
must         0.470366
war          0.387650
earth        0.340529
evil         0.310199
battle       0.295096
planet       0.274775
power        0.269674
adventure    0.256672
land         0.251698
dark         0.245385
ancient      0.243311
series       0.240151
find         0.238756
save         0.237210
forces       0.231866
magic        0.221743
mission      0.220035
enemy        0.219619
future       0.218489
Name: 16, dtype: float64


In [181]:
# Human-readable label for each NMF component, keyed by the component's row position.
# NOTE(review): the labels look hand-assigned from each component's top-weighted terms;
# re-check them whenever the model is refit, since the component order may change.
topics = {0: "nostalgia", 
          1: "self-published/debut",
          2: "story/anthology",
          3: "womens_fiction",
          4: "childrens_books",
          5: "classic",
          6: "family_drama",
          7: "digital_books/recreations",
          8: "reproduced",
          9: "murder_mystery",
          10: "reprint",
          11: "bestselling_author",
          12: "romance",
          # NOTE(review): spelled differently from key 17's "unknown". The labels are later
          # used as DataFrame column names via insert(), which rejects duplicate names by
          # default, so correcting the spelling requires renaming one of the two labels.
          13: "unkonwn",
          14: "teen",
          15: "novel",
          16: "world/war/historical_fiction",
          17: "unknown",
          18: "young_adult",
          19: "coming_of_age",
         }

In [200]:
# Rows of df belonging to each split, selected by the value of the "index" column.
X_train_df = df[df["index"].isin(train_index)]
X_test_df = df[df["index"].isin(test_index)]

# Row position (within the test split) of the book to inspect.
book_index = 12

# NMF feature weights of that book, rounded to two decimals.
nmf_feature_list = nmf_features_test[book_index].round(2).tolist()
print(nmf_feature_list)

# Largest rounded weight; after rounding several components may tie for it.
max_nmf_value = max(nmf_feature_list)

# Positions of every component that reaches the maximum.
indices = [
    position
    for position, weight in enumerate(nmf_feature_list)
    if weight == max_nmf_value
]

# Translate the component positions into their topic labels.
category_label = [topics[position] for position in indices]
print(category_label)

# Show the book's description next to the predicted label(s).
X_test_df.iloc[book_index]["description"]

[0.02, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02]
['romance']


"Always The Chaperon And Never The Bride... At least, that's the way it was for Lady Annis Wyncherley. If this young widow was to remain as chaperon to society's misses, there could be no hint of scandal attached to her name. Rakes and romance were strictly off-limits, most especially a rogue like the handsome Lord Adam Ashwick! But that proved nearly impossible when Adam made his daughter's chaperon the subject of his relentless seduction. Adam knew any attention from him could destroy Lady Wyncherley's fine reputation. But he was powerless to control the strong desires she aroused in him. And all too soon this reformed rogue was hell-bent on convincing a very stubborn Annis to become his chaperon bride.... nicolacornick harlequin chaperon bride, the (historical romance)"

In [191]:
# Append one column per NMF component to each split frame, named after the component's
# topic label. The training frame is processed first, then the test frame.
for frame, features in ((X_train_df, nmf_features), (X_test_df, nmf_features_test)):
    for component_id in range(features.shape[1]):
        # Inserting at the current column count places the new column last.
        frame.insert(
            frame.shape[1],
            topics[component_id],
            features[:, component_id].tolist(),
        )

In [183]:
# Dimensions of the test-set NMF feature matrix.
nmf_features_test.shape

(5355, 20)

In [188]:
# Number of columns in the training NMF feature matrix.
nmf_features.shape[1]

20

In [192]:
# Compare the dimensions of the train and test frames.
print(X_train_df.shape)
print(X_test_df.shape)

(21419, 39)
(5355, 39)


In [190]:
# Label stored for component 0.
topics[0]

'nostalgia'

In [193]:
# Preview the first two rows of the training frame.
X_train_df.head(2)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,infoLink,categories,index,reviews number,...,reprint,bestselling_author,romance,unkonwn,teen,novel,world/war/historical_fiction,unknown,young_adult,coming_of_age
0,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['fiction'],3,32,...,0.001315,0.0,0.040636,0.0,0.000214,0.0,0.010339,0.0,0.0,0.0
1,The Forbidden Stories of Marta Veneranda,"Marta Veneranda, a Latina neoyorkina, finds th...",['Sonia Rivera-Valdes'],http://books.google.com/books/content?id=A7aYb...,http://books.google.nl/books?id=A7aYbAvagu8C&p...,Seven Stories Press,http://books.google.nl/books?id=A7aYbAvagu8C&d...,['fiction'],24,1,...,0.0,0.0,0.009457,0.0,0.0,0.006398,0.000197,0.0,0.0,0.000621


In [194]:
# Write both split frames to CSV files in the working directory, leaving out the
# DataFrame row index. The training frame is written first.
for frame, destination in (
    (X_train_df, "X_train_NMF_topics.csv"),
    (X_test_df, "X_test_NMF_topics.csv"),
):
    frame.to_csv(path_or_buf=destination, index=False)