In [24]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA

In [27]:
from sklearn.decomposition import NMF
from rake_nltk import Rake

# df = pd.read_csv("English_fiction_pre_PCA_3.csv")
# df = df.iloc[:300]
# train_index = list(range(0, 241))
# test_index = list(range(241, 301))

# Load the full book table plus the predefined train/test split.
# The split files each carry an "index" column; those values are later matched
# against df["index"] (see calculate_TFIDF) to select the rows of each split.
df = pd.read_csv("English_fiction_pre_PCA_3.csv")
train_df = pd.read_csv("original_data/train_indices.csv")
test_df = pd.read_csv("original_data/test_indices.csv")
# Plain Python lists of index values, in the order they appear in the CSVs.
train_index = train_df["index"].tolist()
test_index = test_df["index"].tolist()

In [28]:
def concat_nouns(text):
    """
    Collapse each proper noun (name, publisher, title, ...) into one lowercase token.

    `text` is either a single noun or a stringified list such as
    "['John Smith', 'Jane Doe']". Surrounding square brackets and every
    single-quote character are removed, the remainder is split on ", ", and
    the spaces inside each piece are dropped before lowercasing.

    Returns the resulting tokens joined by single spaces,
    e.g. "['John Smith', 'Jane Doe']" -> "johnsmith janedoe".
    """
    # Drop the list brackets at either end and every apostrophe/quote mark.
    cleaned = text.strip("[]").replace("'", "")

    # One token per comma-separated noun: glue its words together, lowercase it.
    tokens = []
    for noun in cleaned.split(", "):
        tokens.append("".join(noun.split(" ")).lower())

    return " ".join(tokens)

def add_tokens_to_description(df):
    """
    Append author, publisher and title tokens to df["description"], in place.

    Authors and publishers are first collapsed to single lowercase tokens via
    concat_nouns; the title is only lowercased. Each addition is separated
    from the existing text by one space. Nothing is returned.

    Calling this twice on the same frame appends the tokens twice.
    """
    author_tokens = df["authors"].apply(concat_nouns)
    df["description"] = df["description"] + (" " + author_tokens)

    publisher_tokens = df["publisher"].apply(concat_nouns)
    df["description"] = df["description"] + (" " + publisher_tokens)

    lowercase_title = df["Title"].str.lower()
    df["description"] = df["description"] + (" " + lowercase_title)

def calculate_ngrams_RAKE(text: str):
    """
    Run RAKE over `text` and return the words it scored, as one string.

    A fresh Rake instance is created for every call. The returned string is
    the keys of Rake.get_word_degrees() joined by single spaces, so each word
    appears once (the degree values themselves are discarded).

    NOTE(review): despite the name, no multi-word n-grams are returned here,
    only the individual words of the degree mapping.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)

    # Iterating the mapping yields its keys (the words) in stored order.
    return " ".join(extractor.get_word_degrees())

def create_tokens(df, input_column: str, output_column: str):
    """
    Tokenise a text column with RAKE and store the result in a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    input_column : str
        Name of the column holding the raw text to tokenise.
    output_column : str
        Name of the column that receives the space-joined tokens
        (created, or overwritten if it already exists).

    Returns nothing; `df` is mutated.
    """
    # Read from the requested column rather than a hard-coded "description",
    # so the input_column argument is actually honoured.
    df[output_column] = df[input_column].apply(calculate_ngrams_RAKE)

# Calculate tfidf matrix:
def calculate_TFIDF(df, BOW_column: str, train_index, test_index):
    """
    Build dense TF-IDF feature frames for the train and test rows of `df`.

    Rows are assigned to a split by checking df["index"] against `train_index`
    and `test_index`. The vectorizer is fitted on the training rows only and
    then applied unchanged to the test rows, so both frames share one vocabulary.

    Returns a tuple (train_frame, test_frame, column_names) where the frames
    hold the dense TF-IDF values with one column per vocabulary term, and
    column_names lists those terms in column order. Both frames get a fresh
    default integer index, not the index of `df`.
    """
    in_train = df["index"].isin(train_index)
    in_test = df["index"].isin(test_index)
    train_rows = df[in_train]
    test_rows = df[in_test]

    # Fit on train only; the test split is transformed with the same vocabulary.
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_rows[BOW_column]).toarray()
    test_matrix = tfidf.transform(test_rows[BOW_column]).toarray()

    # vocabulary_ maps term -> column position; ordering the terms by that
    # position gives the column labels of the dense matrices.
    terms_by_position = sorted(tfidf.vocabulary_.items(), key=lambda pair: pair[1])
    column_names = [term for term, _position in terms_by_position]

    train_frame = pd.DataFrame(train_matrix, columns=column_names)
    test_frame = pd.DataFrame(test_matrix, columns=column_names)

    return train_frame, test_frame, column_names

In [29]:
# Add tokens from other columns to the description column, specifically author, title, and publisher.
# NOTE: this mutates df["description"] in place, so re-running this cell without
# reloading df appends the same tokens a second time.
add_tokens_to_description(df)

# Create tokens from the book descriptions and save this in a new column called "tokens"
create_tokens(df, "description", "tokens")

# Dense TF-IDF frames for each split plus the vocabulary terms in column order.
X_train_tfidf, X_test_tfidf, column_names = calculate_TFIDF(df, "tokens", train_index, test_index)

In [147]:
# Sanity check: (number of training rows, number of vocabulary terms).
X_train_tfidf.shape

(21419, 80652)

In [30]:
# Number of latent topics to extract. The hand-written `topics` label mapping
# further down assumes exactly this many components.
N_TOPICS = 20

# Fixed seed: NMF's initialisation can involve randomness, and the topic labels
# in this notebook are tied to component positions, so the factorisation must
# come out the same on every Restart & Run All.
NMF_SEED = 42

NMF_model = NMF(n_components=N_TOPICS, random_state=NMF_SEED)

# Fit the model to the training tfidf matrix and obtain the per-book topic
# weights in a single pass (one row per training book, one column per topic),
# instead of fitting and then transforming the same matrix separately.
nmf_features = NMF_model.fit_transform(X_train_tfidf)

# Print the NMF features
print(nmf_features.round(2))

[[0.01 0.   0.   ... 0.   0.   0.  ]
 [0.02 0.   0.02 ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.03]
 ...
 [0.   0.   0.   ... 0.01 0.   0.03]
 [0.   0.   0.   ... 0.   0.01 0.03]
 [0.01 0.   0.   ... 0.   0.   0.  ]]


In [31]:
# One row per NMF component, one column per vocabulary term (labelled with
# the terms returned by calculate_TFIDF).
components_df = pd.DataFrame(NMF_model.components_, columns=column_names)

In [159]:
# Print the shape of the DataFrame: (number of components, number of terms)
print(components_df.shape)

# Select the component at row position 15
component = components_df.iloc[15]

# Print the 30 terms with the largest weights in that component
print(component.nlargest(30))

(20, 80652)
novel         0.859089
story         0.662610
century       0.493440
american      0.488379
war           0.360296
life          0.352732
one           0.341700
world         0.306742
first         0.298919
set           0.288706
history       0.285090
written       0.281847
great         0.281543
work          0.272101
published     0.267047
fiction       0.250500
america       0.249732
novels        0.248344
characters    0.247551
tale          0.235711
writer        0.232756
early         0.221420
literary      0.220722
modern        0.217667
two           0.213262
historical    0.207342
also          0.200618
tells         0.197228
love          0.195827
time          0.188737
Name: 15, dtype: float64


In [132]:
# Human-readable label for each NMF component, keyed by component position.
# NOTE(review): presumably assigned by hand after inspecting each component's
# top-weighted terms; the labels are only valid for the factorisation they
# were derived from, so re-check them whenever the model is refitted.
topics = {0: "nostalgia", 
          1: "self-published/debut",
          2: "story/anthology",
          3: "womens_fiction",
          4: "childrens_books",
          5: "classic",
          6: "family_drama",
          7: "digital_books/recreations",
          8: "reproduced",
          9: "murder_mystery",
          10: "reprint",
          11: "bestselling_author",
          12: "romance",
          13: "unknown",  # was misspelled "unkonwn"; now consistent with key 17
          14: "teen",
          15: "novel",
          16: "world/battle",
          17: "unknown",
          18: "young_adult",
          19: "coming_of_age",
         }

In [161]:
# Rebuild the train/test slices with the same filter used inside calculate_TFIDF,
# so that positional row i of X_train_df corresponds to row i of nmf_features.
X_train_df = df[df["index"].isin(train_index)]
X_test_df = df[df["index"].isin(test_index)]
# Positional (not label) row of the training book to inspect.
book_index = 95

print(nmf_features[book_index].round(2))

# retrieve the nmf features for a predefined index
# (rounded to 2 decimals, so weights that differ only beyond that become ties)
nmf_feature_list = nmf_features[book_index].round(2).tolist()

# calculate the maximum nmf value(s)
max_nmf_value = max(nmf_feature_list)

# find the indices in the nmf feature list to be used to convert the values to category labels
# (every topic tied for the rounded maximum is kept)
indices = [i for i, x in enumerate(nmf_feature_list) if x == max_nmf_value]
# NOTE: the bare expression below displays nothing because it is not the last line of the cell.
indices

# Map the winning component positions to their human-readable labels.
category_label = [topics[i] for i in indices]
print(category_label)
# Last expression of the cell: shows the (token-augmented) description of the book.
X_train_df["description"].iloc[book_index]

[0.02 0.   0.   0.   0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.
 0.06 0.   0.   0.   0.03 0.  ]
['teen']


"Sixteen-year-old Haley Andromeda would like to think she's just a normal high school senior, but during a disastrous time in her life, she turns to the Ouija board and tries to communicate with the Other Side, which only leads to further complications. Original. karenrivers raincoastbooks the healing time of hickeys"