### Import Libraries

In [18]:
import pandas as pd
import numpy as np

import statistics as stats

import pickle

In [19]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

### Data Cleaning

In [20]:
books = pd.read_csv("dataset.csv")
books.head(2)

Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,edition,edition-statement,for-ages,...,isbn10,isbn13,lang,publication-date,publication-place,rating-avg,rating-count,title,url,weight
0,[1],49848.0,"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,129.0,198.0,20.0,,,,...,184018907X,9781840189070,en,2004-10-14 00:00:00,,4.03,292.0,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,224.0
1,"[2, 3]",115215.0,"[235, 3386]",John Moran and Carl Williams were the two bigg...,127.0,203.2,25.4,,,,...,184454737X,9781844547371,en,2009-03-13 00:00:00,,3.6,335.0,Underbelly : The Gangland War,/Underbelly-Andrew-Rule/9781844547371,285.76


In [21]:
books.shape

(1109383, 28)

In [22]:
books.columns

Index(['authors', 'bestsellers-rank', 'categories', 'description',
       'dimension-x', 'dimension-y', 'dimension-z', 'edition',
       'edition-statement', 'for-ages', 'format', 'id', 'illustrations-note',
       'image-checksum', 'image-path', 'image-url', 'imprint', 'index-date',
       'isbn10', 'isbn13', 'lang', 'publication-date', 'publication-place',
       'rating-avg', 'rating-count', 'title', 'url', 'weight'],
      dtype='object')

In [23]:
#check mising values

books.isna().sum()

authors                     0
bestsellers-rank       466842
categories                  0
description             80087
dimension-x             48227
dimension-y             93531
dimension-z             48227
edition                926569
edition-statement      747261
for-ages              1033390
format                   6622
id                          0
illustrations-note     752907
image-checksum             27
image-path                 27
image-url                  27
imprint                830049
index-date            1109383
isbn10                      0
isbn13                      0
lang                    60407
publication-date         2603
publication-place     1109383
rating-avg             440130
rating-count           440130
title                       0
url                         0
weight                  87173
dtype: int64

In [24]:
# drop unecessary columns (too many na or not important information)

columns_drop = ["dimension-x", "dimension-y", "dimension-z", "bestsellers-rank", "edition", 
                "edition-statement", "for-ages", "illustrations-note", "imprint", "index-date", 
                "publication-place", "rating-avg", "rating-count", "weight"]

books = books.drop(columns=columns_drop)

In [25]:
# drop row if description is na --> description is crucial

books = books[books['description'].notna()]

In [26]:
books["lang"].value_counts()

en     952244
es      23345
de      15628
fr       6643
pl       2901
        ...  
ae          1
lad         1
dak         1
ug          1
rm          1
Name: lang, Length: 155, dtype: int64

In [27]:
# only keep English books

books = books[books["lang"] == "en"]

In [28]:
books.shape

(952244, 14)

In [29]:
# combine title and description for complete analysis of text in nlp

books["text"] = books["title"] + " " + books["description"]

In [30]:
books.dtypes

# should authors and categories be other type?

# publication date as datetime

authors              object
categories           object
description          object
format              float64
id                    int64
image-checksum       object
image-path           object
image-url            object
isbn10               object
isbn13                int64
lang                 object
publication-date     object
title                object
url                  object
text                 object
dtype: object

In [31]:
books['publication-date'] = pd.to_datetime(books['publication-date'])


In [32]:
books.head()

Unnamed: 0,authors,categories,description,format,id,image-checksum,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url,text
0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,9781840189070,97c8e71f2ec114b34f243074d2091077,full/c/5/2/c529152ea1246c0cb17d6574d302eae6d2e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184018907X,9781840189070,en,2004-10-14,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,Soldier Five : The Real Truth About The Bravo ...
1,"[2, 3]","[235, 3386]",John Moran and Carl Williams were the two bigg...,1.0,9781844547371,1dc4e79575474ac775cdfe9840bdfb94,full/5/9/8/598ae8f736c99bc423352887d406d2b2f94...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184454737X,9781844547371,en,2009-03-13,Underbelly : The Gangland War,/Underbelly-Andrew-Rule/9781844547371,Underbelly : The Gangland War John Moran and C...
3,"[5, 6, 7, 8]","[377, 2978, 2980]",The Third Book of General Ignorance gathers t...,1.0,9780571308996,bc593914f06e1021be9977114ea2a28f,full/1/a/f/1af9cad94dfe897b4423993c78b137bee40...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,571308996,9780571308996,en,2015-10-01,QI: The Third Book of General Ignorance,/QI-Third-Book-General-Ignorance-John-Lloyd/97...,QI: The Third Book of General Ignorance The Th...
4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,9780008352516,39c0422c00be90fb9258dd6df9068b1e,full/e/e/a/eea0ff6b6f1882d1862c711a2008e3c35d3...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,8352518,9780008352516,en,2019-06-18,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,The Hidden Power of F*cking Up The Try Guys de...
5,"[10, 11]","[1520, 1532]",When and how did the universe begin? Why are w...,1.0,9780553819229,bed4d5ee9d3240303ff355e4e383cb47,full/0/a/1/0a1870d010b9ad272d5b20425b56e449eac...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,553819224,9780553819229,en,2015-03-18,The Grand Design,/Grand-Design-Leonard-Mlodinow/9780553819229,The Grand Design When and how did the universe...


### transform categories

In [33]:
genre = pd.read_csv("genres.csv")
genre = genre.drop(columns="Unnamed: 0")

In [34]:
genre_without_other = genre[genre["genre"]!= "Other"]

In [35]:
genre_without_other

Unnamed: 0,category_id,category_name,class,class2,genre
0,1998,.Net Programming,6,3.0,Engineering & Programming
1,176,20th Century & Contemporary Classical Music,6,4.0,Arts & Music
2,3291,20th Century & Contemporary Classical Music,6,4.0,Arts & Music
5,1992,2D Graphics: Games Programming,6,3.0,Engineering & Programming
7,1993,3D Graphics: Games Programming,6,3.0,Engineering & Programming
...,...,...,...,...,...
2739,1999,Windows Programming,6,3.0,Engineering & Programming
2755,196,World Music,6,4.0,Arts & Music
2756,3311,World Music,6,4.0,Arts & Music
2758,346,World War 1 Fiction,6,8.0,


In [36]:
# function to translate books["categories"] into genre of book, igrnoring "Other"

In [37]:
cat_without_other = list(genre_without_other['category_id'])
cat_without_other

[1998,
 176,
 3291,
 1992,
 1993,
 1379,
 1229,
 963,
 1856,
 2491,
 1089,
 1213,
 1063,
 1172,
 1987,
 1878,
 1215,
 1035,
 1045,
 197,
 3312,
 1216,
 76,
 1054,
 921,
 2509,
 2554,
 41,
 2,
 7,
 42,
 8,
 15,
 16,
 37,
 14,
 18,
 17,
 20,
 23,
 29,
 30,
 5,
 80,
 4,
 68,
 1988,
 1179,
 219,
 217,
 229,
 1799,
 1843,
 1391,
 1166,
 1058,
 1614,
 976,
 1230,
 173,
 3288,
 3156,
 3152,
 1721,
 218,
 216,
 228,
 1446,
 1053,
 692,
 67,
 77,
 182,
 3297,
 1828,
 793,
 3338,
 977,
 979,
 1917,
 1005,
 983,
 984,
 981,
 1006,
 1004,
 980,
 2585,
 978,
 3102,
 928,
 1232,
 1339,
 1276,
 737,
 1253,
 1727,
 1048,
 2488,
 307,
 2838,
 177,
 3292,
 1080,
 1177,
 1102,
 1810,
 1154,
 353,
 2627,
 174,
 3289,
 1324,
 1040,
 13,
 1160,
 1101,
 409,
 2716,
 1802,
 1167,
 1159,
 1106,
 1170,
 1456,
 1475,
 1983,
 1243,
 1176,
 1174,
 1065,
 1066,
 987,
 1263,
 334,
 1171,
 1248,
 1239,
 742,
 186,
 3301,
 1442,
 336,
 736,
 2617,
 1191,
 1190,
 1203,
 634,
 1120,
 148,
 3263,
 1237,
 2003,
 62,
 1211

In [38]:
genre_without_other.loc[genre_without_other['category_id'] == 3386, 'genre'].values[0]

'Law & Crime'

In [39]:
def translate_categories(row):
    all_genre = []
    stripped = row.strip("[]")
    lis = [int(num) for num in stripped.split(',') if num != '']
   # print(lis)
    for i in lis: 
        if i in cat_without_other:
            value = genre_without_other.loc[genre_without_other['category_id'] == i, 'genre'].values[0]
            all_genre.append(value)
    #print(all_genre)
    if len(all_genre) == 0:
        g = "Other"
    else:
        g = stats.mode(all_genre)
    return g

    

In [40]:
translate_categories('[235, 3386]')

'Law & Crime'

In [41]:
books["Genre"] = books["categories"].apply(translate_categories)

In [42]:
books["Genre"].value_counts()

Other                             515390
Social Science and Teaching       122931
Medicine                           44421
Engineering & Programming          43302
Arts & Music                       42760
Business, Economics & Industry     38194
Law & Crime                        30525
Kids & Fiction                     25776
Name: Genre, dtype: int64

In [43]:
# create new dataframes based on each genre

In [44]:
other = books[books["Genre"]=="Other"]
len(other)

515390

In [45]:
social_and_teaching = books[books["Genre"]=="Social Science and Teaching"]
len(social_and_teaching)

122931

In [46]:
medicine = books[books["Genre"]=="Medicine"]
len(medicine)

44421

In [47]:
eng_and_prog = books[books["Genre"]=="Engineering & Programming"]
len(eng_and_prog)

43302

In [48]:
arts_and_music = books[books["Genre"]=="Arts & Music"]
len(arts_and_music)

42760

In [49]:
business = books[books["Genre"]=="Business, Economics & Industry"]
len(business)

38194

In [50]:
law_and_crime = books[books["Genre"]=="Law & Crime"]
len(law_and_crime)

30525

In [51]:
kids_and_fiction = books[books["Genre"]=="Kids & Fiction"]
len(kids_and_fiction)

25776

In [64]:
def get_wordnet_pos(token):

    tag = nltk.pos_tag([token], lang="eng")[0][1][0].upper()
    tag_dict = {"N": wordnet.NOUN,
                "J": wordnet.ADJ,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [65]:
# tokenizer, remove punctuation, lower case

def tokenizer_and_remove_punctuation(row):

    tokens = word_tokenize(row["text"])

    return [token.lower() for token in tokens if token.isalpha()]

In [66]:
lm = WordNetLemmatizer()

def lemmatizer_with_pos(row):

    return [lm.lemmatize(token, get_wordnet_pos(token)) for token in row["tokenized"]]

In [67]:
def remove_sw(row):
    return list(set(row["lemmatized"]).difference(stopwords.words()))

In [68]:
# put all together into a cave man language

def re_blob(row):
    return " ".join(row["no_stopwords"])

### NLP for Social Science and Teaching

In [69]:
text = pd.DataFrame(social_and_teaching["text"], columns=["text"])
text

Unnamed: 0,text
29,"The Future of Humanity : Terraforming Mars, In..."
39,Have I Got News For You: Guide to Modern Brita...
44,"The Second Machine Age : Work, Progress, and P..."
66,The Gendered Brain : The new neuroscience that...
78,Hacking Darwin : Genetic Engineering and the F...
...,...
1108891,My Awesome Explorer Field Guide : The Practica...
1108942,When Camp Onanda Gives Her Call : Camp History...
1109041,The Complete Golfer This scarce antiquarian bo...
1109198,2019 Planner : Weekly Planner & Monthly Calend...


In [70]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [71]:
# lemmatize with part of speach tags

lm = WordNetLemmatizer()


text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [72]:
# remove stopwords

text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [73]:
# put all together into a cave man language


text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [74]:
# create our bag of words model to then be able to apply ML algorithms

bow_vct = CountVectorizer(max_features=10000)

# train it

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [75]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [76]:
social_and_teaching_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#social_and_teaching_df.to_csv("ssat_words.csv")
#social_and_teaching_df.to_csv("ssat_words.csv")

In [77]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)

km.fit(social_and_teaching_df)

KMeans(n_clusters=5)

In [53]:
pickle.dump(bow_vct, open("bv_ssat.p", "wb"))
pickle.dump(km, open("km_ssat.p", "wb"))

In [78]:
pred = km.predict(social_and_teaching_df)

In [79]:
social_and_teaching["class"] = pred

social_and_teaching.to_pickle("ssat_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_and_teaching["class"] = pred


### NLP for Medicine

In [80]:
text = pd.DataFrame(medicine["text"], columns=["text"])

In [81]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [82]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [83]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [84]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [85]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [86]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [87]:
medicine_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#medicine_df.to_csv("med_words.csv")

In [88]:
km = KMeans(n_clusters=5)

km.fit(medicine_df)

KMeans(n_clusters=5)

In [74]:
pickle.dump(bow_vct, open("bv_med.p", "wb"))
pickle.dump(km, open("km_med.p", "wb"))

In [89]:
pred = km.predict(medicine_df)

In [90]:
medicine["class"]=pred

medicine.to_pickle("med_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medicine["class"]=pred


### NLP for Engineering and Programming

In [91]:
text = pd.DataFrame(eng_and_prog["text"], columns=["text"])

In [92]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [93]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [94]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [95]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [96]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [97]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [98]:
eng_and_prog_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#eng_and_prog_df.to_csv("ep_words.csv")

In [99]:
km = KMeans(n_clusters=5)

km.fit(eng_and_prog_df)

KMeans(n_clusters=5)

In [86]:
pickle.dump(bow_vct, open("bv_ep.p", "wb"))
pickle.dump(km, open("km_ep.p", "wb"))

In [100]:
pred = km.predict(eng_and_prog_df)

In [101]:
eng_and_prog["class"] = pred

eng_and_prog.to_pickle("ep_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_and_prog["class"] = pred


### NLP for Arts & Music

In [102]:
text = pd.DataFrame(arts_and_music["text"], columns=["text"])

In [103]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [104]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [105]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [106]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [107]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [108]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [109]:
art_music_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#art_music_df.to_csv("am_words.csv")

In [110]:
km = KMeans(n_clusters=5)

km.fit(art_music_df)

KMeans(n_clusters=5)

In [98]:
pickle.dump(bow_vct, open("bv_am.p", "wb"))
pickle.dump(km, open("km_am.p", "wb"))

In [111]:
pred = km.predict(art_music_df)

In [112]:
arts_and_music["class"] = pred

arts_and_music.to_pickle("am_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arts_and_music["class"] = pred


### NLP for Business, Economics & Industry

In [113]:
text = pd.DataFrame(business["text"], columns=["text"])

In [114]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [115]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [116]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [117]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [118]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [119]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [120]:
buisness_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#business_df.to_csv("business_words.csv")

In [121]:
km = KMeans(n_clusters=5)

km.fit(buisness_df)

KMeans(n_clusters=5)

In [110]:
pickle.dump(bow_vct, open("bv_be.p", "wb"))
pickle.dump(km, open("km_be.p", "wb"))

In [122]:
pred = km.predict(buisness_df)

In [123]:
business["class"] = pred

business.to_pickle("be_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  business["class"] = pred


### NLP for Law & Crime

In [124]:
text = pd.DataFrame(law_and_crime["text"], columns=["text"])

In [125]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [126]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [127]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [128]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [129]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [130]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [131]:
law_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#law_df.to_csv("lc_words.csv")

In [132]:
km = KMeans(n_clusters=5)

km.fit(law_df)

KMeans(n_clusters=5)

In [122]:
pickle.dump(bow_vct, open("bv_lc.p", "wb"))
pickle.dump(km, open("km_lc.p", "wb"))

In [133]:
pred = km.predict(law_df)

In [134]:
law_and_crime["class"] = pred

law_and_crime.to_pickle("lc_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  law_and_crime["class"] = pred


### NLP for Kids & Fiction

In [135]:
text = pd.DataFrame(kids_and_fiction["text"], columns=["text"])

In [136]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [137]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [138]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [139]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [140]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [141]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [142]:
kids_fiction_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
#kids_fiction_df.to_csv("kf_words.csv")

In [143]:
km = KMeans(n_clusters=5)

km.fit(kids_fiction_df)

KMeans(n_clusters=5)

In [134]:
pickle.dump(bow_vct, open("bv_kf.p", "wb"))
pickle.dump(km, open("km_kf.p", "wb"))

In [144]:
pred = km.predict(kids_fiction_df)

In [145]:
kids_and_fiction["class"] = pred

kids_and_fiction.to_pickle("kf_pred.p")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kids_and_fiction["class"] = pred


### NLP for Other

In [40]:
text = pd.DataFrame(other["text"], columns=["text"])

In [None]:
text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [None]:
text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)


In [None]:
text.to_csv("other_lemmatized.cvs")

In [81]:
text = pd.read_csv("other_lemmatized.cvs").drop(columns="Unnamed: 0")

  text = pd.read_csv("other_lemmatized.cvs").drop(columns="Unnamed: 0")


In [85]:
text
text.drop_duplicates(subset ="text",
                     keep = False, inplace = True)

In [86]:
text

Unnamed: 0,text,tokenized,lemmatized
0,Soldier Five : The Real Truth About The Bravo ...,"['soldier', 'five', 'the', 'real', 'truth', 'a...","['soldier', 'five', 'the', 'real', 'truth', 'a..."
2,The Hidden Power of F*cking Up The Try Guys de...,"['the', 'hidden', 'power', 'of', 'f', 'cking',...","['the', 'hidden', 'power', 'of', 'f', 'cking',..."
3,The Grand Design When and how did the universe...,"['the', 'grand', 'design', 'when', 'and', 'how...","['the', 'grand', 'design', 'when', 'and', 'how..."
4,"1,227 QI Facts To Blow Your Socks Off Did you ...","['qi', 'facts', 'to', 'blow', 'your', 'socks',...","['qi', 'fact', 'to', 'blow', 'your', 'sock', '..."
5,The 100 Most Pointless Arguments in the World ...,"['the', 'most', 'pointless', 'arguments', 'in'...","['the', 'most', 'pointless', 'argument', 'in',..."
...,...,...,...
1669925,"Backpacking California : Mountain, Foothill, C...","['backpacking', 'california', 'mountain', 'foo...","['backpacking', 'california', 'mountain', 'foo..."
1669926,"2020 : Diary, Weekly Planner, Organiser, Year ...","['diary', 'weekly', 'planner', 'organiser', 'y...","['diary', 'weekly', 'planner', 'organiser', 'y..."
1669927,Canoeing and Camping This special re-print edi...,"['canoeing', 'and', 'camping', 'this', 'specia...","['canoe', 'and', 'camp', 'this', 'special', 'e..."
1669928,The Complete Guide to Western Horsemanship (Cl...,"['the', 'complete', 'guide', 'to', 'western', ...","['the', 'complete', 'guide', 'to', 'western', ..."


In [40]:
def fix_lemmatized(row):
    fixed = row.strip("[]").replace("'", "").replace(",", "").split()
    return fixed

In [55]:
other.drop_duplicates(subset ="text",
                     keep = False, inplace = True)
other

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other.drop_duplicates(subset ="text",


Unnamed: 0,authors,categories,description,format,id,image-checksum,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url,text,Genre
0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,9781840189070,97c8e71f2ec114b34f243074d2091077,full/c/5/2/c529152ea1246c0cb17d6574d302eae6d2e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184018907X,9781840189070,en,2004-10-14,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,Soldier Five : The Real Truth About The Bravo ...,Other
4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,9780008352516,39c0422c00be90fb9258dd6df9068b1e,full/e/e/a/eea0ff6b6f1882d1862c711a2008e3c35d3...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,8352518,9780008352516,en,2019-06-18,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,The Hidden Power of F*cking Up The Try Guys de...,Other
5,"[10, 11]","[1520, 1532]",When and how did the universe begin? Why are w...,1.0,9780553819229,bed4d5ee9d3240303ff355e4e383cb47,full/0/a/1/0a1870d010b9ad272d5b20425b56e449eac...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,553819224,9780553819229,en,2015-03-18,The Grand Design,/Grand-Design-Leonard-Mlodinow/9780553819229,The Grand Design When and how did the universe...,Other
6,"[6, 7, 8]","[377, 2978, 2980]",Did you know that: cows moo in regional accent...,1.0,9780571297931,d04552f52c16c90db5511e4f76e78694,full/7/5/3/753715be3dcda52b44ac9b14bce4a5258ec...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,571297935,9780571297931,en,2016-05-05,"1,227 QI Facts To Blow Your Socks Off",/1-227-QI-Facts-Blow-Your-Socks-Off-John-Lloyd...,"1,227 QI Facts To Blow Your Socks Off Did you ...",Other
7,"[12, 13]",[2980],"We've all had them, those pointless arguments ...",1.0,9781444762082,4be783fe835efc596d95732878612663,full/9/2/a/92a8415be732a03c7e7c2fbe4085edfcec5...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1444762087,9781444762082,en,2014-05-08,The 100 Most Pointless Arguments in the World ...,/100-Most-Pointless-Arguments-World-Alexander-...,The 100 Most Pointless Arguments in the World ...,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109369,[483284],"[2770, 3092, 3100, 3101]",Prepare for a Lifetime of Awe-Inspiring Advent...,1.0,9780899979588,8bdeb6c67d511cf28f3bc43ed27d43de,full/6/2/e/62e74b7b263f06fe57f053521c7a958740e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,899979580,9780899979588,en,2020-11-12,"Backpacking California : Mountain, Foothill, C...",/Backpacking-California-Wilderness-Press/97808...,"Backpacking California : Mountain, Foothill, C...",Other
1109379,[336369],"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,1.0,9781711791968,d9f083c9c171c71b1df27f6dc2d66ba1,full/4/d/4/4d4766e368dc599bf375c320460042c36b8...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1711791962,9781711791968,en,2019-11-25,"2020 : Diary, Weekly Planner, Organiser, Year ...",/2020-Annie-Mac-Journals/9781711791968,"2020 : Diary, Weekly Planner, Organiser, Year ...",Other
1109380,"[29792, 654019]",[3096],This special re-print edition of James A. Crui...,1.0,9781792858017,98d28c0af6a2b740ba009490a30d17ed,full/9/e/7/9e7db64b6b3eb2626731745beb513c2a3db...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1792858019,9781792858017,en,2018-12-28,Canoeing and Camping,/Canoeing-Camping-James-Cruikshank/9781792858017,Canoeing and Camping This special re-print edi...,Other
1109381,[654020],"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,2.0,9780876059821,cafe5abe83e39958c776c1f7fb2d93e6,full/d/2/c/d2c178a62b1e98158abdb695678ae8b8238...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,876059825,9780876059821,en,1995-04-12,The Complete Guide to Western Horsemanship (Cl...,/Complete-Guide-Western-Horsemanship-Cloth-For...,The Complete Guide to Western Horsemanship (Cl...,Other


In [114]:
text["lemmatized"] = text["lemmatized"].astype('str').apply(fix_lemmatized)

In [115]:
text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [116]:
text.to_csv("other_nostopwords.cvs")

In [42]:
text = pd.read_csv("other_nostopwords.cvs").drop(columns="Unnamed: 0")

In [43]:
text["no_stopwords"] = text["no_stopwords"].astype('str').apply(fix_lemmatized)

In [37]:
len(text)

458663

In [38]:
len(other)

458663

In [44]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [45]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [46]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [47]:
other_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())

In [48]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)

km.fit(other_df.sample(n=100000))

KMeans(n_clusters=5)

In [49]:
pickle.dump(bow_vct, open("bv_o.p", "wb"))
pickle.dump(km, open("km_o.p", "wb"))

In [50]:
pred = km.predict(other_df)

In [None]:
other['class'] = pred



In [83]:
other.to_pickle("oth_pred2.p")

In [88]:
pd.read_csv("oth_pred2.csv", low_memory=True)

  pd.read_csv("oth_pred2.csv", low_memory=True)


Unnamed: 0.1,Unnamed: 0,authors,categories,description,format,id,image-checksum,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url,text,Genre,class
0,0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,9781840189070,97c8e71f2ec114b34f243074d2091077,full/c/5/2/c529152ea1246c0cb17d6574d302eae6d2e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184018907X,9781840189070,en,2004-10-14,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,Soldier Five : The Real Truth About The Bravo ...,Other,1.0
1,4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,9780008352516,39c0422c00be90fb9258dd6df9068b1e,full/e/e/a/eea0ff6b6f1882d1862c711a2008e3c35d3...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,8352518,9780008352516,en,2019-06-18,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,The Hidden Power of F*cking Up The Try Guys de...,Other,4.0
2,5,"[10, 11]","[1520, 1532]",When and how did the universe begin? Why are w...,1.0,9780553819229,bed4d5ee9d3240303ff355e4e383cb47,full/0/a/1/0a1870d010b9ad272d5b20425b56e449eac...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,553819224,9780553819229,en,2015-03-18,The Grand Design,/Grand-Design-Leonard-Mlodinow/9780553819229,The Grand Design When and how did the universe...,Other,4.0
3,6,"[6, 7, 8]","[377, 2978, 2980]",Did you know that: cows moo in regional accent...,1.0,9780571297931,d04552f52c16c90db5511e4f76e78694,full/7/5/3/753715be3dcda52b44ac9b14bce4a5258ec...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,571297935,9780571297931,en,2016-05-05,"1,227 QI Facts To Blow Your Socks Off",/1-227-QI-Facts-Blow-Your-Socks-Off-John-Lloyd...,"1,227 QI Facts To Blow Your Socks Off Did you ...",Other,4.0
4,7,"[12, 13]",[2980],"We've all had them, those pointless arguments ...",1.0,9781444762082,4be783fe835efc596d95732878612663,full/9/2/a/92a8415be732a03c7e7c2fbe4085edfcec5...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1444762087,9781444762082,en,2014-05-08,The 100 Most Pointless Arguments in the World ...,/100-Most-Pointless-Arguments-World-Alexander-...,The 100 Most Pointless Arguments in the World ...,Other,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253345,1109369,[483284],"[2770, 3092, 3100, 3101]",Prepare for a Lifetime of Awe-Inspiring Advent...,1.0,9780899979588,8bdeb6c67d511cf28f3bc43ed27d43de,full/6/2/e/62e74b7b263f06fe57f053521c7a958740e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,899979580,9780899979588,en,2020-11-12,"Backpacking California : Mountain, Foothill, C...",/Backpacking-California-Wilderness-Press/97808...,"Backpacking California : Mountain, Foothill, C...",Other,4.0
1253346,1109379,[336369],"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,1.0,9781711791968,d9f083c9c171c71b1df27f6dc2d66ba1,full/4/d/4/4d4766e368dc599bf375c320460042c36b8...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1711791962,9781711791968,en,2019-11-25,"2020 : Diary, Weekly Planner, Organiser, Year ...",/2020-Annie-Mac-Journals/9781711791968,"2020 : Diary, Weekly Planner, Organiser, Year ...",Other,4.0
1253347,1109380,"[29792, 654019]",[3096],This special re-print edition of James A. Crui...,1.0,9781792858017,98d28c0af6a2b740ba009490a30d17ed,full/9/e/7/9e7db64b6b3eb2626731745beb513c2a3db...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1792858019,9781792858017,en,2018-12-28,Canoeing and Camping,/Canoeing-Camping-James-Cruikshank/9781792858017,Canoeing and Camping This special re-print edi...,Other,4.0
1253348,1109381,[654020],"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,2.0,9780876059821,cafe5abe83e39958c776c1f7fb2d93e6,full/d/2/c/d2c178a62b1e98158abdb695678ae8b8238...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,876059825,9780876059821,en,1995-04-12,The Complete Guide to Western Horsemanship (Cl...,/Complete-Guide-Western-Horsemanship-Cloth-For...,The Complete Guide to Western Horsemanship (Cl...,Other,4.0


In [85]:
pd.read_pickle('oth_pred2.p').dtypes

authors                     object
categories                  object
description                 object
format                     float64
id                           int64
image-checksum              object
image-path                  object
image-url                   object
isbn10                      object
isbn13                       int64
lang                        object
publication-date    datetime64[ns]
title                       object
url                         object
text                        object
Genre                       object
class                        int32
dtype: object

In [51]:
o_pred = other.join(pd.DataFrame(pred, columns=["class"]))

o_pred.to_csv("oth_pred.csv")

### other


In [5]:

def re_blob(row):
    return " ".join(row["no_stopwords"])

In [7]:
def fix_lemmatized(row):
    fixed = row.strip("[]").replace("'", "").replace(",", "").split()
    return fixed

In [6]:
text = pd.read_csv("other_nostopwords.cvs").drop(columns="Unnamed: 0")

In [8]:
text["no_stopwords"] = text["no_stopwords"].astype('str').apply(fix_lemmatized)

In [10]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [11]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

CountVectorizer(max_features=10000)

In [12]:
X = bow_vct.transform(text["clean_blob"]).toarray()


In [14]:
other_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())

In [15]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)

km.fit(other_df.sample(n=100000))

KMeans(n_clusters=5)

In [16]:
pred = km.predict(other_df)

In [58]:
other['class'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other['class'] = pred


In [59]:
other

Unnamed: 0,authors,categories,description,format,id,image-checksum,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url,text,Genre,class
0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,9781840189070,97c8e71f2ec114b34f243074d2091077,full/c/5/2/c529152ea1246c0cb17d6574d302eae6d2e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184018907X,9781840189070,en,2004-10-14,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,Soldier Five : The Real Truth About The Bravo ...,Other,3
4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,9780008352516,39c0422c00be90fb9258dd6df9068b1e,full/e/e/a/eea0ff6b6f1882d1862c711a2008e3c35d3...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,8352518,9780008352516,en,2019-06-18,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,The Hidden Power of F*cking Up The Try Guys de...,Other,0
5,"[10, 11]","[1520, 1532]",When and how did the universe begin? Why are w...,1.0,9780553819229,bed4d5ee9d3240303ff355e4e383cb47,full/0/a/1/0a1870d010b9ad272d5b20425b56e449eac...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,553819224,9780553819229,en,2015-03-18,The Grand Design,/Grand-Design-Leonard-Mlodinow/9780553819229,The Grand Design When and how did the universe...,Other,0
6,"[6, 7, 8]","[377, 2978, 2980]",Did you know that: cows moo in regional accent...,1.0,9780571297931,d04552f52c16c90db5511e4f76e78694,full/7/5/3/753715be3dcda52b44ac9b14bce4a5258ec...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,571297935,9780571297931,en,2016-05-05,"1,227 QI Facts To Blow Your Socks Off",/1-227-QI-Facts-Blow-Your-Socks-Off-John-Lloyd...,"1,227 QI Facts To Blow Your Socks Off Did you ...",Other,0
7,"[12, 13]",[2980],"We've all had them, those pointless arguments ...",1.0,9781444762082,4be783fe835efc596d95732878612663,full/9/2/a/92a8415be732a03c7e7c2fbe4085edfcec5...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1444762087,9781444762082,en,2014-05-08,The 100 Most Pointless Arguments in the World ...,/100-Most-Pointless-Arguments-World-Alexander-...,The 100 Most Pointless Arguments in the World ...,Other,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109369,[483284],"[2770, 3092, 3100, 3101]",Prepare for a Lifetime of Awe-Inspiring Advent...,1.0,9780899979588,8bdeb6c67d511cf28f3bc43ed27d43de,full/6/2/e/62e74b7b263f06fe57f053521c7a958740e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,899979580,9780899979588,en,2020-11-12,"Backpacking California : Mountain, Foothill, C...",/Backpacking-California-Wilderness-Press/97808...,"Backpacking California : Mountain, Foothill, C...",Other,0
1109379,[336369],"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,1.0,9781711791968,d9f083c9c171c71b1df27f6dc2d66ba1,full/4/d/4/4d4766e368dc599bf375c320460042c36b8...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1711791962,9781711791968,en,2019-11-25,"2020 : Diary, Weekly Planner, Organiser, Year ...",/2020-Annie-Mac-Journals/9781711791968,"2020 : Diary, Weekly Planner, Organiser, Year ...",Other,0
1109380,"[29792, 654019]",[3096],This special re-print edition of James A. Crui...,1.0,9781792858017,98d28c0af6a2b740ba009490a30d17ed,full/9/e/7/9e7db64b6b3eb2626731745beb513c2a3db...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1792858019,9781792858017,en,2018-12-28,Canoeing and Camping,/Canoeing-Camping-James-Cruikshank/9781792858017,Canoeing and Camping This special re-print edi...,Other,0
1109381,[654020],"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,2.0,9780876059821,cafe5abe83e39958c776c1f7fb2d93e6,full/d/2/c/d2c178a62b1e98158abdb695678ae8b8238...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,876059825,9780876059821,en,1995-04-12,The Complete Guide to Western Horsemanship (Cl...,/Complete-Guide-Western-Horsemanship-Cloth-For...,The Complete Guide to Western Horsemanship (Cl...,Other,0


In [60]:
other.to_pickle("oth_pred.p")

In [61]:
pd.read_pickle("oth_pred.p")

Unnamed: 0,authors,categories,description,format,id,image-checksum,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url,text,Genre,class
0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,9781840189070,97c8e71f2ec114b34f243074d2091077,full/c/5/2/c529152ea1246c0cb17d6574d302eae6d2e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,184018907X,9781840189070,en,2004-10-14,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,Soldier Five : The Real Truth About The Bravo ...,Other,3
4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,9780008352516,39c0422c00be90fb9258dd6df9068b1e,full/e/e/a/eea0ff6b6f1882d1862c711a2008e3c35d3...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,8352518,9780008352516,en,2019-06-18,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,The Hidden Power of F*cking Up The Try Guys de...,Other,0
5,"[10, 11]","[1520, 1532]",When and how did the universe begin? Why are w...,1.0,9780553819229,bed4d5ee9d3240303ff355e4e383cb47,full/0/a/1/0a1870d010b9ad272d5b20425b56e449eac...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,553819224,9780553819229,en,2015-03-18,The Grand Design,/Grand-Design-Leonard-Mlodinow/9780553819229,The Grand Design When and how did the universe...,Other,0
6,"[6, 7, 8]","[377, 2978, 2980]",Did you know that: cows moo in regional accent...,1.0,9780571297931,d04552f52c16c90db5511e4f76e78694,full/7/5/3/753715be3dcda52b44ac9b14bce4a5258ec...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,571297935,9780571297931,en,2016-05-05,"1,227 QI Facts To Blow Your Socks Off",/1-227-QI-Facts-Blow-Your-Socks-Off-John-Lloyd...,"1,227 QI Facts To Blow Your Socks Off Did you ...",Other,0
7,"[12, 13]",[2980],"We've all had them, those pointless arguments ...",1.0,9781444762082,4be783fe835efc596d95732878612663,full/9/2/a/92a8415be732a03c7e7c2fbe4085edfcec5...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1444762087,9781444762082,en,2014-05-08,The 100 Most Pointless Arguments in the World ...,/100-Most-Pointless-Arguments-World-Alexander-...,The 100 Most Pointless Arguments in the World ...,Other,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109369,[483284],"[2770, 3092, 3100, 3101]",Prepare for a Lifetime of Awe-Inspiring Advent...,1.0,9780899979588,8bdeb6c67d511cf28f3bc43ed27d43de,full/6/2/e/62e74b7b263f06fe57f053521c7a958740e...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,899979580,9780899979588,en,2020-11-12,"Backpacking California : Mountain, Foothill, C...",/Backpacking-California-Wilderness-Press/97808...,"Backpacking California : Mountain, Foothill, C...",Other,0
1109379,[336369],"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,1.0,9781711791968,d9f083c9c171c71b1df27f6dc2d66ba1,full/4/d/4/4d4766e368dc599bf375c320460042c36b8...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1711791962,9781711791968,en,2019-11-25,"2020 : Diary, Weekly Planner, Organiser, Year ...",/2020-Annie-Mac-Journals/9781711791968,"2020 : Diary, Weekly Planner, Organiser, Year ...",Other,0
1109380,"[29792, 654019]",[3096],This special re-print edition of James A. Crui...,1.0,9781792858017,98d28c0af6a2b740ba009490a30d17ed,full/9/e/7/9e7db64b6b3eb2626731745beb513c2a3db...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1792858019,9781792858017,en,2018-12-28,Canoeing and Camping,/Canoeing-Camping-James-Cruikshank/9781792858017,Canoeing and Camping This special re-print edi...,Other,0
1109381,[654020],"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,2.0,9780876059821,cafe5abe83e39958c776c1f7fb2d93e6,full/d/2/c/d2c178a62b1e98158abdb695678ae8b8238...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,876059825,9780876059821,en,1995-04-12,The Complete Guide to Western Horsemanship (Cl...,/Complete-Guide-Western-Horsemanship-Cloth-For...,The Complete Guide to Western Horsemanship (Cl...,Other,0


### Social Science and Teaching

In [None]:
text = pd.read_csv("other_nostopwords.cvs").drop(columns="Unnamed: 0")

In [None]:
text["no_stopwords"] = text["no_stopwords"].astype('str').apply(fix_lemmatized)


In [None]:
text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [None]:
bow_vct = CountVectorizer(max_features=10000)

bow_vct.fit(text["clean_blob"])

In [None]:
X = bow_vct.transform(text["clean_blob"]).toarray()