In [3]:
import pandas as pd

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [4]:
genre = pd.read_csv("categories.csv")

In [5]:
genre

Unnamed: 0,category_id,category_name
0,1998,.Net Programming
1,176,20th Century & Contemporary Classical Music
2,3291,20th Century & Contemporary Classical Music
3,2659,20th Century History: C 1900 To C 2000
4,2661,21st Century History: From C 2000 -
...,...,...
2770,1634,Zoology: Invertebrates
2771,1644,Zoology: Mammals
2772,1639,Zoology: Vertebrates
2773,3007,Zoos & Wildlife Parks


In [6]:
genre.isna().sum()

category_id      0
category_name    0
dtype: int64

### NLP on category_name

In [8]:
text = pd.DataFrame(genre["category_name"], columns=["category_name"])
text

Unnamed: 0,category_name
0,.Net Programming
1,20th Century & Contemporary Classical Music
2,20th Century & Contemporary Classical Music
3,20th Century History: C 1900 To C 2000
4,21st Century History: From C 2000 -
...,...
2770,Zoology: Invertebrates
2771,Zoology: Mammals
2772,Zoology: Vertebrates
2773,Zoos & Wildlife Parks


In [11]:
# tokenizer, remove punctuation, lower case

def tokenizer_and_remove_punctuation(row):

    tokens = word_tokenize(row["category_name"])

    return [token.lower() for token in tokens]

text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [13]:
def get_wordnet_pos(token):

    tag = nltk.pos_tag([token], lang="eng")[0][1][0].upper()
    tag_dict = {"N": wordnet.NOUN,
                "J": wordnet.ADJ,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [14]:
# lemmatize with part of speach tags

lm = WordNetLemmatizer()

def lemmatizer_with_pos(row):

    return [lm.lemmatize(token, get_wordnet_pos(token)) for token in row["tokenized"]]


text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [16]:
# remove stopwords

def remove_sw(row):
    return list(set(row["lemmatized"]).difference(stopwords.words()))

text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [18]:
# put all together into a cave man language

def re_blob(row):
    return " ".join(row["no_stopwords"])

text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [20]:
# create our bag of words model to then be able to apply ML algorithms

bow_vct = CountVectorizer()

# train it

bow_vct.fit(text["clean_blob"])

CountVectorizer()

In [21]:
text.head()

Unnamed: 0,category_name,tokenized,lemmatized,no_stopwords,clean_blob
0,.Net Programming,"[.net, programming]","[.net, program]","[.net, program]",.net program
1,20th Century & Contemporary Classical Music,"[20th, century, &, contemporary, classical, mu...","[20th, century, &, contemporary, classical, mu...","[&, music, classical, contemporary, 20th, cent...",& music classical contemporary 20th century
2,20th Century & Contemporary Classical Music,"[20th, century, &, contemporary, classical, mu...","[20th, century, &, contemporary, classical, mu...","[&, music, classical, contemporary, 20th, cent...",& music classical contemporary 20th century
3,20th Century History: C 1900 To C 2000,"[20th, century, history, :, c, 1900, to, c, 2000]","[20th, century, history, :, c, 1900, to, c, 2000]","[1900, :, century, history, 2000, 20th]",1900 : century history 2000 20th
4,21st Century History: From C 2000 -,"[21st, century, history, :, from, c, 2000, -]","[21st, century, history, :, from, c, 2000, -]","[21st, :, -, century, history, 2000]",21st : - century history 2000


In [22]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [25]:
words_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
words_df.head()

Unnamed: 0,1000,1400,1450,1500,1600,1700,1750,1800,1830,1900,...,www,xenobiotics,xp,ya,yearbook,zen,zone,zoo,zoology,zoroastrianism
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# from sklearn.cluster import KMeans

# km = KMeans(n_clusters=10)

# # train the model

# km.fit(words_df)

KMeans(n_clusters=10)

In [42]:
# pred = km.predict(words_df)

# pd.DataFrame(pred).to_csv('pred_categories.csv')

In [43]:
# predict_df = pd.concat([genre, pd.DataFrame(pred, columns=["class"])], axis=1)

# predict_df.to_csv("predicted_genre.csv")

In [44]:
predict_df["class"].value_counts()

6    2296
1     126
2      84
5      58
7      54
3      43
0      40
4      33
9      27
8      14
Name: class, dtype: int64

In [45]:
# explore category 0 == MEDICINE

predict_df[predict_df["class"]==0]

Unnamed: 0,category_id,category_name,class
11,1379,Accident & Emergency Medicine,0
214,1391,Aviation & Space Medicine,0
349,1339,Cardiovascular Medicine,0
411,2838,Chinese Medicine & Acupuncture,0
475,1324,Clinical & Internal Medicine,0
517,1456,Complementary Medicine,0
518,1475,Complementary Medicine For Animals,0
730,1392,Diving & Hyperbaric Medicine,0
888,1390,Environmental Medicine,0
897,1461,Equine Veterinary Medicine,0


In [46]:
# explore category 1 == LAW

predict_df[predict_df["class"]==1]

Unnamed: 0,category_id,category_name,class
14,1229,Accounting Law,1
39,1213,"Advertising, Marketing & Sponsorship Law",1
55,1172,Agency Law,1
59,1215,Agricultural Law,1
105,1216,Animal Law,1
...,...,...,...
2544,1200,Terrorism Law,1
2597,1241,Trademarks Law,1
2604,1136,Transnational Commercial Law,1
2607,1219,Transport Law,1


In [47]:
# explore category 2 == ????

predict_df[predict_df["class"]==2]

Unnamed: 0,category_id,category_name,class
21,2461,Activity Books,2
33,342,Adventure Books,2
34,2491,Adventure Books for Kids,2
124,2904,"Antiques & Collectables: Books, Manuscripts, E...",2
160,3,Art Books,2
...,...,...,...
2610,2967,Transportation Books,2
2611,2536,Transportation Books for Children,2
2632,234,True Story Books,2
2757,2687,World War 1 Books,2


In [48]:
# explore category 3 == ENGINEERING

predict_df[predict_df["class"]==3]

Unnamed: 0,category_id,category_name,class
17,1856,Acoustic & Sound Engineering,3
58,1878,Agricultural Engineering & Machinery,3
208,229,"Autobiography: Science, Technology & Engineering",3
210,1799,Automatic Control Engineering,3
262,1721,Biochemical Engineering,3
275,228,"Biography: Science, Technology & Engineering",3
280,1446,Biomedical Engineering,3
385,1727,Chemical Engineering,3
454,1810,"Civil Engineering, Surveying & Building",3
506,1802,Communications Engineering / Telecommunications,3


In [49]:
# explore category 4 == Medical

predict_df[predict_df["class"]==4]

Unnamed: 0,category_id,category_name,class
222,1614,Bacteriology (non-medical),4
895,1297,Epidemiology & Medical Statistics,4
1095,1600,Genetics (non-medical),4
1559,1279,Medical,4
1561,1301,Medical Administration & Management,4
1562,1650,Medical Anthropology,4
1563,1455,"Medical Charts, Colour Atlases",4
1564,1443,Medical Counselling,4
1565,1325,Medical Diagnosis,4
1566,1284,Medical Equipment & Techniques,4


In [50]:
# explore category 5 == SOCIAL SCIENCE

predict_df[predict_df["class"]==5]

Unnamed: 0,category_id,category_name,class
16,963,Accounting: Study & Revision Guides,5
232,976,Banking & Finance: Study & Revision Guides,5
250,3156,Bible Studies: For Individual Or Small Group S...,5
256,3152,Biblical Studies & Exegesis,5
289,692,Black & Asian Studies,5
324,979,Business & Management: Study & Revision Guides,5
333,2585,Business Studies,5
334,978,Business Studies: General,5
405,307,Children's Literature Studies: General,5
504,409,Communication Studies,5


In [51]:
# explore category 6 == ?????

predict_df[predict_df["class"]==6]

Unnamed: 0,category_id,category_name,class
0,1998,.Net Programming,6
1,176,20th Century & Contemporary Classical Music,6
2,3291,20th Century & Contemporary Classical Music,6
3,2659,20th Century History: C 1900 To C 2000,6
4,2661,21st Century History: From C 2000 -,6
...,...,...,...
2770,1634,Zoology: Invertebrates,6
2771,1644,Zoology: Mammals,6
2772,1639,Zoology: Vertebrates,6
2773,3007,Zoos & Wildlife Parks,6


In [52]:
# explore category 7 == INDUSTRY

predict_df[predict_df["class"]==7]

Unnamed: 0,category_id,category_name,class
38,1089,Advertising Industry,7
46,1063,Aerospace & Air Transport Industries,7
62,1035,Agriculture & Related Industries,7
77,1045,Alternative & Renewable Energy Industries,7
151,1054,Armaments Industries,7
202,217,Autobiography: Business & Industry,7
216,1058,Aviation Manufacturing Industry,7
269,216,Biography: Business & Industry,7
284,1053,Biotechnology Industries,7
386,1048,Chemical Industries,7


In [53]:
# explore category 8 == CRIME

predict_df[predict_df["class"]==8]

Unnamed: 0,category_id,category_name,class
365,737,Causes & Prevention Of Crime,8
596,742,Corporate Crime,8
616,336,Crime,8
617,736,Crime & Criminology,8
618,2617,Crime Fiction,8
1195,338,Historical Crime,8
1196,2619,Historical Crime,8
1820,743,Organized Crime,8
2420,741,Street Crime / Gun Crime,8
2627,235,True Crime Biographies,8


In [54]:
# explore category 9 == BUSINESS & ECONOMICS

predict_df[predict_df["class"]==9]

Unnamed: 0,category_id,category_name,class
323,977,Business & Management,9
325,1917,Business Applications,9
326,1005,Business Communication & Presentation,9
327,983,Business Competition,9
328,984,Business Ethics,9
329,981,Business Innovation,9
330,1006,Business Mathematics & Systems,9
331,1004,Business Negotiation,9
332,980,Business Strategy,9
335,3102,Business Travel,9


### NLP for categoreies 2 and 6

In [61]:
genre2 = predict_df[(predict_df["class"]==2) | (predict_df["class"]==6)]

In [63]:
genre2["class"].value_counts()

6    2296
2      84
Name: class, dtype: int64

In [64]:
text = pd.DataFrame(genre2["category_name"], columns=["category_name"])
text

Unnamed: 0,category_name
0,.Net Programming
1,20th Century & Contemporary Classical Music
2,20th Century & Contemporary Classical Music
3,20th Century History: C 1900 To C 2000
4,21st Century History: From C 2000 -
...,...
2770,Zoology: Invertebrates
2771,Zoology: Mammals
2772,Zoology: Vertebrates
2773,Zoos & Wildlife Parks


In [65]:
def tokenizer_and_remove_punctuation(row):

    tokens = word_tokenize(row["category_name"])

    return [token.lower() for token in tokens]

text["tokenized"] =  text.apply(tokenizer_and_remove_punctuation, axis=1)

In [67]:
def get_wordnet_pos(token):

    tag = nltk.pos_tag([token], lang="eng")[0][1][0].upper()
    tag_dict = {"N": wordnet.NOUN,
                "J": wordnet.ADJ,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [68]:
lm = WordNetLemmatizer()

def lemmatizer_with_pos(row):

    return [lm.lemmatize(token, get_wordnet_pos(token)) for token in row["tokenized"]]


text["lemmatized"] = text.apply(lemmatizer_with_pos, axis=1)

In [69]:

def remove_sw(row):
    return list(set(row["lemmatized"]).difference(stopwords.words()))

text["no_stopwords"] = text.apply(remove_sw, axis=1)

In [71]:

def re_blob(row):
    return " ".join(row["no_stopwords"])

text["clean_blob"] =  text.apply(re_blob, axis = 1)

In [73]:
bow_vct = CountVectorizer()

# train it

bow_vct.fit(text["clean_blob"])

CountVectorizer()

In [74]:
X = bow_vct.transform(text["clean_blob"]).toarray()

In [75]:
words2_df = pd.DataFrame(X, columns=bow_vct.get_feature_names_out())
words2_df.head()

Unnamed: 0,1000,1400,1450,1500,1600,1700,1750,1800,1830,1900,...,ww2,xenobiotics,xp,ya,yearbook,zen,zone,zoo,zoology,zoroastrianism
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# from sklearn.cluster import KMeans

# km = KMeans(n_clusters=10)

# # train the model

# km.fit(words2_df)

KMeans(n_clusters=10)

In [84]:
# pred2 = km.predict(words2_df)

#pd.DataFrame(pred2).to_csv('pred_categories2.csv')

In [89]:
len(pred2)

2380

In [113]:
genre2 = genre2.reset_index()

In [114]:
# predict2_df = pd.concat([genre2, pd.DataFrame(pred2, columns=["class2"])], axis=1) #ignore_index=True

#predict2_df.to_csv("predicted_genre2.csv")

In [116]:
predict2_df["class2"].value_counts()

1    2068
2      70
7      62
4      58
6      32
3      23
8      23
0      16
9      16
5      12
Name: class2, dtype: int64

In [118]:
# explore category  0 == Kids

predict2_df[predict2_df["class2"]==0]

Unnamed: 0,index,category_id,category_name,class,class2
30,34,2491,Adventure Books for Kids,2,0
602,699,2590,Dictionaries for Kids,6,0
816,954,2496,Fantasy Books for Kids,2,0
903,1052,2501,Funny Books for Kids,2,0
1019,1185,2575,Hindu Books for Kids,2,0
1036,1205,2564,History Books for Kids,2,0
1166,1378,2574,Islamic Books for Kids,2,0
1194,1411,2579,Kids’ Chemistry Books,2,0
1195,1412,2573,Kids’ Jewish Books,2,0
1402,1685,2555,Musical Story & Educational Books For Kids,2,0


In [120]:
# explore category 1 == Other

predict2_df[predict2_df["class2"]==1]

Unnamed: 0,index,category_id,category_name,class,class2
3,3,2659,20th Century History: C 1900 To C 2000,6,1
4,4,2661,21st Century History: From C 2000 -,6,1
6,6,1943,3D Graphics & Modelling,6,1
8,8,2472,ABC,6,1
9,9,768,Abnormal Psychology,6,1
...,...,...,...,...,...
2375,2770,1634,Zoology: Invertebrates,6,1
2376,2771,1644,Zoology: Mammals,6,1
2377,2772,1639,Zoology: Vertebrates,6,1
2378,2773,3007,Zoos & Wildlife Parks,6,1


In [121]:
# explore category  2 == Other

predict2_df[predict2_df["class2"]==2]

Unnamed: 0,index,category_id,category_name,class,class2
17,21,2461,Activity Books,2,2
29,33,342,Adventure Books,2,2
111,124,2904,"Antiques & Collectables: Books, Manuscripts, E...",2,2
146,160,3,Art Books,2,2
199,220,2458,Baby Books,2,2
...,...,...,...,...,...
2231,2610,2967,Transportation Books,2,2
2232,2611,2536,Transportation Books for Children,2,2
2249,2632,234,True Story Books,2,2
2362,2757,2687,World War 1 Books,2,2


In [122]:
# explore category  3 == Programming

predict2_df[predict2_df["class2"]==3]

Unnamed: 0,index,category_id,category_name,class,class2
0,0,1998,.Net Programming,6,3
5,5,1992,2D Graphics: Games Programming,6,3
7,7,1993,3D Graphics: Games Programming,6,3
48,56,1987,Agile Programming,6,3
166,180,1988,Aspect Programming / AOP,6,3
462,535,1983,Computer Programming / Software Development,6,3
570,663,2003,Database Programming,6,3
804,937,1989,Extreme Programming,6,3
899,1048,1990,Functional Programming,6,3
910,1059,1991,Games Development & Programming,6,3


In [123]:
# explore category  4 == Music

predict2_df[predict2_df["class2"]==4]

Unnamed: 0,index,category_id,category_name,class,class2
1,1,176,20th Century & Contemporary Classical Music,6,4
2,2,3291,20th Century & Contemporary Classical Music,6,4
69,81,197,Ambient & New Age Music,6,4
70,82,3312,Ambient & New Age Music,6,4
212,236,173,Baroque Music (c 1600 To C 1750),6,4
213,237,3288,Baroque Music (c 1600 To C 1750),6,4
272,304,182,"Brass Band, Military Music & Marches",6,4
273,305,3297,"Brass Band, Military Music & Marches",6,4
359,414,177,Choral Music,6,4
360,415,3292,Choral Music,6,4


In [124]:
# explore category 5 == Strategy

predict2_df[predict2_df["class2"]==5]

Unnamed: 0,index,category_id,category_name,class,class2
287,319,793,Bullying & Anti-bullying Strategies,6,5
288,320,3338,Bullying & Anti-bullying Strategies,6,5
582,676,902,"Defence Strategy, Planning & Research",6,5
716,825,787,Educational Strategies & Policy,6,5
717,826,3332,Educational Strategies & Policy,6,5
911,1060,1916,Games Strategy Guides,6,5
1251,1481,788,Literacy Strategies,6,5
1252,1482,3333,Literacy Strategies,6,5
1470,1759,789,Numeracy Strategies,6,5
1471,1760,3334,Numeracy Strategies,6,5


In [125]:
# explore category  6 == Teaching

predict2_df[predict2_df["class2"]==6]

Unnamed: 0,index,category_id,category_name,class,class2
669,775,310,ELT: Teaching Theory & Methods,6,6
1217,1437,271,Language Teaching & Learning (other Than ELT),6,6
1218,1438,273,Language Teaching & Learning Material & Course...,6,6
1219,1439,272,Language Teaching Theory & Methods,6,6
2125,2492,834,Teaching Of A Specific Subject,6,6
2126,2493,3379,Teaching Of A Specific Subject,6,6
2127,2494,829,Teaching Of Autistic Students,6,6
2128,2495,3374,Teaching Of Autistic Students,6,6
2129,2496,828,Teaching Of Dyslexic Students,6,6
2130,2497,3373,Teaching Of Dyslexic Students,6,6


In [126]:
# explore category 7 == Arts

predict2_df[predict2_df["class2"]==7]

Unnamed: 0,index,category_id,category_name,class,class2
99,112,76,"Animals & Nature In Art (still Life, Landscape...",6,7
142,156,2509,Art,6,7
143,157,2554,Art & Design,6,7
144,158,41,Art & Design Styles: Postmodernism,6,7
145,159,2,Art & Photography,6,7
...,...,...,...,...,...
2072,2437,26,Styles: Art & Craft,6,7
2073,2438,33,Styles: Art Deco,6,7
2074,2439,28,Styles: Art Nouveau,6,7
2077,2442,40,Styles: Conceptual Art,6,7


In [127]:
# explore category 8 == Fiction

predict2_df[predict2_df["class2"]==8]

Unnamed: 0,index,category_id,category_name,class,class2
344,397,2488,Children's Fiction,6,8
405,465,353,Classic Science Fiction,6,8
406,466,2627,Classic Science Fiction,6,8
488,568,334,Contemporary Fiction,6,8
656,760,318,ELT Literature & Fiction Readers,6,8
773,903,356,Erotic Fiction,6,8
830,969,373,Fiction Companions,6,8
831,970,371,Fiction In Translation,6,8
832,971,2561,Fiction Texts,6,8
833,972,372,Fiction-related Items,6,8


In [128]:
# explore category 9 == Trade

predict2_df[predict2_df["class2"]==9]

Unnamed: 0,index,category_id,category_name,class,class2
140,154,921,Arms Trade,6,9
195,213,1843,Automotive Technology & Trades,6,9
284,316,1828,Building Skills & Trades,6,9
635,737,934,Domestic Trade,6,9
636,738,2748,Domestic Trade,6,9
646,748,740,Drugs Trade / Drug Trafficking,6,9
1064,1236,1872,Hotel & Catering Trades,6,9
1288,1530,1851,Maritime / Nautical Trades,6,9
1541,1838,1871,Other Vocational Technologies & Trades,6,9
1784,2112,1847,Railway Trades,6,9


In [129]:
predict2_df

Unnamed: 0,index,category_id,category_name,class,class2
0,0,1998,.Net Programming,6,3
1,1,176,20th Century & Contemporary Classical Music,6,4
2,2,3291,20th Century & Contemporary Classical Music,6,4
3,3,2659,20th Century History: C 1900 To C 2000,6,1
4,4,2661,21st Century History: From C 2000 -,6,1
...,...,...,...,...,...
2375,2770,1634,Zoology: Invertebrates,6,1
2376,2771,1644,Zoology: Mammals,6,1
2377,2772,1639,Zoology: Vertebrates,6,1
2378,2773,3007,Zoos & Wildlife Parks,6,1


In [130]:
predict_df

Unnamed: 0,category_id,category_name,class
0,1998,.Net Programming,6
1,176,20th Century & Contemporary Classical Music,6
2,3291,20th Century & Contemporary Classical Music,6
3,2659,20th Century History: C 1900 To C 2000,6
4,2661,21st Century History: From C 2000 -,6
...,...,...,...
2770,1634,Zoology: Invertebrates,6
2771,1644,Zoology: Mammals,6
2772,1639,Zoology: Vertebrates,6
2773,3007,Zoos & Wildlife Parks,6


In [144]:
genre_final = predict_df.merge(predict2_df,how='left', left_on='category_id', right_on='category_id')

In [145]:
genre_final.head()

Unnamed: 0,category_id,category_name_x,class_x,index,category_name_y,class_y,class2
0,1998,.Net Programming,6,0.0,.Net Programming,6.0,3.0
1,176,20th Century & Contemporary Classical Music,6,1.0,20th Century & Contemporary Classical Music,6.0,4.0
2,3291,20th Century & Contemporary Classical Music,6,2.0,20th Century & Contemporary Classical Music,6.0,4.0
3,2659,20th Century History: C 1900 To C 2000,6,3.0,20th Century History: C 1900 To C 2000,6.0,1.0
4,2661,21st Century History: From C 2000 -,6,4.0,21st Century History: From C 2000 -,6.0,1.0


In [148]:
drop_columns = ["category_name_y", "class_y", "index"]

genre_final = genre_final.drop(columns=drop_columns)


In [149]:
genre_final.rename(columns={"category_name_x":"category_name", "class_x": "class"}, inplace=True)

In [150]:
genre_final.isna().sum()

category_id        0
category_name      0
class              0
class2           395
dtype: int64

In [151]:
genre_final

Unnamed: 0,category_id,category_name,class,class2
0,1998,.Net Programming,6,3.0
1,176,20th Century & Contemporary Classical Music,6,4.0
2,3291,20th Century & Contemporary Classical Music,6,4.0
3,2659,20th Century History: C 1900 To C 2000,6,1.0
4,2661,21st Century History: From C 2000 -,6,1.0
...,...,...,...,...
2770,1634,Zoology: Invertebrates,6,1.0
2771,1644,Zoology: Mammals,6,1.0
2772,1639,Zoology: Vertebrates,6,1.0
2773,3007,Zoos & Wildlife Parks,6,1.0


In [153]:
#conditions = [
    # (df['likes_count'] <= 2),
    # (df['likes_count'] > 2) & (df['likes_count'] <= 9),
    # (df['likes_count'] > 9) & (df['likes_count'] <= 15),
    # (df['likes_count'] > 15)
    # ]

In [169]:
genre_final.loc[genre_final["class"]==0,"genre"] = "Medicine"
genre_final.loc[genre_final["class"]==4,"genre"] = "Medicine"
genre_final.loc[genre_final["class"]==1,"genre"] = "Law & Crime"
genre_final.loc[genre_final["class"]==8,"genre"] = "Law & Crime"
genre_final.loc[genre_final["class"]==3,"genre"] = "Engineering & Programming"
genre_final.loc[genre_final["class2"]==3, "genre"] = "Engineering & Programming"
genre_final.loc[genre_final["class"]==5,"genre"] = "Social Science and Teaching"
genre_final.loc[genre_final["class2"]==6,"genre"] = "Social Science and Teaching"
genre_final.loc[genre_final["class"]==7,"genre"] = "Business, Economics & Industry"
genre_final.loc[genre_final["class"]==9,"genre"] = "Business, Economics & Industry"
genre_final.loc[genre_final["class2"]==9,"genre"] = "Business, Economics & Industry"
genre_final.loc[genre_final["class2"]==5,"genre"] = "Business, Economics & Industry"
genre_final.loc[genre_final["class2"]==4,"genre"] = "Arts & Music"
genre_final.loc[genre_final["class2"]==7,"genre"] = "Arts & Music"
genre_final.loc[genre_final["class2"]==2,"genre"] = "Other"
genre_final.loc[genre_final["class2"]==1,"genre"] = "Other"
genre_final.loc[genre_final["class2"]==0,"genre"] = "Kids & Fiction"
genre_final.loc[genre_final["class2"]==0,"genre"] = "Kids & Fiction"


In [170]:
genre_final

Unnamed: 0,category_id,category_name,class,class2,genre
0,1998,.Net Programming,6,3.0,Engineering & Programming
1,176,20th Century & Contemporary Classical Music,6,4.0,Arts & Music
2,3291,20th Century & Contemporary Classical Music,6,4.0,Arts & Music
3,2659,20th Century History: C 1900 To C 2000,6,1.0,Other
4,2661,21st Century History: From C 2000 -,6,1.0,Other
...,...,...,...,...,...
2770,1634,Zoology: Invertebrates,6,1.0,Other
2771,1644,Zoology: Mammals,6,1.0,Other
2772,1639,Zoology: Vertebrates,6,1.0,Other
2773,3007,Zoos & Wildlife Parks,6,1.0,Other


In [171]:
genre_final.to_csv("genres.csv")