In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

**Importing Data**

In [2]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')
train.shape

(6237, 9)

In [3]:
test.shape

(1560, 8)

In [4]:
train.BookCategory.value_counts()

Action & Adventure                      818
Crime, Thriller & Mystery               723
Biographies, Diaries & True Accounts    596
Language, Linguistics & Writing         594
Comics & Mangas                         583
Romance                                 560
Humour                                  540
Arts, Film & Photography                517
Computing, Internet & Digital Media     510
Sports                                  471
Politics                                325
Name: BookCategory, dtype: int64

In [5]:
train.Author.sort_values()

2146       0, Butterfield, Ngondi, Kerr
3743    0, Jonathan Law, Richard Rennie
986                     0, Kerr, Wright
2075                     0, Rennie, Law
3296                          0, Speake
                     ...               
4724                Zygmunt Miloszewski
2850                              dodie
747                            r.h. Sin
5083                renu and neena kaul
5662                       sister Jesme
Name: Author, Length: 6237, dtype: object

In [6]:
train.Reviews[1].split()[0]

'3.9'

In [7]:
train.head()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93
2,Leviathan (Penguin Classics),Thomas Hobbes,"Paperback,– 25 Feb 1982",4.8 out of 5 stars,6 customer reviews,"""During the time men live without a common Pow...",International Relations,Humour,299.0
3,A Pocket Full of Rye (Miss Marple),Agatha Christie,"Paperback,– 5 Oct 2017",4.1 out of 5 stars,13 customer reviews,A handful of grain is found in the pocket of a...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",180.0
4,LIFE 70 Years of Extraordinary Photography,Editors of Life,"Hardcover,– 10 Oct 2006",5.0 out of 5 stars,1 customer review,"For seven decades, ""Life"" has been thrilling t...",Photography Textbooks,"Arts, Film & Photography",965.62


**Combining Dataset(Train + Test)** - _for cleaning and feature engineering_

In [8]:
train.Price.isnull().sum()

0

In [9]:
combined = pd.concat([train, test], sort=False)
combined.reset_index(drop=True, inplace=True)
combined.tail()

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
7792,100 Things Every Designer Needs to Know About ...,Susan Weinschenk,"Paperback,– 14 Apr 2011",5.0 out of 5 stars,4 customer reviews,We design to elicit responses from people. We ...,Design,"Computing, Internet & Digital Media",
7793,"Modern Letter Writing Course: Personal, Busine...",ARUN SAGAR,"Paperback,– 8 May 2013",3.6 out of 5 stars,13 customer reviews,"A 30-day course to write simple, sharp and att...",Children's Reference (Books),"Biographies, Diaries & True Accounts",
7794,The Kite Runner Graphic Novel,Khaled Hosseini,"Paperback,– 6 Sep 2011",4.0 out of 5 stars,5 customer reviews,The perennial bestseller-now available as a se...,Humour (Books),Humour,
7795,Panzer Leader (Penguin World War II Collection),Heinz Guderian,"Paperback,– 22 Sep 2009",3.5 out of 5 stars,3 customer reviews,Heinz Guderian - master of the Blitzkrieg and ...,United States History,"Biographies, Diaries & True Accounts",
7796,Complete Spanish Step-by-Step,Barbara Bregstein,"Paperback,– 16 Sep 2016",4.5 out of 5 stars,2 customer reviews,Learn Spanish with the most convenient and eff...,Dictionaries,"Language, Linguistics & Writing",


In [10]:
combined.Author.value_counts().count()

4372

In [11]:
pd.DataFrame(combined.Author.value_counts()).sort_index()

Unnamed: 0,Author
"0, Butterfield, Ngondi, Kerr",1
"0, Jonathan Law",1
"0, Jonathan Law, Richard Rennie",1
"0, Kerr, Wright",1
"0, Rennie, Law",1
...,...
Zygmunt Miloszewski,1
dodie,1
r.h. Sin,2
renu and neena kaul,1


In [12]:
from collections import Counter
Counter(combined.Author).most_common()

[('Agatha Christie', 87),
 ('Ladybird', 63),
 ('DK', 61),
 ('Albert Uderzo', 48),
 ('Herge', 40),
 ('Nora Roberts', 40),
 ('James Patterson', 39),
 ('Bill Watterson', 36),
 ('John Grisham', 34),
 ('Sidney Sheldon', 33),
 ('P.G. Wodehouse', 33),
 ('Clive Cussler', 30),
 ('Sophie Kinsella', 27),
 ('David Baldacci', 26),
 ('Wilbur Smith', 26),
 ('Stephen King', 26),
 ('Danielle Steel', 25),
 ('Lee Child', 25),
 ('George R.R. Martin', 23),
 ('Jeffrey Archer', 22),
 ("Louis L'Amour", 21),
 ('Frederick Forsyth', 21),
 ('Oliver Bowden', 20),
 ('Dreamland Publications', 20),
 ('Michael Crichton', 19),
 ('Matthew Reilly', 19),
 ('Akira Toriyama', 19),
 ('Ruskin Bond', 18),
 ('Geronimo Stilton', 17),
 ('René Goscinny, Albert Uderzo', 17),
 ('Neil Gaiman', 17),
 ('Various', 16),
 ('Robert Ludlum', 16),
 ('Alistair MacLean', 15),
 ('Dan Brown', 15),
 ('Oxford Dictionaries', 15),
 ('James Rollins', 15),
 ('Ken Follett', 15),
 ('Daniel Silva', 14),
 ('Haruki Murakami', 14),
 ('Trinity College Lond',

In [13]:
combined.shape

(7797, 9)

# Feature Cleaning & Extraction

In [14]:
combined['Title'] = combined['Title'].str.lower()


**Splitting Edition** - *to Edition Binding type and other feature*

In [15]:
combined.Edition.value_counts()

Paperback,– 5 Oct 2017     60
Paperback,– 2016           58
Paperback,– 2017           47
Paperback,– 2019           37
Paperback,– 2013           35
                           ..
Hardcover,– 9 Dec 2014      1
Paperback,– 11 Aug 2003     1
Paperback,– 1 Mar 2000      1
Paperback,– 2 Mar 1999      1
Paperback,– 28 Mar 2001     1
Name: Edition, Length: 3882, dtype: int64

**Binning Edition Binding** - *combined edition binding ( with occurence < 9 --> "other" )*

In [16]:
Counter(combined.Edition).most_common()

[('Paperback,– 5 Oct 2017', 60),
 ('Paperback,– 2016', 58),
 ('Paperback,– 2017', 47),
 ('Paperback,– 2019', 37),
 ('Paperback,– 2013', 35),
 ('Paperback,– 1 Jan 2013', 30),
 ('Paperback,– 2012', 25),
 ('Paperback,– 2015', 24),
 ('Hardcover,– 2 Aug 2009', 23),
 ('Paperback,– 2014', 23),
 ('Paperback,– 2018', 22),
 ('Paperback,– 14 Oct 2000', 20),
 ('Paperback,– 2010', 20),
 ('Paperback,– 2011', 19),
 ('Paperback,– 1 Apr 2019', 18),
 ('Paperback,– 1 Sep 2011', 17),
 ('Paperback,– 5 Sep 2005', 16),
 ('Paperback,– 25 Apr 2019', 16),
 ('Paperback,– 30 Oct 2017', 14),
 ('Paperback,– 2 Aug 2012', 13),
 ('Paperback,– 4 Oct 2016', 12),
 ('Paperback,– 30 Jun 2015', 12),
 ('Paperback,– 23 Mar 2017', 12),
 ('Paperback,– 2008', 12),
 ('Paperback,– 1 Jan 2009', 11),
 ('Paperback,– Import, 14 Dec 2017', 11),
 ('Paperback,– 7 Apr 2011', 11),
 ('Paperback,– 7 Oct 2004', 11),
 ('Paperback,– 29 May 2018', 11),
 ('Paperback,– 6 Sep 2016', 10),
 ('Paperback,– 27 Aug 2013', 10),
 ('Paperback,– 20 Apr 2016'

In [17]:
for ed in combined['Edition']:
    if ed.find("Hardcover")!=-1: print(ed)
#train['Edition'][3].split(',')

Hardcover,– 10 Oct 2006
Hardcover,– Import, 1 Mar 2018
Hardcover,– 8 Mar 2018
Hardcover,– 24 Nov 2018
Hardcover,– Deckle Edge, 18 Oct 2011
Hardcover,– 15 Sep 2014
Hardcover,– 10 May 2016
Hardcover,– 28 Sep 2013
Hardcover,– 8 Jul 2015
Hardcover,– 2019
Hardcover,– 4 Oct 2016
Hardcover,– 25 Dec 2016
Hardcover,– 2 Mar 1999
Hardcover,– 1 Sep 2016
Hardcover,– 21 Jan 2019
Hardcover,– 20 Aug 2018
Hardcover,– Import, 5 Jul 2018
Hardcover,– Import, 25 Mar 2019
Hardcover,– 5 Nov 2015
Hardcover,– 19 May 2019
Hardcover,– 4 Sep 2014
Hardcover,– 1 Sep 2016
Hardcover,– 16 Jan 2016
Hardcover,– 25 Aug 2002
Hardcover,– 1 Apr 2009
Hardcover,– 17 Dec 2018
Hardcover,– 8 Jul 2016
Hardcover,– 7 Sep 2006
Hardcover,– Import, 16 Mar 2018
Hardcover,– 26 Apr 2011
Hardcover,– 22 Jul 2018
Hardcover,– 18 Aug 2009
Hardcover,– 2 Aug 2009
Hardcover,– 14 Jun 2018
Hardcover,– 2016
Hardcover,– 21 Feb 2019
Hardcover,– 26 Sep 2017
Hardcover,– Illustrated, 24 Sep 2014
Hardcover,– 24 Dec 2018
Hardcover,– 19 Nov 2013
Hardcover,

In [18]:
for ed in combined['Edition']:
    if ed.find("Audiobook")!=-1: print(ed)
#train['Edition'][3].split(',')

Paperback,– Audiobook, 28 Feb 2003
Paperback,– Abridged, Audiobook, Box set
Hardcover,– Abridged, Audiobook, Box set
Hardcover,– Audiobook, Unabridged
Paperback,– Abridged, Audiobook, Large Print
Paperback,– Abridged, Audiobook, Box set
Paperback,– Abridged, Audiobook, Box set
Paperback,– Illustrated, Large Print, Audiobook
Paperback,– Audiobook, 16 Oct 2014
Paperback,– Abridged, Audiobook, Box set
Paperback,– Audiobook, 10 Aug 2018
Paperback,– Audiobook, Illustrated, Large Print
Paperback,– Audiobook, Box set, Large Print
Paperback,– Audiobook, 16 Jun 2016
Paperback,– Abridged, Audiobook, Large Print
Hardcover,– Abridged, Audiobook, Box set
Paperback,– Audiobook, 16 Jun 2016
Paperback,– Abridged, Audiobook, Box set


In [19]:
combined[['EditionBinding','EditionType1']] = combined['Edition'].str.split(',– ',expand=True)
combined.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1
0,the prisoner's gold (the hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0,Paperback,10 Mar 2016
1,guru dutt: a tragedy in three acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93,Paperback,7 Nov 2012


In [20]:
combined['EditionBinding'].value_counts()

Paperback                6458
Hardcover                1056
Mass Market Paperback     200
Sheet music                32
Flexibound                 19
Cards                       9
Spiral-bound                6
Loose Leaf                  3
Tankobon Softcover          3
(German),Paperback          1
Board book                  1
(Kannada),Paperback         1
(French),Paperback          1
(Spanish),Paperback         1
Perfect Paperback           1
Plastic Comb                1
Library Binding             1
Leather Bound               1
Product Bundle              1
(Chinese),Paperback         1
Name: EditionBinding, dtype: int64

In [21]:
edition_binding_dict = combined['EditionBinding'].value_counts().to_dict()

edition_binding_dict

{'Paperback': 6458,
 'Hardcover': 1056,
 'Mass Market Paperback': 200,
 'Sheet music': 32,
 'Flexibound': 19,
 'Cards': 9,
 'Spiral-bound': 6,
 'Loose Leaf': 3,
 'Tankobon Softcover': 3,
 '(German),Paperback': 1,
 'Board book': 1,
 '(Kannada),Paperback': 1,
 '(French),Paperback': 1,
 '(Spanish),Paperback': 1,
 'Perfect Paperback': 1,
 'Plastic Comb': 1,
 'Library Binding': 1,
 'Leather Bound': 1,
 'Product Bundle': 1,
 '(Chinese),Paperback': 1}

In [22]:
combined['EditionBinding'] = combined['EditionBinding'].apply(lambda x: (x if edition_binding_dict[x] > 9 else 'other'))


In [23]:
combined['EditionBinding'].value_counts()

Paperback                6458
Hardcover                1056
Mass Market Paperback     200
other                      32
Sheet music                32
Flexibound                 19
Name: EditionBinding, dtype: int64

**Splitting Edition remainder part** - *extracting edition date and edition type*

In [24]:
Counter(combined.EditionType1).most_common()

[('5 Oct 2017', 60),
 ('2016', 60),
 ('2017', 50),
 ('2019', 40),
 ('2013', 37),
 ('2015', 35),
 ('1 Jan 2013', 31),
 ('2012', 25),
 ('2014', 25),
 ('2011', 24),
 ('2 Aug 2009', 23),
 ('2010', 23),
 ('2018', 22),
 ('25 Apr 2019', 21),
 ('14 Oct 2000', 20),
 ('1 Sep 2011', 19),
 ('1 Apr 2019', 18),
 ('4 Oct 2016', 17),
 ('5 Sep 2005', 16),
 ('30 Oct 2017', 14),
 ('27 Aug 2013', 14),
 ('1 Jan 2009', 13),
 ('15 Sep 2015', 13),
 ('30 Jun 2015', 13),
 ('6 Jun 2017', 13),
 ('18 Oct 2016', 13),
 ('2008', 13),
 ('2 Aug 2012', 13),
 ('1 Jul 2017', 13),
 ('1 Sep 2016', 12),
 ('6 Sep 2016', 12),
 ('25 Oct 2016', 12),
 ('23 Mar 2017', 12),
 ('7 Apr 2011', 12),
 ('1 Mar 2016', 11),
 ('10 Jul 2018', 11),
 ('1 Sep 2015', 11),
 ('3 Nov 2015', 11),
 ('Import, 14 Dec 2017', 11),
 ('3 Nov 2016', 11),
 ('7 Oct 2004', 11),
 ('2009', 11),
 ('29 May 2018', 11),
 ('26 Mar 2013', 10),
 ('28 Oct 2014', 10),
 ('14 Jun 2018', 10),
 ('10 Oct 2017', 10),
 ('13 Oct 2015', 10),
 ('20 Apr 2016', 10),
 ('28 Jun 2018', 

In [25]:
def split_edition_1(x):
    j_arr = []
    date = ''

    for j in x.split(', '):
        if not any(k.isnumeric() for k in j):
            j_arr.append(j.strip())
        else:
            date = j

    if ''.join(j_arr) != '':
        ed = ', '.join(j_arr)
    else:
        ed = 'other'

    if ed != 'Import' and ed != 'Illustrated' and ed \
        != 'Special Edition' and ed != 'Unabridged' and ed \
        != 'Student Edition' and ed != 'Box set' and ed \
        != 'International Edition' and ed != 'Abridged':
        ed_ret = 'other'
    else:
        ed_ret = ed

    return (ed_ret, date)

In [26]:
combined['EditionType'],combined['EditionDate'] = \
    zip(*combined['EditionType1'].apply(split_edition_1))

In [27]:
combined.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate
0,the prisoner's gold (the hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0,Paperback,10 Mar 2016,other,10 Mar 2016
1,guru dutt: a tragedy in three acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93,Paperback,7 Nov 2012,other,7 Nov 2012


**Splitting Edition date** - *extracting Month & Year*

In [28]:
def split_edition_date(x):
    (mon, year) = ('', '')
    if len(x.split()) == 1:
        year = int(x)
    elif len(x.split()) == 2:
        mon = x.split()[0]
        year = int(x.split()[1])
    elif len(x.split()) == 3:
        mon = x.split()[1]
        year = int(x.split()[2])
    return (mon, year)

In [29]:
combined['EditionMon'], combined['EditionYear'] = \
    zip(*combined['EditionDate'].apply(split_edition_date))

In [30]:
combined.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate,EditionMon,EditionYear
0,the prisoner's gold (the hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0,Paperback,10 Mar 2016,other,10 Mar 2016,Mar,2016
1,guru dutt: a tragedy in three acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93,Paperback,7 Nov 2012,other,7 Nov 2012,Nov,2012


**Binning Month** - *combining quaterly*

In [31]:
def bin_edition_mon(x):
    x = x.lower()
    if x == 'jan' or x == 'feb' or x == 'mar':
        return 'first'
    elif x == 'apr' or x == 'may' or x == 'jun':
        return 'second'
    elif x == 'jul' or x == 'aug' or x == 'sep':
        return 'third'
    elif x == '':
        return ''
    else:
        return 'fourth'

*making columns to mark null values*

In [32]:
combined['EditionMon'] = combined['EditionMon'].apply(bin_edition_mon)
combined['Mon_null'] = combined['EditionMon'].apply(lambda x: \
        ('not_null' if x != '' else 'null'))
combined['Year_null'] = combined['EditionYear'].apply(lambda x: \
        ('not_null' if x != '' else 'null'))

In [33]:
combined.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate,EditionMon,EditionYear,Mon_null,Year_null
0,the prisoner's gold (the hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0 out of 5 stars,8 customer reviews,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0,Paperback,10 Mar 2016,other,10 Mar 2016,first,2016,not_null,not_null
1,guru dutt: a tragedy in three acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9 out of 5 stars,14 customer reviews,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93,Paperback,7 Nov 2012,other,7 Nov 2012,fourth,2012,not_null,not_null


**Imputing Month and Year** - *by most common values*

In [34]:
combined[combined.EditionMon=='']

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate,EditionMon,EditionYear,Mon_null,Year_null
19,"introducing data science: big data, machine le...","Davy Cielen, Arno D.B. Meysman, Mohamed Ali","Paperback,– 2016",4.3 out of 5 stars,5 customer reviews,Introducing Data Science explains vital data s...,Artificial Intelligence,"Computing, Internet & Digital Media",352.0,Paperback,2016,other,2016,,2016,,not_null
35,hannibal rising,Thomas Harris,"Paperback,– 2019",4.3 out of 5 stars,8 customer reviews,_________________________ hannibal lecter wasn...,Contemporary Fiction (Books),"Crime, Thriller & Mystery",309.0,Paperback,2019,other,2019,,2019,,not_null
44,daisy jones and the six,Taylor Jenkins Reid,"Hardcover,– 2019",4.6 out of 5 stars,6 customer reviews,picked as < u> one to watch in 2019</u> by <th...,Music Books,Romance,560.0,Hardcover,2019,other,2019,,2019,,not_null
60,mastering vba for microsoft office 2016 (sybex),Richard Mansfield,"Paperback,– 2016",5.0 out of 5 stars,1 customer review,Mastering VBA for Microsoft Office 2016 helps ...,Programming Languages (Books),"Computing, Internet & Digital Media",743.0,Paperback,2016,other,2016,,2016,,not_null
98,sap hr personnel administration and recruitmen...,Agrawal P.K,"Paperback,– 2011",5.0 out of 5 stars,2 customer reviews,SAP is a great software. One needs to fully un...,Business Communication,"Computing, Internet & Digital Media",449.0,Paperback,2011,other,2011,,2011,,not_null
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,heidi - ladybird readers level 4,Ladybird,"Paperback,– 2018",4.0 out of 5 stars,2 customer reviews,Heidi went to stay with her grandfather in the...,Language Learning & Teaching (Books),"Language, Linguistics & Writing",,Paperback,2018,other,2018,,2018,,not_null
7763,a practical course in english pronunciation,Sethi,"Paperback,– 2004",4.8 out of 5 stars,5 customer reviews,English enjoys a preeminent position today amo...,Linguistics (Books),"Language, Linguistics & Writing",,Paperback,2004,other,2004,,2004,,not_null
7778,"cartooning, the professional step-by-step guid...","Ivan Hissey, Curtis Tappenden","Paperback,– Illustrated, Import",3.0 out of 5 stars,1 customer review,A comprehensive and practical guide to drawing...,"Handicrafts, Decorative Arts & Crafts (Books)","Computing, Internet & Digital Media",,Paperback,"Illustrated, Import",other,,,,,
7784,you don't know js: up & going,Kyle Simpson,"Paperback,– 2015",4.1 out of 5 stars,7 customer reviews,"Itís easy to learn parts of JavaScript, but mu...",Programming & Software Development (Books),"Computing, Internet & Digital Media",,Paperback,2015,other,2015,,2015,,not_null


In [35]:
combined['EditionMon'].replace('', combined['EditionMon'].mode()[0],
                               inplace=True)
combined['EditionYear'].replace('', combined['EditionYear'].mode()[0],
                                inplace=True)

In [36]:
combined[combined.EditionMon=='']

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate,EditionMon,EditionYear,Mon_null,Year_null


**Extracting Reviews & Ratings** - *converting to numerical data*

In [37]:
Counter(combined.Reviews).most_common()

[('5.0 out of 5 stars', 1751),
 ('4.0 out of 5 stars', 713),
 ('4.5 out of 5 stars', 626),
 ('4.6 out of 5 stars', 491),
 ('4.4 out of 5 stars', 488),
 ('4.3 out of 5 stars', 450),
 ('4.7 out of 5 stars', 412),
 ('4.2 out of 5 stars', 406),
 ('4.1 out of 5 stars', 381),
 ('3.9 out of 5 stars', 300),
 ('4.8 out of 5 stars', 272),
 ('3.8 out of 5 stars', 239),
 ('3.7 out of 5 stars', 203),
 ('3.0 out of 5 stars', 176),
 ('3.6 out of 5 stars', 145),
 ('3.5 out of 5 stars', 143),
 ('4.9 out of 5 stars', 99),
 ('3.4 out of 5 stars', 92),
 ('3.3 out of 5 stars', 74),
 ('1.0 out of 5 stars', 65),
 ('3.1 out of 5 stars', 57),
 ('3.2 out of 5 stars', 50),
 ('2.0 out of 5 stars', 44),
 ('2.9 out of 5 stars', 36),
 ('2.5 out of 5 stars', 23),
 ('2.7 out of 5 stars', 18),
 ('2.8 out of 5 stars', 12),
 ('2.3 out of 5 stars', 7),
 ('1.5 out of 5 stars', 6),
 ('2.6 out of 5 stars', 5),
 ('2.4 out of 5 stars', 5),
 ('2.2 out of 5 stars', 3),
 ('1.4 out of 5 stars', 2),
 ('1.7 out of 5 stars', 1),
 ('1

In [38]:
combined['Reviews'] = combined['Reviews'].apply(lambda x: float(x.split()[0]))

In [39]:
Counter(combined.Ratings).most_common()

[('1 customer review', 1328),
 ('2 customer reviews', 886),
 ('3 customer reviews', 613),
 ('4 customer reviews', 444),
 ('5 customer reviews', 379),
 ('6 customer reviews', 296),
 ('7 customer reviews', 231),
 ('8 customer reviews', 218),
 ('9 customer reviews', 185),
 ('10 customer reviews', 177),
 ('11 customer reviews', 153),
 ('12 customer reviews', 138),
 ('13 customer reviews', 123),
 ('15 customer reviews', 123),
 ('16 customer reviews', 119),
 ('14 customer reviews', 101),
 ('20 customer reviews', 81),
 ('17 customer reviews', 78),
 ('19 customer reviews', 71),
 ('22 customer reviews', 67),
 ('18 customer reviews', 65),
 ('30 customer reviews', 62),
 ('29 customer reviews', 56),
 ('21 customer reviews', 55),
 ('26 customer reviews', 53),
 ('27 customer reviews', 49),
 ('24 customer reviews', 47),
 ('23 customer reviews', 45),
 ('32 customer reviews', 45),
 ('33 customer reviews', 45),
 ('37 customer reviews', 41),
 ('25 customer reviews', 37),
 ('35 customer reviews', 36),
 ('

In [40]:
combined['Ratings'] = combined['Ratings'].apply(lambda x: int(''.join(x.split()[0].split(','))))

In [41]:
combined.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price,EditionBinding,EditionType1,EditionType,EditionDate,EditionMon,EditionYear,Mon_null,Year_null
0,the prisoner's gold (the hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0,8,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0,Paperback,10 Mar 2016,other,10 Mar 2016,first,2016,not_null,not_null
1,guru dutt: a tragedy in three acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9,14,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93,Paperback,7 Nov 2012,other,7 Nov 2012,fourth,2012,not_null,not_null
