In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

**Importing Data**

In [None]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')
train.shape

In [None]:
test.shape

In [None]:
train.BookCategory.value_counts()

In [None]:
train.Author.sort_values()

In [None]:
train.Reviews[1].split()[0]

In [None]:
train.head()

**Combining Dataset(Train + Test)** - _for cleaning and feature engineering_

In [None]:
train.Price.isnull().sum()

In [None]:
#train.drop_duplicates(['Title','Author','Edition'],inplace=True)

In [None]:
#train[train.duplicated(['Title','Author','Edition'],keep=False)].sort_values('Title')

In [None]:
train = train[train.Price<10000]

In [None]:
combined = pd.concat([train, test], sort=False)
combined.reset_index(drop=True, inplace=True)
combined.tail()

In [None]:
combined.Author.value_counts().count()

In [None]:
pd.DataFrame(combined.Author.value_counts()).sort_index()

In [None]:
from collections import Counter
Counter(combined.Author).most_common()

In [None]:
combined.shape

# Feature Cleaning & Extraction

In [None]:
combined['Title'] = combined['Title'].str.lower()


**Splitting Edition** - *to Edition Binding type and other feature*

In [None]:
combined.Edition.value_counts()

**Binning Edition Binding** - *combined edition bindings ( with occurrences ≤ 9 --> "other" )*

In [None]:
Counter(combined.Edition).most_common()

In [None]:
for ed in combined['Edition']:
    if ed.find("Hardcover")!=-1: print(ed)
#train['Edition'][3].split(',')

In [None]:
for ed in combined['Edition']:
    if ed.find("Audiobook")!=-1: print(ed)
#train['Edition'][3].split(',')

In [None]:
combined.head(2)

In [None]:
combined[['EditionBinding','EditionType1']] = combined['Edition'].str.split(',– ',expand=True)
combined.head(2)

In [None]:
combined.EditionType1.value_counts()

In [None]:
combined['EditionBinding'].value_counts()

In [None]:
edition_binding_dict = combined['EditionBinding'].value_counts().to_dict()

edition_binding_dict

In [None]:
combined['EditionBinding'] = combined['EditionBinding'].apply(lambda x: (x if edition_binding_dict[x] > 9 else 'other'))


In [None]:
combined['EditionBinding'].value_counts()

**Splitting Edition remainder part** - *extracting edition date and edition type*

In [None]:
Counter(combined.EditionType1).most_common()

In [None]:
def split_edition_1(x):
    """Split the remainder of an edition string into ``(edition_type, date)``.

    The input is broken on ``', '``. Any piece containing a numeric character
    is treated as the date (the last such piece wins, kept unstripped); all
    other pieces are stripped and re-joined with ``', '`` to form the edition
    type. Edition types outside the small whitelist below -- including
    combinations of several labels and the empty case -- collapse to
    ``'other'``.

    Parameters
    ----------
    x : str
        Text following the binding in the ``Edition`` column.

    Returns
    -------
    tuple
        ``(edition_type, date)``; ``date`` is ``''`` when no piece contains a
        numeric character.
    """
    known_types = (
        'Import',
        'Illustrated',
        'Special Edition',
        'Unabridged',
        'Student Edition',
        'Box set',
        'International Edition',
        'Abridged',
    )

    labels = []
    date = ''
    for piece in x.split(', '):
        if any(ch.isnumeric() for ch in piece):
            date = piece
        else:
            labels.append(piece.strip())

    edition = ', '.join(labels) if ''.join(labels) != '' else 'other'
    if edition not in known_types:
        edition = 'other'

    return (edition, date)

In [None]:
combined['EditionType'],combined['EditionDate'] = \
    zip(*combined['EditionType1'].apply(split_edition_1))

In [None]:
combined.head(2)

**Splitting Edition date** - *extracting Month & Year*

In [None]:
def split_edition_date(x):
    """Extract ``(month, year)`` from an edition date string.

    The string is split on whitespace and interpreted by token count:

    * 1 token  -> year only (``'2016'``)
    * 2 tokens -> month, year (``'Mar 2016'``)
    * 3 tokens -> day, month, year (``'10 Mar 2016'``); the day is dropped

    Any other token count (including the empty string) yields ``('', '')``.

    Parameters
    ----------
    x : str
        Edition date text.

    Returns
    -------
    tuple
        ``(month, year)`` where ``month`` is a str (``''`` if absent) and
        ``year`` is an int, or ``''`` when no year could be read.

    Raises
    ------
    ValueError
        If the token in the year position is not an integer literal.
    """
    tokens = x.split()
    count = len(tokens)

    if count == 1:
        return ('', int(x))
    if count == 2:
        return (tokens[0], int(tokens[1]))
    if count == 3:
        return (tokens[1], int(tokens[2]))
    return ('', '')

In [None]:
combined['EditionMon'], combined['EditionYear'] = \
    zip(*combined['EditionDate'].apply(split_edition_date))

In [None]:
combined.head(2)

**Binning Month** - *combining quarterly*

In [None]:
def bin_edition_mon(x):
    """Map a month name to the quarter of the year it falls in.

    Matching is case-insensitive and uses only the first three letters, so
    ``'Mar'``, ``'March'`` and ``'MARCH'`` all land in the first quarter and
    ``'Sept'`` in the third. (Previously anything other than an exact
    three-letter abbreviation fell through to ``'fourth'``.)

    Parameters
    ----------
    x : str
        Month token, or ``''`` when the month is unknown.

    Returns
    -------
    str
        ``'first'``, ``'second'``, ``'third'`` or ``'fourth'``; ``''`` is
        returned unchanged so that missing months stay detectable downstream.
        Tokens that do not start with a month abbreviation keep the historical
        fallback of ``'fourth'``.
    """
    quarter_by_month = {
        'jan': 'first', 'feb': 'first', 'mar': 'first',
        'apr': 'second', 'may': 'second', 'jun': 'second',
        'jul': 'third', 'aug': 'third', 'sep': 'third',
        'oct': 'fourth', 'nov': 'fourth', 'dec': 'fourth',
    }
    if x == '':
        return ''
    return quarter_by_month.get(x.lower()[:3], 'fourth')

*making columns to mark null values*

In [None]:
combined['EditionMon'] = combined['EditionMon'].apply(bin_edition_mon)
combined['Mon_null'] = combined['EditionMon'].apply(lambda x: \
        ('not_null' if x != '' else 'null'))
combined['Year_null'] = combined['EditionYear'].apply(lambda x: \
        ('not_null' if x != '' else 'null'))

In [None]:
combined.head(2)

**Imputing Month and Year** - *by most common values*

In [None]:
combined[combined.EditionMon=='']

In [None]:
# Missing month / year are encoded as ''. Take the mode over the *observed*
# values only: if '' itself were the most frequent entry, replacing '' with
# the raw column mode would silently do nothing.
observed_mon_mode = combined.loc[combined['EditionMon'] != '', 'EditionMon'].mode()[0]
observed_year_mode = combined.loc[combined['EditionYear'] != '', 'EditionYear'].mode()[0]

# Assign the result back instead of calling replace(inplace=True) on a column
# selection, which is not guaranteed to write through to `combined`.
combined['EditionMon'] = combined['EditionMon'].replace('', observed_mon_mode)
# After imputation every year is an integer; make the dtype say so.
combined['EditionYear'] = combined['EditionYear'].replace('', observed_year_mode).astype(int)

In [None]:
combined[combined.EditionMon=='']

In [None]:
combined.head()

**Extracting Reviews & Ratings** - *converting to numerical data*

In [None]:
Counter(combined.Reviews).most_common()

In [None]:
combined['Reviews'] = combined['Reviews'].apply(lambda x: float(x.split()[0]))

In [None]:
Counter(combined.Ratings).most_common()

In [None]:
combined['Ratings'] = combined['Ratings'].apply(lambda x: int(''.join(x.split()[0].split(','))))

In [None]:
combined.head(2)

# Feature Engineering 
*Engineering new features*

**Ratings and Reviews Ratio**

In [None]:
combined['RatingPerReview'] = round(combined['Ratings']/combined['Reviews'], 2)

In [None]:
combined.Ratings.describe()

In [None]:
%matplotlib inline
combined.Ratings.plot(kind='box');

In [None]:
combined[combined.Ratings>2500]

In [None]:
combined.head(2)

**Impact of Book Age on Reviews**

In [None]:
combined['Review_Year_Impact'] = combined['Reviews'] * combined['EditionYear'].apply(lambda x: 2019 - x)

**Author Name Cleaning**

In [None]:
combined.Author.head(100)

In [None]:
# Spelling normalisation for author names, applied as *regex substring*
# replacements (regex=True), which is what lets ' & ' -> ', ' work inside a
# longer author string.
#
# NOTE(review): because the keys are regular expressions,
#   * '.' in a key matches any character, not just a literal dot;
#   * '(India)' is a capture group, so that key does not match literal
#     parentheses;
#   * the keys "0" and "2" match those digits anywhere in an author string;
#   * 'Arthur Conan Doyle' <-> 'Sir Arthur Conan Doyle' map onto each other,
#     and 'Rajaraman' also matches inside 'Rajaraman V'.
# The mapping is kept exactly as authored; confirm the intended behaviour
# before tightening it (e.g. with re.escape / anchors).
author_replacements = {
    ' & ': ', ',
    "0": "other",
    "2": "other",
    'A. P. J. Abdul Kalam': 'A.P.J. Abdul Kalam',
    'APJ Abdul Kalam': 'A.P.J. Abdul Kalam',
    'Agrawal P. K.': 'Agrawal P.K',
    'Ajay K Pandey': 'Ajay K. Pandey',
    'Aravinda Anantharaman': 'Aravinda Anatharaman',
    'Arthur Conan Doyle': 'Sir Arthur Conan Doyle',
    'B A Paris': 'B. A. Paris',
    'E L James': 'E. L. James',
    'E.L. James': 'E. L. James',
    'Eliyahu M Goldratt': 'Eliyahu M. Goldratt',
    'Ernest Hemingway': 'Ernest Hemmingway',
    'Frank Miler': 'Frank Miller',
    'Fyodor Dostoevsky': 'Fyodor Dostoyevsky',
    'George R R Martin': 'George R. R. Martin',
    'George R.R. Martin': 'George R. R. Martin',
    'H. G. Wells': 'H.G. Wells',
    'Johann Wolfgang Von Goethe': 'Johann Wolfgang von Goethe',
    'John Le Carré': 'John le Carré',
    'Judith McNaught': 'Judith Mcnaught',
    'Keith Giffen': 'Kieth Giffen',
    'Ken Hultgen': 'Ken Hultgren',
    'Kentaro Miura': 'Kenturo Miura',
    'Kohei Horikoshi': 'Kouhei Horikoshi',
    'M.K Gandhi': 'M.K. Gandhi',
    'Matthew K Manning': 'Matthew Manning',
    'Michael Crichton': 'Micheal Crichton',
    'N.K Aggarwala': 'N.K. Aggarwala',
    'Oxford University Press (India)': 'Oxford University Press India',
    'P D James': 'P. D. James',
    'Paramahansa Yogananda': 'Paramhansa Yogananda',
    'R K Laxman': 'R. K. Laxman',
    'R.K. Laxman': 'R. K. Laxman',
    'R. M. Lala': 'R.M. Lala',
    'Raina Telgemaeier': 'Raina Telgemeier',
    'Rajaraman': 'Rajaraman V',
    'Rajiv M. Vijayakar': 'Rajiv Vijayakar',
    'Ramachandra Guha': 'Ramchandra Guha',
    'Rene Goscinny': 'René Goscinny',
    'Richard P Feynman': 'Richard P. Feynman',
    'S Giridhar': 'S. Giridhar',
    'S Hussain Zaidi': 'S. Hussain Zaidi',
    'S. A. Chakraborty': 'S. Chakraborty',
    'Santosh Kumar K': 'Santosh Kumar K.',
    # This value was previously split across two physical lines inside the
    # quotes, which is a SyntaxError; rejoined as a single literal.
    "S.C. Gupta": "S. C. Gupta",
    'Shiv Prasad Koirala': 'Shivprasad Koirala',
    'Shivaprasad Koirala': 'Shivprasad Koirala',
    'Simone De Beauvoir': 'Simone de Beauvoir',
    'Sir Arthur Conan Doyle': 'Arthur Conan Doyle',
    "Terry O' Brien": "Terry O'Brien",
    'Thich Nhat Hahn': 'Thich Nhat Hanh',
    'Trinity College Lond': 'Trinity College London',
    "Trinity College London Press": "Trinity College London",
    'Ursula K. Le Guin': 'Ursula Le Guin',
    'Willard A Palmer': 'Willard A. Palmer',
    'Willard Palmer': 'Willard A. Palmer',
    'William Strunk Jr': 'William Strunk Jr.',
    'Yashavant Kanetakr': 'Yashavant Kanetkar',
    'Yashavant P. Kanetkar': 'Yashavant Kanetkar',
    'Yashwant Kanetkar': 'Yashavant Kanetkar',
    'et al': 'et al.',
    ' et al': 'et al.',
    'Peter Clutterbuck': ' Peter Clutterbuck',
    'Scholastic': 'Scholastic ',
    'Ullekh N. P.': 'Ullekh N.P.',
    'Shalini Jain': 'Dr. Shalini Jain',
    'Kevin Mitnick': 'Kevin D. Mitnick',
}
combined['Author'] = combined['Author'].replace(author_replacements, regex=True)

**No. of Authors of a book**

In [None]:
combined['Authors_count'] = combined['Author'].apply(lambda x: \
        len(x.split(',')))

**Average Author reviews**

In [None]:
author_avg_review_dict = round(combined[combined.Authors_count== 1]
                               .groupby('Author',sort=False)['Reviews']
                               .mean(), 2).to_dict()
author_avg_review_dict

In [None]:
def check_author(x, avg_reviews=None):
    """Average the known per-author mean review scores for an author string.

    Parameters
    ----------
    x : str
        One or more author names separated by ``', '``.
    avg_reviews : dict, optional
        Mapping of author name -> mean review score. Defaults to the
        notebook-level ``author_avg_review_dict``; passing it explicitly makes
        the function usable (and testable) without that global.

    Returns
    -------
    float or str
        Mean of the scores of those authors present in the mapping, or ``''``
        when none of the names is known (callers use ``''`` as the
        "no information" sentinel).
    """
    if avg_reviews is None:
        avg_reviews = author_avg_review_dict

    # Explicit membership test instead of a bare `except: pass`, so that only
    # "author not in the lookup" is skipped and real errors still surface.
    reviews = [avg_reviews[name] for name in x.split(', ') if name in avg_reviews]

    if reviews:
        return sum(reviews) / len(reviews)
    return ''

In [None]:
# check_author returns '' when none of the row's authors has a known average;
# in that case fall back to the row's own review score. Done column-wise with
# `where` rather than a row-wise apply with positional x[0]/x[1] lookups,
# which newer pandas deprecates on label-indexed rows.
author_avg = combined['Author'].apply(check_author)
combined['AuthorAvgReview'] = (
    author_avg.where(author_avg != '', combined['Reviews']).astype(float)
)

**No. of Books from an Author**

In [None]:
combined['Count_Author_Title'] = combined['Author'].map(combined.groupby('Author',sort=False)['Title'].apply(lambda x: len(x.unique())).to_dict())

**No. of occurrences of a Title**
<br>
**Average:** 
- Book - Author Count
- Title - reviews

In [None]:
combined['MEAN_Title_Authors_count'] = round(combined
                                            .groupby('Title',sort=False)['Authors_count']
                                            .transform('mean'), 2)

combined['MEAN_Ttle_Reviews'] = round(combined
                                      .groupby('Title',sort=False)['Reviews']
                                      .transform('mean'), 2)

combined['Title_count'] = combined.groupby('Title',sort=False)['Title']\
                                  .transform('count')

**Various Categories of a book**

In [None]:
# For every title, the comma-joined list of categories it appears under
# (built from single-author rows only).
title_cat_dict = (
    combined[combined.Authors_count == 1]
    .groupby('Title', sort=False)['BookCategory']
    .apply(', '.join)
    .to_dict()
)
# Titles missing from the lookup fall back to the row's own BookCategory.
# map(...).fillna(...) replaces the row-wise apply that indexed each row
# positionally (x[0] / x[1]), which newer pandas deprecates.
combined['TitleCategories'] = (
    combined['Title'].map(title_cat_dict).fillna(combined['BookCategory'])
)

**Various Genres of a book**

In [None]:
# For every title, the comma-joined list of genres it appears under
# (built from single-author rows only).
title_genre_dict = (
    combined[combined.Authors_count == 1]
    .groupby('Title', sort=False)['Genre']
    .apply(', '.join)
    .to_dict()
)
# Titles missing from the lookup fall back to the row's own Genre.
# map(...).fillna(...) replaces the row-wise apply that indexed each row
# positionally (x[0] / x[1]), which newer pandas deprecates.
combined['TitleGenres'] = (
    combined['Title'].map(title_genre_dict).fillna(combined['Genre'])
)

**Various Category books written by an author**

In [None]:
# For every (single) author, the comma-joined list of categories of their
# books.
author_cat_dict = (
    combined[combined.Authors_count == 1]
    .groupby('Author', sort=False)['BookCategory']
    .apply(', '.join)
    .to_dict()
)
# Author strings missing from the lookup (e.g. multi-author rows) fall back
# to the row's own BookCategory. map(...).fillna(...) replaces the row-wise
# apply that indexed each row positionally (x[0] / x[1]).
combined['AuthorCategories'] = (
    combined['Author'].map(author_cat_dict).fillna(combined['BookCategory'])
)

**Various Genre books written by an author**

In [None]:
# For every (single) author, the comma-joined list of genres of their books.
author_genre_dict = (
    combined[combined.Authors_count == 1]
    .groupby('Author', sort=False)['Genre']
    .apply(', '.join)
    .to_dict()
)
# Author strings missing from the lookup (e.g. multi-author rows) fall back
# to the row's own Genre. map(...).fillna(...) replaces the row-wise apply
# that indexed each row positionally (x[0] / x[1]).
combined['AuthorGenres'] = (
    combined['Author'].map(author_genre_dict).fillna(combined['Genre'])
)

In [None]:
combined['TitleGenres'] = combined['TitleGenres'].str.replace(' & ',', ')
combined['AuthorGenres'] = combined['AuthorGenres'].str.replace(' & ',', ')
combined['Genre'] = combined['Genre'].str.replace(' & ',', ')

**Binning Edition Year** - *by distribution over years*

In [None]:
combined['EditionYearBin'] = pd.qcut(combined['EditionYear'],5,labels=False)

In [None]:
combined.head()

## Dummy & Count Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
combined[['BookCategory','EditionBinding','EditionMon','EditionType','EditionYearBin',\
          'Mon_null','Year_null',]] = combined[['BookCategory','EditionBinding','EditionMon',\
    'EditionType','EditionYearBin','Mon_null','Year_null',]].apply(enc.fit_transform)

In [None]:
combined.head(2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

tc_vectorizer = CountVectorizer(lowercase=True, tokenizer=lambda x: \
                                 x.split(', '))
title_categories_vector = tc_vectorizer.fit_transform(combined['TitleCategories']).toarray()
data_title_categories = pd.DataFrame(data=title_categories_vector,
                      columns=tc_vectorizer.get_feature_names())

In [None]:
ac_vectorizer = CountVectorizer(lowercase=True, 
                                 tokenizer=lambda x: x.split(', '))
author_categories_vector = ac_vectorizer.fit_transform(combined['AuthorCategories']).toarray()
data_author_categories = pd.DataFrame(data=author_categories_vector,
                      columns=ac_vectorizer.get_feature_names())

In [None]:
tg_vectorizer = CountVectorizer(max_features=10, lowercase=True,
                                tokenizer=lambda x: x.split(', '))
title_genres_vector = tg_vectorizer.fit_transform(combined['TitleGenres']).toarray()
data_title_genres = pd.DataFrame(data=title_genres_vector,
                     columns=tg_vectorizer.get_feature_names())

In [None]:
ag_vectorizer = CountVectorizer(max_features=10, lowercase=True,
                                    tokenizer=lambda x: x.split(', '))
author_genres_vector = ag_vectorizer.fit_transform(combined['AuthorGenres']).toarray()
data_author_genres = pd.DataFrame(data=author_genres_vector,
                         columns=ag_vectorizer.get_feature_names())

In [None]:
title_vectorizer = CountVectorizer(max_features=10, lowercase=True)
title_vector = title_vectorizer.fit_transform(combined['Title']).toarray()
data_title = pd.DataFrame(data=title_vector,
                        columns=title_vectorizer.get_feature_names())

In [None]:
vectorizer_author = CountVectorizer(max_features=10, lowercase=True,
                                    tokenizer=lambda x: x.split(', '))
vector_author = vectorizer_author.fit_transform(combined['Author']).toarray()
data_author = pd.DataFrame(data=vector_author,
                         columns=vectorizer_author.get_feature_names())

In [None]:
vectorizer_genre = CountVectorizer(max_features=10,
                                   lowercase=True, tokenizer=lambda x: x.split(', '))
vector_genre = vectorizer_genre.fit_transform(combined['Genre']).toarray()
data_genre = pd.DataFrame(data=vector_genre,
                        columns=vectorizer_genre.get_feature_names())

In [None]:
vectorizer_synopsis = CountVectorizer(max_features=10,
                                      stop_words='english', 
                                      strip_accents='ascii', 
                                      lowercase=True)
vector_synopsis = vectorizer_synopsis.fit_transform(combined['Synopsis']).toarray()
data_synopsis = pd.DataFrame(data=vector_synopsis,
                           columns=vectorizer_synopsis.get_feature_names())

In [None]:
combined.drop(columns=[
    'Title',
    'Author',
    'Genre',
    'Synopsis',
    'Edition',
    'EditionDate',
    'EditionType1',
    'AuthorCategories',
    'AuthorGenres',
    'TitleGenres',
    'TitleCategories'
    ], inplace=True)

In [None]:
print('No. of Features:',combined.shape[1])

**Feature correlations**

In [None]:
data = pd.concat([
    combined,# dummy encoded features
    data_author, # author count encoded
    data_genre, # genre count encoded
    data_title, # title count encoded
    data_synopsis, # synopsis count encoded
   data_author_genres, # author_genres count encoded
   data_title_genres, # title_genres count encoded
    data_author_categories, # author_categories count encoded
    data_title_categories, # title_categories count encoded
    ], axis=1)
data.reset_index(drop=True, inplace=True)

In [None]:
#feature correlations
corr = data.corr()
corr[corr.Price>0.01].Price.sort_values()

In [None]:
data.shape  #  features count

In [None]:
data.head()

In [None]:
print(data.info());

# Train - Test Split

In [None]:
# Rows with a known Price are the training set; rows without one came from
# the test file.
train = data[data['Price'].notna()]
# Drop the (all-NaN) target from the test rows. Using a non-inplace drop
# avoids mutating a filtered slice of `data` (SettingWithCopyWarning).
test = data[data['Price'].isna()].drop(columns=['Price'])

In [None]:
X = train.loc[:, train.columns != 'Price'].values
X = X.astype(float)

# Dependent Variable

y = np.log1p(train['Price'].values)
y = y.astype(float)

# Test - (Independent Variables)

test = test.loc[:].values
test = test.astype(float)

# Model Training

**Importing libraries**

In [None]:
import xgboost as xgb
#import lightgbm as lgb
from sklearn.metrics import make_scorer
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor

**RMSLE scoring func**

In [None]:
def score(y_true, y_pred):
    """Return ``1 - RMSLE`` (base-10 logs) for log1p-transformed prices.

    Both arguments are expected on the ``log1p(price)`` scale used for the
    training target. They are mapped back to prices with ``expm1`` (the exact
    inverse of ``log1p``), negative predicted prices are clamped to 0, and the
    root-mean-squared difference of ``log10(price + 1)`` is subtracted from 1,
    so higher is better.

    Parameters
    ----------
    y_true : array-like
        True targets on the log1p scale.
    y_pred : array-like
        Predicted targets on the log1p scale.

    Returns
    -------
    float
        ``1 - RMSLE``; 1.0 means a perfect prediction.
    """
    # np.asarray makes the clamp purely positional, so label-indexed inputs
    # (e.g. a pandas Series whose index does not start at 0) work too.
    pred_price = np.maximum(np.expm1(np.asarray(y_pred, dtype=float)), 0)
    true_price = np.expm1(np.asarray(y_true, dtype=float))

    squared_log_error = np.square(np.log10(pred_price + 1) - np.log10(true_price + 1))
    rmsle = np.sqrt(squared_log_error.mean())
    return 1 - rmsle

#### RandomForestRegressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV
params = { 'random_state':range(0,4),
           'n_estimators':np.arange(10,100,10),
           'max_depth':np.arange(5,30,5),
           'max_features':['sqrt']
         }
rf = RandomizedSearchCV(RandomForestRegressor(),params,scoring=make_scorer(score,greater_is_better=True))
rf.fit(X,y)
rf.best_score_
#rf = RandomForestRegressor(random_state=0,n_estimators=150,max_features='sqrt')
#cvs = cross_val_score(rf, X, y, cv=5,verbose=2,n_jobs=-1,
#                      scoring=make_scorer(score,greater_is_better=True))
#print("Average Score:",np.mean(cvs))

In [None]:
rf.best_params_

# SVR

In [None]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf',gamma=0.003,C=1)

cvs = cross_val_score(svr, X, y, cv=5,verbose=2,n_jobs=-1,
                      scoring=make_scorer(score,greater_is_better=True))
print("Average Score:",np.mean(cvs))
# Not working with SVR

In [None]:
rf1 = RandomForestRegressor(random_state=3,
 n_estimators=80,
 max_features='sqrt',       
 max_depth=30)

cvs = cross_val_score(rf1, X, y, cv=5,verbose=2,n_jobs=-1,
                      scoring=make_scorer(score,greater_is_better=True))
print("Average Score:",np.mean(cvs))

In [None]:
rf1.fit(X,y)
pd.DataFrame(rf1.feature_importances_).sort_values(0,ascending=False)

#### LGBMRegressor

In [None]:
# Randomised hyper-parameter search over an LGBMRegressor, scored with the
# notebook's custom `score` function (higher is better).
#
# NOTE(review): `lgb` is not bound anywhere in the visible notebook -- the
# only `import lightgbm as lgb` line is commented out in the imports cell --
# so this cell raises NameError on a fresh kernel until that import is
# restored.
lgbm = lgb.LGBMRegressor()

# NOTE(review): this grid mirrors the RandomForest one; 'max_features' looks
# copied from there -- confirm LGBMRegressor actually honours it.
params = { 'random_state':range(0,4),
           'n_estimators':np.arange(10,100,10),
           'max_depth':np.arange(5,30,5),
           'max_features':['sqrt']
         }
# `lgbm` is rebound from the bare estimator to the fitted search object.
lgbm = RandomizedSearchCV(lgbm,params,scoring=make_scorer(score,greater_is_better=True))
lgbm.fit(X,y)
print(lgbm.best_params_)
print(lgbm.best_score_)

#cvs = cross_val_score(lgbm, X, y, cv=5,verbose=2,n_jobs=-1,
#                      scoring=make_scorer(score,greater_is_better=True))
#print("Average Score:",np.mean(cvs))

#### XGBRegressor

In [None]:
# Randomised hyper-parameter search over an XGBRegressor, scored with the
# notebook's custom `score` function (higher is better).
#
# NOTE: the name `xgb` is kept because later cells pass it to the
# VotingRegressor, but it rebinds (shadows) the `xgboost` module imported
# as `xgb` in the imports cell.
xgb = XGBRegressor()

# The grid previously listed 'max_depth' twice (a dict keeps only the last
# value, [5, 10, 15, 30]) and carried a 'max_features' entry copied from the
# RandomForest grid, which is not an XGBoost parameter. Both are removed; the
# effective search space is unchanged.
params = {
    'random_state': range(0, 4),
    'n_estimators': np.arange(10, 100, 10),
    'colsample_bytree': [0.6],
    'objective': ['reg:squarederror'],
    'learning_rate': [0.1, 0.2, 0.05],
    'max_depth': [5, 10, 15, 30],
    'alpha': [10],
}
xgb = RandomizedSearchCV(xgb, params, scoring=make_scorer(score, greater_is_better=True))
xgb.fit(X, y)
xgb.best_score_

#### VotingRegressor

In [None]:
vr = VotingRegressor([('rf', rf), ('xgb', xgb)])

cvs = cross_val_score(vr, X, y, cv=5,verbose=50,n_jobs=-1,
                        scoring=make_scorer(score,greater_is_better=True))
print("Average Score:",np.mean(cvs))

# Exporting Predictions

In [None]:
# Fit the ensemble on the full training set and predict the held-out rows.
vr.fit(X, y)
Y_pred2 = vr.predict(test)

# The target was trained as log1p(Price): invert with expm1 and clamp any
# negative price to 0 in one vectorised step (replaces the Python loop).
Y_pred2 = np.maximum(np.expm1(Y_pred2), 0)

pd.DataFrame(Y_pred2, columns=['Price']).to_excel("predictions.xlsx", index=False)

# Tips

In [None]:
# Reduce countvectorization columns by using selectpercentile
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=10)

In [None]:
# Further ideas:
# - Stacking of the models.
# - Data leaks.