In [1]:
# Load basic libraries
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from scipy.stats import zscore

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from textblob import TextBlob
import spacy
# !python -m spacy download en_core_web_sm

In [2]:
df = pd.read_excel("Data_Train.xlsx")
print(df.columns)
print(df.shape)

Index(['Title', 'Author', 'Edition', 'Reviews', 'Ratings', 'Synopsis', 'Genre',
       'BookCategory', 'Price'],
      dtype='object')
(5699, 9)


In [3]:
df_test = pd.read_excel("test.xlsx")
print(df_test.columns)
print(df_test.shape)

Index(['Unnamed: 0', 'Title', 'Author', 'Edition', 'Reviews', 'Ratings',
       'Synopsis', 'Genre', 'BookCategory'],
      dtype='object')
(537, 9)


<h4>Detect Outliers</h4>
Identify outliers using the Z-score method

In [4]:
# z_scores = pd.DataFrame(zscore(df['Price']), columns=['Price'])

# threshold = 2
# outliers = df[(z_scores.abs() > threshold).any(axis=1)]

# print("Outlier records:")
# print(outliers.shape)

In [5]:
# df_with_outlier = df.copy()
# df = df[(z_scores.abs() < threshold).all(axis=1)].reset_index(drop=True)

# print("DataFrame without outliers:")
# print(df.shape)

<h4>Combine Datasets</h4>

In [6]:
# Test Dataset: df[5699:]
df_train = df.copy()
df_test.drop(columns=['Unnamed: 0'], inplace=True)

df = pd.concat([df_train, df_test], sort=False)
df.reset_index(drop=True, inplace=True)
# df

<h4>Delete Unnecessary Text

In [7]:
df['Reviews'] = df['Reviews'].str[:3].astype(float)
df['Ratings'] = df['Ratings'].str.extract('(\\d+)').astype(int)
df.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),Chris Kuzneski,"Paperback,– 10 Mar 2016",4.0,8,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,Arun Khopkar,"Paperback,– 7 Nov 2012",3.9,14,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93


In [8]:
print("Unique Authors: " + str(df['Author'].nunique()))
print("Unique Titles: " + str(df['Title'].nunique()))

Unique Authors: 3678
Unique Titles: 5567


In [9]:
# df['Title'] = df['Title'].str.replace('[^a-zA-Z0-9]', '', regex=True).str.lower()
df['Author'] = df['Author'].str.replace('[^a-zA-Z0-9]', '', regex=True).str.lower()
df.head(2)

Unnamed: 0,Title,Author,Edition,Reviews,Ratings,Synopsis,Genre,BookCategory,Price
0,The Prisoner's Gold (The Hunters 3),chriskuzneski,"Paperback,– 10 Mar 2016",4.0,8,THE HUNTERS return in their third brilliant no...,Action & Adventure (Books),Action & Adventure,220.0
1,Guru Dutt: A Tragedy in Three Acts,arunkhopkar,"Paperback,– 7 Nov 2012",3.9,14,A layered portrait of a troubled genius for wh...,Cinema & Broadcast (Books),"Biographies, Diaries & True Accounts",202.93



## Feature Engineering



<h4>Genrate new features</h4>

In [10]:
df['CoverType'] = df['Edition'].str.split(',').str[0]
df['CoverType'].unique()

array(['Paperback', 'Hardcover', 'Mass Market Paperback', 'Sheet music',
       'Flexibound', 'Plastic Comb', 'Loose Leaf', 'Tankobon Softcover',
       'Perfect Paperback', 'Board book', 'Cards', 'Spiral-bound',
       '(Kannada)', 'Product Bundle', 'Library Binding', '(German)',
       'Leather Bound', '(French)', '(Spanish)'], dtype=object)

In [11]:
df['Year'] = df['Edition'].str.extract(r'(\d{4})').fillna(2019).astype(int)
df['Year'].nunique()

56

In [12]:
df['BookAge'] = 2020 - df['Year']
df['BookAge'].max()

120

In [13]:
df['ReviewImpact'] = df['BookAge'] * df['Reviews']
df['ReviewImpact'].sort_values()

3652      1.0
1024      1.0
467       1.0
2848      1.0
767       2.0
        ...  
2764    282.0
2769    300.0
5164    389.5
3664    414.0
3764    468.0
Name: ReviewImpact, Length: 6236, dtype: float64

In [14]:
df['RatingImpact'] = df['Reviews'].astype(int) * df['Ratings'].astype(int) / df['BookAge'].astype(int)
df['RatingImpact'].sort_values()

3414       0.030303
299        0.034483
1855       0.038462
3908       0.038462
3267       0.041667
           ...     
3960    1516.000000
968     1630.000000
962     3444.000000
3148    3608.000000
933     3608.000000
Name: RatingImpact, Length: 6236, dtype: float64

<h4>Utilize NLP to extract Keywords and Sentiment</h4>

In [15]:
df['Synopsis'] = df['Synopsis'].str.lower()

In [16]:
df["Polarity"] =  df['Synopsis'].apply(lambda x: TextBlob(x).sentiment.polarity)
df["Subjectivity"] = df['Synopsis'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [17]:
nlp = spacy.load("en_core_web_md")

# Function to extract keywords from Synopsis using SpaCy
def extract_keywords(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop and token.is_alpha]

df['Keywords'] = df['Title'].apply(extract_keywords)
# df['Keywords'] = df['Keywords'].apply(lambda x: ' '.join(x))


In [18]:
from collections import Counter

all_keywords = [keyword for sublist in df['Keywords'] for keyword in sublist]
keyword_counts = Counter(all_keywords)
top_keywords = keyword_counts.most_common(200)

print(top_keywords)

for i in range(len(top_keywords)): 
    top_keywords[i] = top_keywords[i][0]
    
print(top_keywords)
top_keywords_df = pd.DataFrame(top_keywords, columns=['Keyword'])
print(top_keywords_df)


[('Book', 385), ('Guide', 224), ('Classics', 188), ('Edition', 185), ('English', 185), ('India', 183), ('World', 166), ('Vol', 140), ('Life', 134), ('Penguin', 125), ('Story', 125), ('Series', 124), ('Novel', 118), ('Man', 117), ('Complete', 109), ('Art', 95), ('New', 94), ('Dictionary', 92), ('Love', 91), ('Graphic', 89), ('Adventures', 88), ('Oxford', 86), ('Indian', 84), ('Modern', 81), ('Stories', 77), ('History', 73), ('Data', 68), ('Learning', 68), ('Grammar', 65), ('Asterix', 64), ('Volume', 63), ('Big', 62), ('Vintage', 61), ('Secret', 61), ('Great', 60), ('Design', 58), ('Novels', 58), ('Autobiography', 56), ('Books', 55), ('Course', 55), ('Practice', 54), ('Easy', 54), ('CD', 54), ('Girl', 51), ('Trilogy', 51), ('Learn', 50), ('Programming', 50), ('Way', 50), ('Tintin', 49), ('Cambridge', 49), ('Death', 48), ('Science', 48), ('War', 48), ('Words', 48), ('Game', 48), ('series', 46), ('Journey', 46), ('Step', 45), ('Fire', 45), ('Library', 45), ('Grade', 44), ('Mind', 44), ('Pi

## Feature transformation

In [19]:
# Use TF-IDF Vectorizer for Keywords
df['Keywords'] = df['Keywords'].apply(lambda x: ' '.join(x))

tfidf_vectorizer_title = TfidfVectorizer()
tfidf_matrix_title = tfidf_vectorizer_title.fit_transform(df['Keywords'])

n_components = 50
pca = PCA(n_components=n_components)
tfidf_matrix_title = pca.fit_transform(tfidf_matrix_title.toarray())

In [20]:
nlp = spacy.load("en_core_web_md")

# Function to extract keywords from Synopsis using SpaCy
def extract_keywords(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop and token.is_alpha]

df['SynopsisKeywords'] = df['Synopsis'].apply(extract_keywords)
backup = df['SynopsisKeywords'].copy()

In [21]:
backup.apply(lambda x: ' '.join(x)).str.lower()

0       hunters return brilliant novel sunday times be...
1       layered portrait troubled genius art merely th...
2       time men live common power awe condition calle...
3       handful grain found pocket murdered businessma...
4       seven decades life thrilling world unrivalled ...
                              ...                        
6231    brilliant sarah knight funny mark watson exhil...
6232    gripping page turner ex agent run employers ca...
6233    refreshing radiant love story read year lisa k...
6234    frostfire amanda hocking stunning installment ...
6235    years ago sam capra watched brother danny exec...
Name: SynopsisKeywords, Length: 6236, dtype: object

In [22]:
# Use TF-IDF Vectorizer for Keywords
df['SynopsisKeywords'] = backup.apply(lambda x: ' '.join(x)).str.lower()

tfidf_vectorizer_synopsis = TfidfVectorizer()
tfidf_matrix_synopsis = tfidf_vectorizer_synopsis.fit_transform(df['SynopsisKeywords'])

n_components = 50
pca = PCA(n_components=n_components)
tfidf_matrix_synopsis = pca.fit_transform(tfidf_matrix_synopsis.toarray())

In [23]:
# keywords_df = keywords_df.copy()
# keywords_df.fillna(0, inplace=True)
# keywords_df.head(3)

In [24]:
# Use One-hot encoding for Author
top_authors = df['Author'].value_counts().head(30).index.tolist()

# Create new features for each of the top authors

author_df = pd.DataFrame()

for author in top_authors:
    author_df[author] = (df['Author'] == author).astype(int)

In [25]:
# Use One-hot encoding for Cover Types
top_ctypes = df['CoverType'].value_counts().head(5).index.tolist()

type_df = pd.DataFrame()

# Create new features for each of the top authors
for ctype in top_ctypes:
    type_df[ctype] = (df['CoverType'] == author).astype(int)

In [26]:
# Use One-hot encoding for BookCategory
encoded_category = pd.get_dummies(df['BookCategory'], columns=['BookCategory'], prefix='BookCategory')

In [27]:
# Combine all numerical features
combined_features = pd.concat([
    df[['Reviews', 'Ratings', 'Year', 'Polarity', 'Subjectivity', 'BookAge', 'ReviewImpact', 'RatingImpact']],
    pd.DataFrame(tfidf_matrix_title),
    pd.DataFrame(tfidf_matrix_synopsis),
    encoded_category,
    author_df,
    type_df
    ], axis=1)

In [28]:
# combined_features['RatingImpact'] = df['RatingImpact']

In [29]:
def convert_to_numeric(column):
    return pd.to_numeric(column, errors='coerce')

# combined_features = combined_features.apply(convert_to_numeric)
combined_features.columns = combined_features.columns.astype(str)

# combined_features.fillna(0, inplace=True)

In [30]:
from sklearn.preprocessing import StandardScaler

# Create a MinMaxScaler object
scaler = StandardScaler()

# Specify the columns you want to normalize
columns_to_normalize = combined_features.columns.to_list()

# Fit and transform the selected columns using Min-Max scaling
combined_features[columns_to_normalize] = scaler.fit_transform(combined_features[columns_to_normalize])

In [31]:
combined_features.shape

(6236, 154)

## Modeling

Do not change this part of the code only run it!

In [40]:
def train(X_train, y_train, X_test, y_test):
    """
    Trains a RandomForestRegressor model and evaluates its performance using the mean squared error (MSE).

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The training data with (n_rows, n_features) shape.
    y (numpy.ndarray or pandas.Series): The target variable (n_rows, 1) shape.

    Returns:
    float: The mean squared error (MSE) of the predictions(train data) made by the RandomForestRegressor.
    float: The mean squared error (MSE) of the predictions(test data) made by the RandomForestRegressor.
    """
    random_forest_regressor = RandomForestRegressor(criterion='squared_error')
    random_forest_regressor.fit(X_train, y_train)
    mse_train = random_forest_regressor.score(X_train, y_train)
    
    y_pred = random_forest_regressor.predict(X_test)

    mse_test = mean_squared_error(y_test, y_pred)

    return mse_train, mse_test

In [41]:
# X_train, X_test, y_train, y_test = train_test_split(combined_features[:5270], df[:5270]['Price'], test_size=0.2, random_state=42)

In [42]:
# mse_train, mse_test = train(X_train=X_train , y_train=y_train , X_test=X_test , y_test=y_test)
# print("Train mse is: {} // Test mse is: {}".format(mse_train,mse_test))

<h4>Predict Prices based on test data</h4>

In [43]:
random_forest_regressor = RandomForestRegressor(criterion='squared_error')
random_forest_regressor.fit(combined_features[:5699], df['Price'][:5699])
prediction = random_forest_regressor.predict(combined_features[5699:])

In [44]:
prediction_df = pd.DataFrame(prediction).reset_index()
prediction_df.rename(columns={0: 'Price'}, inplace=True)
prediction_df.head()

Unnamed: 0,index,Price
0,0,483.3497
1,1,750.15735
2,2,355.9569
3,3,466.5668
4,4,564.4952


In [45]:
prediction_df.to_csv('Mahan Madani - Prediction.csv', index=False)