In [1]:
import pandas as pd

# Load the scraped book data; the first CSV column is used as the row index.
df = pd.read_csv('scraping_results2.csv', index_col=0)

# Mean book price, rounded to cents. The column is selected explicitly:
# averaging the whole frame would include the string 'Book Title' column
# (a TypeError on pandas >= 2.0) and would then rely on a positional [0]
# lookup into a label-indexed Series, which is deprecated.
price_mean = df['Book Price'].mean().round(2)

# Titles matching the regex '^The' (i.e. beginning with "The").
# na=False makes missing titles count as non-matches instead of producing a
# NaN mask that .loc cannot index with.
books_with_the = df.loc[df['Book Title'].str.contains('^The', na=False)]
# Not the last expression of the cell, so this preview is not rendered.
books_with_the.head()

# Per-page summary: number of books, mean price and mean rating,
# each rounded to two decimals.
book_per_page = (
    df.groupby(['Page Found'])
      .agg({'Book Title': 'size', 'Book Price': 'mean', 'Book Rating': 'mean'})
      .rename(columns={
          'Book Title': 'Book per Page',
          'Book Price': 'Mean Page Price',
          'Book Rating': 'Mean Page Rating',
      })
      .round(2)
)

df

Unnamed: 0,Book Title,Book Price,Book Rating,Page Found
0,A Light in the Attic,51.77,3,1
1,Tipping the Velvet,53.74,1,1
2,Soumission,50.10,1,1
3,Sharp Objects,47.82,4,1
4,Sapiens: A Brief History of Humankind,54.23,5,1
...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,1,50
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,50
997,A Spy's Devotion (The Regency Spies of London #1),16.97,5,50
998,1st to Die (Women's Murder Club #1),53.98,1,50


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np

# Target: book prices as a flat 1-D array.
y = np.array(df['Book Price']).ravel()

# Features: the rating alone, shaped as a single-column 2-D array.
X = np.array(df['Book Rating']).reshape(-1, 1)

In [3]:
# One-step pipeline wrapping the random-forest regressor under the name 'model'.
my_pipeline = Pipeline(
    steps=[
        (
            'model',
            RandomForestRegressor(
                n_estimators=15,
                min_samples_split=10,
                random_state=0,
            ),
        ),
    ]
)

In [4]:
# The 'neg_mean_absolute_error' scorer yields negated MAE values, so the sign
# is flipped to get one positive MAE per fold (5 folds).
scores = -cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
# NOTE(review): min() shows only the lowest-error fold; scores.mean() is the
# usual summary of cross-validated error - confirm which one is intended.
scores.min()

11.726427961867872

In [5]:
# Split into training and validation sets with a fixed seed; no test_size is
# passed, so the library default split proportion applies.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Stand-alone random forest with the same hyperparameters as the pipeline's
# 'model' step, fitted on the training split only.
first_model = RandomForestRegressor(
    n_estimators=15,
    min_samples_split=10,
    random_state=0,
)
first_model.fit(train_X, train_y)

RandomForestRegressor(min_samples_split=10, n_estimators=15, random_state=0)

In [7]:
# Predict prices for the held-out validation features with the fitted forest.
price_predict = first_model.predict(valid_X)

In [8]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the random-forest predictions on the validation split.
val_mae = mean_absolute_error(y_true=valid_y, y_pred=price_predict)
val_mae

13.182953044077758

In [9]:
from xgboost import XGBRegressor

# Gradient-boosted alternative, trained and evaluated on the same split as the
# random forest so the two validation errors are directly comparable.
xgb_model = XGBRegressor(n_estimators=30)
xgb_model.fit(train_X, train_y, verbose=False)

prediction = xgb_model.predict(valid_X)

# Arguments are passed as (y_true, y_pred), matching the random-forest
# evaluation. MAE is symmetric, so the value is unchanged, but the order
# matters if this metric is ever swapped for an asymmetric one.
error = mean_absolute_error(valid_y, prediction)
error

13.189026799926758