In [134]:
import pandas as pd
# Read both competition splits (comma-separated CSV files).
data_train = pd.read_csv('../input/hse-aml-2022/books_train.csv', sep=',')
data_test = pd.read_csv('../input/hse-aml-2022/books_test.csv', sep=',')

# Strip every space out of the header names of both frames.
data_train.columns = [name.replace(' ', '') for name in data_train.columns]
data_test.columns = [name.replace(' ', '') for name in data_test.columns]

print(data_train.columns, data_test.columns)

**Data cleaning**

In [135]:
# True when at least one training row exactly repeats an earlier row.
data_train.duplicated(keep='first').any()

In [136]:
# New feature: for every row, the number of rows that share its title.
titles_by_group = data_train.groupby('title')['title']
data_train['num_occ'] = titles_by_group.transform('count')

In [137]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

# Unify the language codes: the regional English variants all become 'eng'
# so they are encoded as a single category.
encoding = {'language_code': {'en-US': 'eng', 'en-GB': 'eng', 'en-CA': 'eng'}}
data_train['language_code'] = data_train['language_code'].replace(encoding['language_code'])

# Ordinal-encode language_code into a numeric column. fit_transform() already
# fits the encoder, so a separate fit() call beforehand is redundant.
enc = OrdinalEncoder()
data_train[['language_code']] = enc.fit_transform(data_train[['language_code']])

# Show only a preview instead of dumping the whole column.
data_train[['language_code']].head()

In [138]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
# Parse publication_date into real datetimes.
# Label-encoding this column would replace every date with an arbitrary
# integer id, so the year extracted from it later would be meaningless.
# errors='coerce' turns unparseable dates into NaT instead of raising;
# NOTE(review): any NaT left after the manual date patches would yield a NaN
# year — confirm that no other invalid dates exist in the training file.
data_train['publication_date'] = pd.to_datetime(data_train['publication_date'], errors='coerce')

In [139]:
# Overwrite the publication date of two specific books, selected by bookID.
# Presumably these rows carry invalid dates in the raw file — TODO confirm.
date_overrides = {
    31373: '1999-10-01 00:00:00',
    45531: '1975-10-01 00:00:00',
}
for book_id, fixed_date in date_overrides.items():
    data_train.loc[data_train['bookID'] == book_id, 'publication_date'] = fixed_date

In [140]:
# Keep the publication year as a separate column.
publication_index = pd.DatetimeIndex(data_train['publication_date'])
data_train['year'] = publication_index.year

In [141]:
# Calculating new features
# No feature may be derived from 'average_rating': it is the prediction
# target. A column such as average_rating * num_occ leaks the label into the
# inputs (the model can recover the target as rate_occ / num_occ), and it
# cannot be computed for the unlabeled test set, so predicting on the test
# features would fail with a feature-count mismatch. The former 'rate_occ'
# feature is therefore intentionally not created.

In [142]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Correlation heatmap of the numeric columns.
fig, ax = plt.subplots(figsize=(16, 8))

# Restrict to numeric columns explicitly: corr() on a frame that still holds
# string columns raises in recent pandas versions.
corr = data_train.select_dtypes(include='number').corr()

sns.heatmap(
    corr,
    # All-False mask, i.e. every cell is drawn. The builtin `bool` replaces
    # the `np.bool` alias, which no longer exists in current NumPy.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(200, 30, as_cmap=True),
    square=True,
    ax=ax,
)
ax.set_title('Correlation between numeric columns');

In [143]:
# Scatter of the rating against the number of occurrences of the title.
sns.relplot(data=data_train, x="num_occ", y="average_rating", height=5, aspect=2)

In [144]:
# Scatter of the rating against the number of text reviews.
sns.relplot(data=data_train, x="text_reviews_count", y="average_rating", height=5, aspect=2)

In [145]:
# Scatter of the rating against the page count.
sns.relplot(data=data_train, x="num_pages", y="average_rating", height=5, aspect=2)

In [146]:
# Pull the target out as a NumPy array, then remove it together with the
# columns that are not used as model features.
label = data_train['average_rating'].to_numpy()

non_feature_columns = [
    'title', 'authors', 'isbn', 'isbn13',
    'publication_date', 'publisher', 'average_rating',
]
data_train.drop(columns=non_feature_columns, inplace=True)

Splitting the data into training and hold-out sets, with a test size of 30%

In [147]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split (and every score computed from it) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(data_train, label, test_size=0.3, random_state=42)

**Using Ada Boost Model**

In [148]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# AdaBoost over shallow regression trees. The random_state seeds the boosting
# procedure so the grid search gives the same result on every run.
model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), random_state=42)

# Hyper-parameter grid explored by the search.
parameters = {
    'learning_rate': [0.001, 0.01, 0.02, 0.1, 0.2, 1.0],
    'n_estimators': [10, 50, 100, 200]
}

# refit=True retrains the best configuration on all of X_train, so grad_Ada
# can be used directly for prediction afterwards.
grad_Ada = GridSearchCV(model, parameters, refit=True)
grad_Ada.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best configuration,
# printed here as a percentage.
print('Best Score: ', grad_Ada.best_score_*100, '\nBest Parameters: ', grad_Ada.best_params_)

In [149]:
# One (model name, best grid-search score in percent) row per model.
l = [('AdaBoost', grad_Ada.best_score_*100)]

scores = pd.DataFrame(data=l, columns=['Model', 'Train Score'])
scores

In [150]:
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

In [151]:
# AdaBoost model: predict the hold-out split and report its score on it.
pred_adaboost = grad_Ada.predict(X_test)
holdout_score = grad_Ada.score(X_test, y_test)
print('Model Score on Test Data: ', holdout_score)

In [152]:
# Predicted vs. actual ratings on the hold-out split, with a fitted line.
plt.figure(figsize=(19, 10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax = sns.regplot(x=pred_adaboost, y=y_test, marker="+", line_kws={'color':'darkred','alpha':1.0})
ax.set(xlabel='Predicted average_rating', ylabel='Actual average_rating');

In [153]:
# One (model name, hold-out score in percent) row per model.
l2 = [('AdaBoost', grad_Ada.score(X_test, y_test)*100)]

test_scores = pd.DataFrame(data=l2, columns=['Model', 'Test Score'])

In [154]:
# Put the hold-out score next to the grid-search score in one table.
scores = scores.assign(**{'Test Score': test_scores['Test Score']})
scores

In [163]:
# pred_adaboost holds predictions for the rows of X_test only, in X_test's
# (shuffled) row order. Giving the Series X_test's index makes each
# prediction land on the row it was computed for; a default 0..n-1 index
# would attach them to unrelated rows. Rows outside X_test stay NaN.
data_train['average_rating'] = pd.Series(pred_adaboost, index=X_test.index)

In [164]:
# Display the feature frame, which now also carries the predicted ratings
# in the 'average_rating' column.
data_train

In [168]:
# Keep only the book identifier and its predicted rating.
k = data_train.loc[:, ['bookID', 'average_rating']]
k

In [169]:
from IPython.display import HTML
import base64

def create_download_link( df, title = "CSV file", filename = "data1.csv"):
    """Build an HTML link that downloads ``df`` as a CSV file.

    The frame is serialized with ``DataFrame.to_csv()`` (index included),
    base64-encoded and embedded in a ``data:`` URI, so no file is written.

    Parameters:
        df: the DataFrame to offer for download.
        title: the visible link text.
        filename: the file name suggested to the browser.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor tag.
    """
    # Serialize the frame passed by the caller (not a notebook global).
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

# Offer the bookID / average_rating frame `k` for download. (`df` is not
# defined yet at this point when the notebook runs top to bottom.)
create_download_link(k)

In [158]:
# Write next to the notebook instead of to an absolute, machine-specific
# Windows path, which does not exist in other environments.
data_train.to_csv('hw2_ml.csv', index=False)

In [159]:
# Single-column frame holding only the ratings.
df = data_train.loc[:, ['average_rating']]
df

**Loading data_test and repeating the same preprocessing steps**

In [160]:
# Reload the raw test split, strip spaces from its header names, and check
# whether any row is an exact duplicate of an earlier one.
data_test = pd.read_csv('../input/hse-aml-2022/books_test.csv')
data_test.columns = [name.replace(' ', '') for name in data_test.columns]
data_test.duplicated(keep='first').any()

In [161]:
# Rebuild the model features on the test frame.

# Number of rows sharing each title (computed within the test set).
data_test['num_occ'] = data_test.groupby('title')['title'].transform('count')

# Unify the regional English codes into 'eng'.
encoding = {'language_code':{'en-US': 'eng', 'en-GB': 'eng', 'en-CA': 'eng'}}
data_test['language_code'] = data_test['language_code'].replace(encoding['language_code'])

# Reuse the categories learned by the encoder `enc` that was fitted on the
# training data. Fitting a fresh encoder on the test set would assign
# different integers to the same language, so the model would see codes it
# was not trained on. Languages absent from the training data become -1.
language_to_code = {
    language: float(position)
    for position, language in enumerate(enc.categories_[0])
}
data_test['language_code'] = data_test['language_code'].map(language_to_code).fillna(-1.0)

# Parse the real publication dates and keep the year. Label-encoding the
# column first would turn the dates into arbitrary integer ids and make the
# extracted year meaningless. The training features must be derived from
# parsed dates in the same way for this column to be comparable.
data_test['publication_date'] = pd.to_datetime(data_test['publication_date'], errors='coerce')
data_test['year'] = data_test['publication_date'].dt.year
# Unparseable dates give a missing year; fill with the median so the model
# receives no NaN inputs.
data_test['year'] = data_test['year'].fillna(data_test['year'].median())

In [170]:
# Remove the columns that are not model features.
non_feature_columns = ['title', 'authors', 'isbn', 'isbn13', 'publication_date', 'publisher']
test = data_test.drop(columns=non_feature_columns)
test

In [171]:
# Predict ratings for the test features with the refitted best estimator.
pred_d = grad_Ada.predict(X=test)