In [1]:
import gc
gc.collect()
print("Data Cleared")

Data Cleared


In [2]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit


In [3]:
# Load the data like we did before
raw_data = pd.read_csv('.\\Orginial Dataset\\BOOKSPURCHHISTORY.csv')
data = raw_data.drop(['TIMESTAMP','SUBSTATE','WeekofYear'],axis=1)
data['Purch'] = 1 
# Drop NaN columns
data = data.dropna()

In [4]:
# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((data['Purch'].astype(float), (data['BookID'], data['UserID'])))
sparse_user_item = sparse.csr_matrix((data['Purch'].astype(float), (data['UserID'], data['BookID'])))

# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.15, iterations=30)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 20
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)


100%|████████████████████████████████████████████████████████████████████████████████| 30.0/30 [00:02<00:00, 11.52it/s]


In [5]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Find the 10 most similar to Jay-Z
item_id = 140361 #Jay-Z
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print(data.BookID.loc[data.BookID == idx].iloc[0])

140361
153572
120661
117578
130472
161217
154977
139514
156058
103176


In [6]:
data.head()

Unnamed: 0,BookID,UserID,Purch
0,140361,608502,1
1,118603,673204,1
2,170523,641650,1
3,111924,619531,1
4,178056,628661,1


In [6]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for user with id 2025
user_id = 608502 #-----

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item)

books = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    books.append(data.BookID.loc[data.BookID == idx].iloc[0])
    scores.append(score)
#'UserID':user_id,
# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'BookID': books, 'score': scores})

print(recommendations)

   BookID     score
0  104483  0.297517
1  125941  0.277425
2  159750  0.274089
3  163052  0.254589
4  105481  0.223557
5  132893  0.207416
6  160150  0.192446
7  153572  0.191132
8  106808  0.186745
9  104396  0.183129


In [7]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for user with id 2025
#user_id = 608502


user = []
books = []
scores = []
i=0
for user_id in (data.UserID.unique()):
    # Use the implicit recommender.
    recommended = model.recommend(user_id, sparse_user_item,30)
       
    # Get artist names from ids
    for item in recommended:
        idx, score = item
        user.append(user_id)
        books.append(idx)
        scores.append(score)
        i+=1
        if(i%1000 ==0):
            print(i)
        
# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'UserID': user,'BookID': books, 'score': scores})

#print(recommendations)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [8]:
recommendations.columns

Index(['UserID', 'BookID', 'score'], dtype='object')

In [9]:
data.columns = ['BookID','UserID','score']

In [10]:
Final_Data = pd.concat([data,recommendations],axis =0).reset_index(drop = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [11]:
del data
del recommendations
del raw_data
del i

In [12]:
Final_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369382 entries, 0 to 369381
Data columns (total 3 columns):
BookID    369382 non-null int64
UserID    369382 non-null int64
score     369382 non-null float64
dtypes: float64(1), int64(2)
memory usage: 8.5 MB


In [None]:
## Collecting smiliarty between Datasets with cosine similarity

In [13]:
# Load the data like we did before
Train = pd.read_csv('.\\Orginial Dataset\\BOOKSMASTERTRAIN.csv')
Test = pd.read_csv('.\\Orginial Dataset\\BOOKSMASTERTEST.csv')
Catalogue = pd.read_csv('.\\Orginial Dataset\\BOOKSCATALOGUE.csv')
Train['DataSet'] = 'Train'
Test['DataSet']='Test'
Book_Data = pd.concat([Train,Test],axis=0).reset_index(drop=True)
del Train
del Test

In [14]:
Book_Data = Book_Data.drop(['BOOKTITLE','BOOKNAME','SUMMARY','GENRE.1','AUTHORDESC', 'COVERPAGE',],axis =1)
import re
import string
import nltk

cleanup_re = re.compile('[^a-z0-9,]+')

def cleanup(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = cleanup_re.sub(' ', sentence).strip()
    #sentence = " ".join(nltk.word_tokenize(sentence))
    return sentence

def Type(x):
    result = re.search('(^\w+)', x)
    return result.group(1)

Types_ = ['paperback','hardcover','mass','kindle','audio','published','ebook','audiobook','board'] 
def Types(x):
    if x in Types_:
        return x
    else:
        return 'Unknown'

def published(x):
    result = re.search('published(.*)by', x)
    if result:
        return result.group(1)    
    
def pages(x):
    result = re.search('(\d.*)pages', x)
    if result:
        return result.group(1)

def first_published(x):
    result = re.search('first published(.*)', x)
    if result:
        return result.group(1)
        

def language(x):
    if re.search('english',x):
        return 'English'
    else:
        return 'Other'
    
    
Book_Data['OTHERPRINTEDINFO'] = Book_Data['OTHERPRINTEDINFO'].apply(lambda x: cleanup(x))

Book_Data['Type'] = Book_Data['OTHERPRINTEDINFO'].apply(lambda x: Type(x))
Book_Data['Type'] = Book_Data['Type'].apply(lambda x: Types(x))

Book_Data['Published_On'] = Book_Data['OTHERPRINTEDINFO'].apply(lambda x: published(x))
Book_Data['Page_Count'] = Book_Data['OTHERPRINTEDINFO'].apply(lambda x: pages(x))

Book_Data['First_Published'] = Book_Data['OTHERPRINTEDINFO'].apply(lambda x: first_published(x))

Book_Data['DETAILS'] = Book_Data['DETAILS'].apply(lambda x: cleanup(x))

Book_Data['Language'] =Book_Data['DETAILS'].apply(lambda x: language(x))
#Spanish


def four_char(x):
    if x:
        x=x.strip()
        y= x[-4:].strip()
        return y


Book_Data['Page_Count1'] = Book_Data['Page_Count'].apply(lambda x : four_char(x))
Book_Data['Published_On1'] = Book_Data['Published_On'].apply(lambda x : four_char(x))
Book_Data['First_Published1'] = Book_Data['First_Published'].apply(lambda x : four_char(x))



Book_Data['Page_Count'] = Book_Data['Page_Count'].apply(pd.to_numeric, errors='coerce')
Book_Data['Published_On'] = Book_Data['Published_On1'].apply(pd.to_numeric, errors='coerce')
Book_Data['First_Published'] = Book_Data['First_Published1'].apply(pd.to_numeric, errors='coerce')

Book_Data = Book_Data.drop(['Page_Count1','Published_On1','First_Published1'],axis=1)



In [15]:
Book_Data = Book_Data.drop(['SERIES','AUTHOR','OTHERPRINTEDINFO','DETAILS'],axis =1)

In [16]:
Book_Data.head()

Unnamed: 0,BookID,GENRE,USERRATINGS,Popularity,DataSet,Type,Published_On,Page_Count,First_Published,Language
0,114530.0,Fiction,3.87,15417.968518,Train,paperback,2004.0,226.0,2003.0,English
1,133131.0,Science Fiction,3.76,218.146718,Train,paperback,2007.0,381.0,2006.0,English
2,153927.0,Nonfiction,3.93,64.196614,Train,paperback,2003.0,320.0,2002.0,English
3,160262.0,History,4.03,77.324027,Train,hardcover,2006.0,327.0,2006.0,English
4,133451.0,Mystery,4.12,10.45441,Train,paperback,2006.0,384.0,2006.0,English


In [18]:
#New_Data['GENRE'].value_counts()
Genre_wide = pd.get_dummies(Book_Data['GENRE'])
Book_Data = pd.concat([Book_Data, Genre_wide], axis=1).drop(['GENRE'],axis=1) 
del Genre_wide

Type_wide = pd.get_dummies(Book_Data['Type'])
Book_Data = pd.concat([Book_Data, Type_wide], axis=1).drop(['Type'],axis=1) 
del Type_wide

In [19]:
Catalogue.head()

Unnamed: 0,BookID,STATE,SUBSTATE
0,174285,New Mexico,New Mexico
1,176660,Oregon,Oregon
2,140353,Arizona,Arizona
3,178378,New York,New York
4,128711,California,Orange


In [20]:
CatWide = (Catalogue['SUBSTATE'].str.split(',')
                    .groupby(Catalogue['BookID'])
                    .agg(lambda x: ', '.join(set(y for z in x for y in z)))
                    .reset_index())

CatWide_one = CatWide.SUBSTATE.str.get_dummies(sep=",")
CatWide = pd.concat([CatWide,CatWide_one],axis =1).drop('SUBSTATE',axis=1)
del CatWide_one


CatWide1 = (Catalogue['STATE'].str.split(',')
                    .groupby(Catalogue['BookID'])
                    .agg(lambda x: ', '.join(set(y for z in x for y in z)))
                    .reset_index())

CatWide_Two = CatWide1.STATE.str.get_dummies(sep=",")
CatWide = pd.concat([CatWide,CatWide_Two],axis =1)#.drop('STATE',axis=1)
del CatWide_Two




In [21]:
Book_Data = pd.merge(Book_Data,CatWide,left_on='BookID',right_on='BookID',how='left')

In [22]:
Book_Data.head()

Unnamed: 0,BookID,USERRATINGS,Popularity,DataSet,Published_On,Page_Count,First_Published,Language,Biography,Childrens > Picture Books,...,Pennsylvania,Rhode Island,South Dakota,Tennessee,Texas,Utah,Virginia,Washington,West Virginia,Wisconsin
0,114530.0,3.87,15417.968518,Train,2004.0,226.0,2003.0,English,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,133131.0,3.76,218.146718,Train,2007.0,381.0,2006.0,English,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,153927.0,3.93,64.196614,Train,2003.0,320.0,2002.0,English,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,160262.0,4.03,77.324027,Train,2006.0,327.0,2006.0,English,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,133451.0,4.12,10.45441,Train,2006.0,384.0,2006.0,English,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
#from sklearn import preprocessing
# Create x, where x the 'scores' column's values as floats
x = Book_Data[['Popularity']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_Popularity = pd.DataFrame(x_scaled,columns=['Popularity'])

In [46]:
#from sklearn import preprocessing
# Create x, where x the 'scores' column's values as floats
x = Book_Data[['USERRATINGS']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_USERRATINGS = pd.DataFrame(x_scaled,columns=['USERRATINGS'])


In [47]:
#from sklearn import preprocessing
# Create x, where x the 'scores' column's values as floats
x = Book_Data[['Published_On']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_Published_On = pd.DataFrame(x_scaled,columns=['Published_On'])

In [48]:
#from sklearn import preprocessing
# Create x, where x the 'scores' column's values as floats
x = Book_Data[['First_Published']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_First_Published = pd.DataFrame(x_scaled,columns=['First_Published'])

In [50]:
#from sklearn import preprocessing
# Create x, where x the 'scores' column's values as floats
x = Book_Data[['Page_Count']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_Page_Count = pd.DataFrame(x_scaled,columns=['Page_Count'])

In [51]:
Book_Data = Book_Data.drop(['Popularity','USERRATINGS','Published_On','First_Published','Page_Count',],axis=1)

In [57]:
Book_Data = pd.concat([Book_Data,df_USERRATINGS,df_Popularity,df_First_Published,df_Published_On,df_Page_Count,],axis=1)

In [59]:
Lan_wide = pd.get_dummies(Book_Data['Language'])
Book_Data = pd.concat([Book_Data, Lan_wide], axis=1).drop(['Language'],axis=1) 
del Lan_wide
Book_Data.head()

Unnamed: 0,BookID,DataSet,Biography,Childrens > Picture Books,Classics,Fantasy,Fiction,Historical > Historical Fiction,History,Horror,...,Washington,West Virginia,Wisconsin,USERRATINGS,Popularity,First_Published,Published_On,Page_Count,English,Other
0,114530.0,Train,0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.694595,0.411209,0.992244,0.893617,0.055122,1,0
1,133131.0,Train,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.664865,0.005818,0.993795,0.914894,0.092927,1,0
2,153927.0,Train,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.710811,0.001712,0.991727,0.886525,0.078049,1,0
3,160262.0,Train,0,0,0,0,0,0,1,0,...,0.0,0.0,1.0,0.737838,0.002062,0.993795,0.907801,0.079756,1,0
4,133451.0,Train,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.762162,0.000278,0.993795,0.907801,0.093659,1,0


In [60]:
PTrain = Book_Data[Book_Data.DataSet == "Train"].fillna(0)
PTrain=PTrain.drop(['DataSet'],axis =1)
X = PTrain['BookID']
PTrain = PTrain.drop(['BookID'],axis =1)

In [61]:
PTest = Book_Data[Book_Data.DataSet == "Test"].fillna(0)
PTest=PTest.drop(['DataSet'],axis =1).reset_index(drop=True)
Y = PTest['BookID']
PTest = PTest.drop(['BookID'],axis =1)

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
L = cosine_similarity(PTest,PTrain)

Matrix = pd.DataFrame(L)

# Change the column names 
Matrix.columns =X.values
  
# Change the row indexes 
Matrix.index = Y.values

##################################################

Result =[]
for i in range(len(Matrix.columns)):
    Result.append(Matrix.iloc[:, [i]].idxmax().values)

N =pd.DataFrame(Result)
X = pd.DataFrame(X)
Final_Result = pd.concat([X,N],axis =1)
Final_Result.columns = ['BookID','PURCHASEDBOOKID']

In [69]:
Matrix.head()

Unnamed: 0,114530.0,133131.0,153927.0,160262.0,133451.0,174752.0,121628.0,138288.0,143354.0,140053.0,...,109750.0,133713.0,141049.0,133739.0,124083.0,129858.0,119832.0,129762.0,106217.0,120717.0
109615.0,0.288366,0.29207,0.29169,0.276513,0.29278,0.239931,0.262769,0.168438,0.244109,0.224955,...,0.344542,0.310854,0.356863,0.450156,0.223892,0.291725,0.311567,0.169,0.142735,0.40254
148159.0,0.164395,0.216901,0.311834,0.334665,0.220764,0.132244,0.119611,0.24028,0.246244,0.410876,...,0.168398,0.146034,0.222835,0.171163,0.10439,0.312394,0.220995,0.241136,0.20196,0.189457
117280.0,0.304656,0.211809,0.211554,0.298665,0.213103,0.156153,0.192653,0.087923,0.162482,0.1482,...,0.276226,0.332884,0.191647,0.347936,0.25003,0.211706,0.358954,0.088498,0.071474,0.388111
164937.0,0.455595,0.314446,0.241158,0.153196,0.316711,0.184589,0.288437,0.190485,0.195235,0.246589,...,0.309072,0.29647,0.254937,0.415079,0.475756,0.24169,0.363748,0.190239,0.158819,0.322251
172865.0,0.114851,0.283963,0.646507,0.255623,0.288773,0.304858,0.263358,0.312809,0.44644,0.530941,...,0.223414,0.128851,0.289214,0.170593,0.150736,0.647605,0.075259,0.313723,0.263005,0.132091


In [63]:
Final_Data.head()

Unnamed: 0,BookID,UserID,score
0,140361,608502,1.0
1,118603,673204,1.0
2,170523,641650,1.0
3,111924,619531,1.0
4,178056,628661,1.0


In [64]:
Final_Data['BookID'] = Final_Data['BookID'].map(Final_Result.set_index('BookID')['PURCHASEDBOOKID'])

In [68]:
Final_Data.shape

(369382, 3)

In [576]:
Solution = Final_Data.sort_values('score', ascending=False).drop_duplicates(['UserID','BookID']).sort_index()
#Final_Data = Final_Data.drop_duplicates(["BookID","UserID"])

In [66]:
Solution.shape

NameError: name 'Solution' is not defined

In [578]:
Solution = Solution.sort_values('score',ascending = False).groupby('UserID').head(10)
Solution.shape

(108770, 3)

In [579]:
Solution = Solution.reset_index(drop = True)

In [582]:
Solution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108770 entries, 0 to 108769
Data columns (total 2 columns):
BookID    108770 non-null float64
UserID    108770 non-null int64
dtypes: float64(1), int64(1)
memory usage: 1.7 MB


In [581]:
Solution = Solution.drop(['score'],axis=1)

In [583]:
Solution.columns = ['PURCHASEDBOOKID','USERID']

In [584]:
Final_Output = pd.DataFrame(Solution.groupby(['USERID'])['PURCHASEDBOOKID'].apply(list))

In [585]:
Final_Output.to_csv('C:\\Users\Manu\\Documents\\Untitled Folder\\final_output.csv')

In [586]:
Final_Output.shape

(10877, 1)