In [1]:
import pandas as pd 
import numpy as np 
import nltk
from sklearn.metrics.pairwise import cosine_similarity , linear_kernel
from nltk.tokenize import  word_tokenize
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sentence_transformers import SentenceTransformer
import re
import pickle as pkl





In [2]:
books = pd.read_csv("BooksDataset.csv")

In [3]:
books = books[~books["Description"].isna()]

In [4]:
stop_words = set(stopwords.words("english"))

def clean_data(text):

    if( isinstance(text,str)):

        text = text.lower()

        test = re.sub("\S*@\S*","",text)         # Removing gmail ids

        text= re.sub("http\S*","",text)          # Removing links

        text = re.sub("\d","",text)              # Removing digits

        text = re.sub("\s[a-zA-Z]\s"," ",text)  # Removing single alphabets

        text = re.sub("\s+[^a-zA-Z]+\s+","",text)    #keeping english letters only

        text = re.sub("\[[^+]*\]","",text)     # Removing squate brackets and the content inside it

        text = re.sub("[#-]+","",text)         # Removing hashtags and -  

        text = re.sub("\n","",text)            # Removing newlines 

        text = re.sub("\s\s+"," ",text)         #removing More then one spaces

        text = word_tokenize(text)

        text = [i for i in text if i not in stop_words and len(i) > 2]  #Removing stopwords and two letter words

        return text
    
    else:
        print(type(text))
        return " "




    

In [5]:
books["New_Description"] = books["Description"].apply(clean_data)

In [6]:
books["New_Description"].apply(len).value_counts()

New_Description
18      1590
20      1577
16      1546
21      1534
17      1502
        ... 
1073       1
650        1
470        1
392        1
447        1
Name: count, Length: 441, dtype: int64

In [7]:
books = books[books["New_Description"] != ""]

In [8]:
lemmatizer = WordNetLemmatizer()
def data_stemmer(text):

    
    text = [lemmatizer.lemmatize(i) for i in text]

    return " ".join(text)

In [9]:
books["New_Description"] = books["New_Description"].apply(data_stemmer)

In [10]:
books["New_Description_len"] = books["New_Description"].apply(len)

books["Description_len"] = books["Description"].apply(len)

In [11]:
books.loc[:,["New_Description_len","Description_len"]].head()

Unnamed: 0,New_Description_len,Description_len
7,122,176
8,142,197
10,311,437
11,1990,2724
13,140,244


In [12]:
books = books[ books["New_Description_len"] > books["New_Description_len"].mean() + 150 ]

In [13]:
books["Title"]

11        Germs : Biological Weapons and America's Secre...
14                                All over but the Shoutin'
19                   Hill Rat: Blowing the Lid Off Congress
27                                              Shadow Song
31        Codebreakers' Victory: How the Allied Cryptoga...
                                ...                        
103066                                Africa, Third Edition
103067    A Coup Attempt in Washington: A European Mirro...
103068                                      The Iraq Papers
103069                               Like A Sister: A Novel
103072                 EVA: The Real Key to Creating Wealth
Name: Title, Length: 20812, dtype: object

In [14]:
books.reset_index(drop=True,inplace=True)

## JUST LIKE THIS REPEAT THE PROCESS FOR MOVIES

In [15]:
data1 = pd.read_csv(r"tmdb_5000_credits.csv")
data2 = pd.read_csv(r"tmdb_5000_movies.csv")
data1.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [16]:
data1.columns = ['id', 'title', 'cast', 'crew']
movies = data2.merge(data1, on = "id")

In [17]:
movies ["new_overview"]= movies["overview"].apply(clean_data)
movies = movies[movies["new_overview"] != " "]

<class 'float'>
<class 'float'>
<class 'float'>


In [18]:
movies["new_overview"] = movies["new_overview"].apply(data_stemmer)

In [19]:
movies["old_len"] = movies["overview"].apply(len)

movies["new_len"] = movies["new_overview"].apply(len)

In [20]:
movies.loc[:,["old_len","new_len"]].head()

Unnamed: 0,old_len,new_len
0,175,123
1,176,111
2,240,181
3,428,338
4,342,242


### Now combine books and movies data to get the whole dataset

In [21]:
books.columns

Index(['Title', 'Authors', 'Description', 'Category', 'Publisher',
       'Publish Date', 'Price', 'New_Description', 'New_Description_len',
       'Description_len'],
      dtype='object')

In [22]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew', 'new_overview', 'old_len',
       'new_len'],
      dtype='object')

In [23]:
books.rename(columns= {"New_Description" : "Story" , "Description" : "origionalStory"},inplace=True)

In [24]:
movies.rename(columns= {"original_title" : "Title" , "new_overview" : "Story", "overview" : "origionalStory"},inplace=True)

In [25]:
books = books[["Story",'origionalStory',"Title"]]
books["isBook"] = True

In [26]:
movies = movies[["Story","origionalStory","Title"]]
movies["isBook"] = False

#  False of isBook refers movies

In [27]:
data = pd.concat([books,movies])

In [28]:
data

Unnamed: 0,Story,origionalStory,Title,isBook
0,deadly germ sprayed shopping mall bomblet spew...,"Deadly germs sprayed in shopping malls, bomb-l...",Germs : Biological Weapons and America's Secre...,True
1,new york time notable book yearthis haunting h...,A New York Times Notable Book of the YearThis ...,All over but the Shoutin',True
2,top aide powerful texas congressman ronald col...,As a top aide to powerful Texas Congressman Ro...,Hill Rat: Blowing the Lid Off Congress,True
3,wonderful summer great memory kind love everyb...,"It was a wonderful summer, a great memory, the...",Shadow Song,True
4,first time ever veteran world war cryptographe...,"For the first time ever, veteran World War II ...",Codebreakers' Victory: How the Allied Cryptoga...,True
...,...,...,...,...
4798,mariachi want play guitar carry family traditi...,El Mariachi just wants to play his guitar and ...,El Mariachi,False
4799,newlywed couple honeymoon upended arrival resp...,A newlywed couple's honeymoon is upended by th...,Newlyweds,False
4800,signed sealed delivered introduces dedicated q...,"""Signed, Sealed, Delivered"" introduces a dedic...","Signed, Sealed, Delivered",False
4801,ambitious new york attorney sam sent shanghai ...,When ambitious New York attorney Sam is sent t...,Shanghai Calling,False


In [29]:
data = data[~data["Title"].duplicated()]

In [30]:
data.reset_index(inplace=True,drop=True)

In [31]:
data["Title"]

0        Germs : Biological Weapons and America's Secre...
1                                All over but the Shoutin'
2                   Hill Rat: Blowing the Lid Off Congress
3                                              Shadow Song
4        Codebreakers' Victory: How the Allied Cryptoga...
                               ...                        
24575                                          El Mariachi
24576                                            Newlyweds
24577                            Signed, Sealed, Delivered
24578                                     Shanghai Calling
24579                                    My Date with Drew
Name: Title, Length: 24580, dtype: object

In [32]:
TFIDS = TfidfVectorizer(stop_words='english')

tfids = TFIDS.fit_transform(data["Story"])
tfids

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2194827 stored elements and shape (24580, 129370)>

In [33]:
cosine_sim = linear_kernel(tfids,tfids)

In [34]:
len(data)

24580

In [35]:
len(cosine_sim)

24580

In [36]:
len(cosine_sim[0])

24580

In [37]:
cosine_sim[0][1:].argmax()

2486

In [38]:
reverse_index = pd.DataFrame( data.index )

In [39]:
reverse_index.index = data["Title"]

In [40]:
reverse_index.head()

Unnamed: 0_level_0,0
Title,Unnamed: 1_level_1
Germs : Biological Weapons and America's Secret War,0
All over but the Shoutin',1
Hill Rat: Blowing the Lid Off Congress,2
Shadow Song,3
Codebreakers' Victory: How the Allied Cryptogaphers Won World War II,4


In [41]:
data.head()

Unnamed: 0,Story,origionalStory,Title,isBook
0,deadly germ sprayed shopping mall bomblet spew...,"Deadly germs sprayed in shopping malls, bomb-l...",Germs : Biological Weapons and America's Secre...,True
1,new york time notable book yearthis haunting h...,A New York Times Notable Book of the YearThis ...,All over but the Shoutin',True
2,top aide powerful texas congressman ronald col...,As a top aide to powerful Texas Congressman Ro...,Hill Rat: Blowing the Lid Off Congress,True
3,wonderful summer great memory kind love everyb...,"It was a wonderful summer, a great memory, the...",Shadow Song,True
4,first time ever veteran world war cryptographe...,"For the first time ever, veteran World War II ...",Codebreakers' Victory: How the Allied Cryptoga...,True


In [42]:
def getRecommendatioFromTitle(title , cos_sim = cosine_sim ,includes = "both" , top = 10):

    index = reverse_index.loc[title].values[0]

    get_similar = list(enumerate(cos_sim[index]))

    get_similar = sorted(get_similar ,key = lambda x : x[1], reverse=True)

    get_similar = get_similar


    indices =  [i for i,j in get_similar]

    recommendations = data.iloc[indices]

    if(includes == "books"):    

        recommendations =  recommendations[recommendations["isBook"]]

    elif(includes == "movies"):

        recommendations =  recommendations[~recommendations["isBook"]]

    recommendations.reset_index(inplace = True,drop=True)



    return recommendations[["Title","isBook","origionalStory"]].iloc[0:top].reset_index(drop=True)


In [43]:
def getRecommendatioFromStory(text , includes = "both" , top = 10):
    
    text = clean_data(text)
    text = [data_stemmer(text)]
    encoded_text = TFIDS.transform(text)
    
    curr_cossim = linear_kernel(encoded_text,tfids)
    
    get_similar = list(enumerate(curr_cossim[0]))

    get_similar = sorted(get_similar ,key = lambda x : x[1], reverse=True)

    get_similar = get_similar


    indices =  [i for i,j in get_similar]

    recommendations = data.iloc[indices]

    if(includes == "books"):    

        recommendations =  recommendations[recommendations["isBook"]]

    elif(includes == "movies"):

        recommendations =  recommendations[~recommendations["isBook"]]

    recommendations.reset_index(inplace = True,drop=True)



    return recommendations[["Title","isBook","origionalStory"]].iloc[0:top].reset_index(drop=True)


In [44]:
# allData["cosine_sim"] = cosine_sim
# allData["Data"] = data
# allData["reverse_index"] = reverse_index
# allData["TFIDS"] = TFIDS
# allData["EncodedStories"] = tfids 
# pkl.dump(allData , open("DataDump.pkl","wb"))

In [45]:
getRecommendatioFromTitle("The Godfather: Part III")


Unnamed: 0,Title,isBook,origionalStory
0,The Godfather: Part III,False,In the midst of trying to legitimize his busin...
1,The Godfather: Part II,False,In the continuing saga of the Corleone crime f...
2,The Godfather Returns,True,THE MISSING YEARS FROM THE GREATEST CRIME SAGA...
3,The Godfather Returns: A Novel,True,THE MISSING YEARS FROM THE GREATEST CRIME SAGA...
4,The Godfather,False,"Spanning the years 1945 to 1955, a chronicle o..."
5,Police Academy: Mission to Moscow,False,The Russians need help in dealing with the Maf...
6,We Own the Night,False,A New York nightclub manager tries to save his...
7,Aging: A Natural History (Scientific American ...,True,"The process of aging is familiar to, and usual..."
8,Dawn Patrol,False,After the brutal murder of his beloved brother...
9,"You, Staying Young: The Owner's Manual for Ext...",True,Drs. Oz and Roizen—the bestselling coauthors o...


In [46]:
getRecommendatioFromTitle("Dawn Patrol")

Unnamed: 0,Title,isBook,origionalStory
0,Dawn Patrol,False,After the brutal murder of his beloved brother...
1,Chairman of the Board,False,A surfer becomes the head of a major company.
2,Dead Man's Shoes,False,A soldier returns home to his small town and e...
3,Porky's,False,"Set in 1954, a group of Florida high schoolers..."
4,The Sting,False,Set in the 1930's this intricate caper deals w...
5,친절한 금자씨,False,After a 13-year imprisonment for the kidnap an...
6,Meet the Deedles,False,Two surfers end up as Yellowstone park rangers...
7,Furious 7,False,Deckard Shaw seeks revenge against Dominic Tor...
8,West Side Story,False,In the slums of the upper West Side of Manhatt...
9,Torque,False,Biker Cary Ford is framed by an old rival and ...


In [47]:
story = input("Enter the few lines of story you like : ")

In [48]:
story

'A man who nerver lived an peaseful life cant got place into the hevan'

In [49]:
getRecommendatioFromStory(story,includes="both")

Unnamed: 0,Title,isBook,origionalStory
0,"Drop Dead, My Lovely",True,"Meet Pete Ingalls, a tough-talking, hard-boile..."
1,Room,False,Jack is a young boy of 5 years old who has liv...
2,A Quiet Place,True,"""Sometimes a person needs a quiet place."" A p..."
3,The Way of the Wild Heart: A Map for the Mascu...,True,This is a book about how a boy?and a man?becom...
4,Snow Dogs,False,When a Miami dentist inherits a team of sled d...
5,Human Traffic,False,"All that exists now is clubs, drugs, pubs and ..."
6,Explorers Who Got Lost,True,During the fifteenth and sixteenth centuries j...
7,The Present: The Gift That Makes You Happier a...,True,Another Spencer Johnson #1 Bestseller#1 New Yo...
8,The Lost Lore of a Man's Life: Lots of Cool St...,True,"To restore men's rightful heritage, Denis Boyl..."
9,Elsewhere: A Novel,True,Is it possible to grow up while getting younge...
