In [3]:
import os
import pandas as pd
import numpy as np
import ast


In [4]:
# Move the working directory from the notebooks folder to the matching data
# folder so the relative paths used in this notebook (the CSV read below and
# the joblib files written at the end) resolve there.
# Only path components named exactly 'notebooks' are swapped: a plain
# str.replace on the whole path would also rewrite directory names that merely
# contain the word (e.g. 'my-notebooks-repo' -> 'my-data-repo').
os.chdir(os.sep.join(
    'data' if part == 'notebooks' else part
    for part in os.getcwd().split(os.sep)
))

movies = pd.read_csv('movies_cleaned.csv')


In [5]:
# Preview the first rows to check which columns were loaded.
movies.head()


Unnamed: 0,title,poster_path,tags
0,The Godfather,https://image.tmdb.org/t/p/original//3bhkrj58V...,"['Spanning', 'the', 'years', '1945', 'to', '19..."
1,The Shawshank Redemption,https://image.tmdb.org/t/p/original//lyQBXzOQS...,"['Framed', 'in', 'the', '1940s', 'for', 'the',..."
2,The Godfather Part II,https://image.tmdb.org/t/p/original//bMadFzhjy...,"['In', 'the', 'continuing', 'saga', 'of', 'the..."
3,Schindler's List,https://image.tmdb.org/t/p/original//sF1U4EUQS...,"['The', 'true', 'story', 'of', 'how', 'busines..."
4,Dilwale Dulhania Le Jayenge,https://image.tmdb.org/t/p/original//ktejodbcd...,"['Raj', 'is', 'a', 'rich,', 'carefree,', 'happ..."


In [6]:
# Raw corpus: one entry per movie, each still the text form of a Python list
# of tokens (see the 'tags' column preview above).
# NOTE(review): 'corups' reads like a typo for 'corpus'; the name is kept
# because later cells reference it.
corups = movies['tags'].values


In [7]:
# Sanity check: the corpus should have one entry per row of `movies`.
corups.shape


(9997,)

In [8]:
from nltk.stem import PorterStemmer

# Single shared stemmer instance; convert_tags calls it for every token.
stemmer = PorterStemmer()


def convert_tags(tags):
    """Turn a stringified list of tag tokens into one stemmed, lowercase string.

    `tags` is the text form of a Python list; it is parsed with
    ast.literal_eval, every token is passed through the module-level
    `stemmer`, and the stemmed tokens are joined with single spaces.
    """
    tokens = ast.literal_eval(tags)
    stemmed_tokens = map(stemmer.stem, tokens)
    return " ".join(stemmed_tokens).lower()


In [9]:
# Try the conversion on the first movie before applying it to the whole corpus.
convert_tags(corups[0])


'span the year 1945 to 1955, a chronicl of the fiction italian-american corleon crime family. when organ crime famili patriarch, vito corleon bare surviv an attempt on hi life, hi youngest son, michael step in to take care of the would-b killers, launch a campaign of bloodi revenge. drama crime itali lossoflovedon loveatfirstsight basedonnovelorbook europ symbol patriarch organizedcrim mafia lawyer religion revengemot crimefamili sicilianmafia religioushypocrisi gunviol risetopow deadhors gangviol 1940 1950 mafiawar marlonbrando alpacino jamescaan francisfordcoppola'

In [10]:
# Build the stemmed corpus straight from the raw 'tags' column rather than
# rebinding `corups` from its own previous value. That keeps this cell
# re-runnable: a second run no longer feeds already-converted strings back
# into ast.literal_eval inside convert_tags.
corups = [convert_tags(tags) for tags in movies['tags']]


In [11]:
# Inspect the first few converted documents.
corups[:4]


['span the year 1945 to 1955, a chronicl of the fiction italian-american corleon crime family. when organ crime famili patriarch, vito corleon bare surviv an attempt on hi life, hi youngest son, michael step in to take care of the would-b killers, launch a campaign of bloodi revenge. drama crime itali lossoflovedon loveatfirstsight basedonnovelorbook europ symbol patriarch organizedcrim mafia lawyer religion revengemot crimefamili sicilianmafia religioushypocrisi gunviol risetopow deadhors gangviol 1940 1950 mafiawar marlonbrando alpacino jamescaan francisfordcoppola',
 'frame in the 1940 for the doubl murder of hi wife and her lover, upstand banker andi dufresn begin a new life at the shawshank prison, where he put hi account skill to work for an amor warden. dure hi long stretch in prison, dufresn come to be admir by the other inmat -- includ an older prison name red -- for hi integr and unquench sens of hope. drama crime prison corrupt policebrut basedonnovelorbook prisoncel delinqu

In [12]:
from sklearn.feature_extraction.text import CountVectorizer


In [13]:
# Bag-of-words vectorizer whose vocabulary is capped at 8000 terms and which
# drops scikit-learn's built-in English stop words.
# NOTE(review): the corpus is stemmed before it reaches this step (tokens such
# as 'hi' show up in the converted text, presumably from 'his'), so stemmed
# forms may not match the stop-word list — confirm whether that is acceptable.
vectorizer = CountVectorizer(max_features=8000, stop_words='english')


In [14]:
# Learn the vocabulary from the stemmed corpus and encode every document as a
# sparse row of term counts.
vectors = vectorizer.fit_transform(corups)


In [15]:
# Inspect the encoded row of the first movie.
vectors[0]


<1x8000 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [16]:
# Peek at the first 100 terms of the learned vocabulary.
vectorizer.get_feature_names_out()[:100]


array(['000', '10', '100', '11', '12', '12thcenturi', '13', '14',
       '14thcenturi', '15', '15thcenturi', '16', '16th', '16thcenturi',
       '17', '17th', '17thcenturi', '18', '18th', '18thcenturi', '19',
       '1910', '1920', '1920s', '1930', '1930s', '1936', '1937', '1940',
       '1941', '1942', '1943', '1944', '1945', '1950', '1950s', '1959',
       '1960', '1960s', '1962', '1963', '1964', '1965', '1967', '1969',
       '1970', '1970s', '1971', '1973', '1976', '1978', '1979', '1980',
       '1980s', '1984', '1986', '1987', '1988', '1990', '1992', '1999',
       '19th', '19thcenturi', '1stcenturi', '20', '2000', '2001', '2008',
       '2009', '2010', '2016', '2020', '2030', '2040', '20th', '21',
       '21st', '22', '23', '24', '25', '27', '28', '30', '300', '35',
       '39', '40', '45', '50', '500', '60', '70', '80', '95', 'aamirkhan',
       'aaron', 'aaroneckhart', 'aaronpaul', 'aarontaylor'], dtype=object)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity


In [18]:
# Exploratory check: cosine similarity of row 1 against every row of `vectors`.
# The result is a 2-D array with a single row; its entry at position 1 is the
# similarity of that movie with itself.
movie_similarities = cosine_similarity(vectors[1], vectors)
movie_similarities


array([[0.24175915, 1.        , 0.07311503, ..., 0.06200991, 0.0442374 ,
        0.06021717]])

In [19]:
def recomend(movie, k=30):
  """Return the titles of the `k` movies most similar to `movie`.

  Similarity is the cosine similarity between the count vector of the first
  row of `movies` whose title equals `movie` and every row of `vectors`.

  Args:
    movie: Exact title to look up in `movies['title']`.
    k: Maximum number of recommendations; values <= 0 give an empty list.

  Returns:
    A list of titles ordered from most to least similar. Ties keep their
    original row order. The query row itself is never included.

  Raises:
    IndexError: If no row has the given title.
  """
  # Work with the row *position*, not the index label, because `vectors` is
  # addressed positionally.
  matches = np.flatnonzero((movies['title'] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError(f"movie title not found: {movie!r}")
  index = int(matches[0])

  movie_similarities = cosine_similarity(vectors[index], vectors)[0]

  # Stable descending order: equal scores stay in ascending row order.
  order = np.argsort(-movie_similarities, kind='stable')
  # Drop the query row explicitly instead of assuming it sorts first; that
  # assumption fails when another row ties with it or its vector is all zeros.
  order = order[order != index][:max(k, 0)]
  return movies['title'].iloc[order].tolist()


In [20]:
# Try the recommender on a sample title.
recomend('Avatar')


['Skylines',
 'Battle: Los Angeles',
 'Rim of the World',
 'Aliens',
 'Titan A.E.',
 'Final Fantasy: The Spirits Within',
 'Attraction',
 'Falcon Rising',
 'Avatar: The Way of Water',
 'Valerian and the City of a Thousand Planets',
 'Stargate: Continuum',
 'Independence Day',
 'Predators',
 'Humanité déchue',
 'Aliens in the Attic',
 'Trollhunters: Rise of the Titans',
 'Dragon Ball Z: Bardock - The Father of Goku',
 'Mission to Mars',
 'Parasyte: Part 1',
 'Dark Star',
 'Small Soldiers',
 'The Cabbage Soup',
 'Explorers',
 'Enemy Mine',
 'The 5th Wave',
 'Edge of Tomorrow',
 'Lightyear',
 'Starman',
 'Encounter',
 'Extinction']

In [21]:
import joblib


In [23]:
# Save the poster_path column's values to a joblib file in the current
# working directory.
# NOTE(review): 'moview' looks like a typo for 'movie'; the file name is left
# unchanged because anything that loads it would depend on the exact name —
# confirm before renaming.
joblib.dump(movies['poster_path'].values, 'moview_posters.joblib')


['moview_posters.joblib']

In [24]:
# Save the sparse count matrix so similarities can be computed later without
# re-fitting the vectorizer.
# NOTE(review): the fitted `vectorizer` itself is not saved in the cells
# visible here — confirm it is not needed by whatever loads this file.
joblib.dump(vectors, 'moview_vectors.joblib')


['moview_vectors.joblib']

In [25]:
# Save the title column's values; they are in the same row order as `vectors`
# and the poster array, since all three come from `movies`.
joblib.dump(movies['title'].values, 'moview_title.joblib')


['moview_title.joblib']