# Preprocessing

In [66]:
import pandas as pd
import numpy as np

In [67]:
# Load the TMDB 5000 movies and credits tables from CSV files in the working directory
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [68]:
# Keep only the columns used later in the notebook: the title for lookups, and the
# remaining text/metadata columns as sources for the combined "tags" feature.
imp_features = ["title", "genres", "keywords", "overview", "production_companies"]
movies = movies[imp_features]

In [69]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [70]:
# Attach the credits columns (movie_id, cast, crew) to the selected movie features,
# joining on the shared "title" column.
# NOTE(review): the join key is the title, not an id. If a title occurs more than once,
# the merge pairs every matching row with every other, producing mismatched duplicate
# rows. The similarity matrix later has 4809 rows — confirm that this equals the number
# of distinct movies, or join on the movie id instead (this would require keeping an id
# column in the feature selection above, if the movies table has one).
movies = movies.merge(credits, on="title")
movies.head(1)

Unnamed: 0,title,genres,keywords,overview,production_companies,movie_id,cast,crew
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [71]:
# Missing overviews become empty strings so the later per-row split() has a string to work on
movies["overview"] = movies["overview"].fillna("")

In [72]:
movies.genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [73]:
import ast

def convert(l):
    """Parse a stringified list of dicts and return each entry's "name" value.

    `l` is a string containing a Python-literal list of dicts (the form shown
    for the raw genres column); names are returned in their original order.
    """
    return [entry["name"] for entry in ast.literal_eval(l)]

In [74]:
movies.genres = movies.genres.apply(convert)

In [75]:
movies.keywords = movies.keywords.apply(convert)

In [76]:
movies.overview = movies.overview.apply(lambda x: x.split(" "))

In [77]:
movies.production_companies = movies.production_companies.apply(convert)

In [78]:
def convert_cast(l, limit=5):
    """Return the names of the first `limit` cast entries, in listed order.

    Parameters
    ----------
    l : str
        String containing a Python-literal list of dicts, each with a "name" key.
    limit : int or None, default 5
        Maximum number of names to return (expected to be non-negative);
        ``None`` returns every name. The default keeps the previous cap of five.

    Returns
    -------
    list of str
        Up to `limit` names; fewer when the list is shorter.
    """
    # Slicing replaces the manual counter and never reads entries past the cap.
    return [member["name"] for member in ast.literal_eval(l)[:limit]]

In [79]:
movies.cast = movies.cast.apply(convert_cast)

In [80]:
def convert_crew(l):
    """Return the name of the first crew entry whose job is "Director".

    `l` is a string containing a Python-literal list of dicts with "job" and
    "name" keys. An empty string is returned when no entry is a director.
    """
    director_names = (
        member["name"]
        for member in ast.literal_eval(l)
        if member["job"] == "Director"
    )
    # The generator is lazy, so scanning stops at the first director found.
    return next(director_names, "")

In [81]:
movies.crew = movies.crew.apply(convert_crew)

In [82]:
movies.head()

Unnamed: 0,title,genres,keywords,overview,production_companies,movie_id,cast,crew
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...",19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Walt Disney Pictures, Jerry Bruckheimer Films...",285,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Columbia Pictures, Danjaq, B24]",206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...","[Legendary Pictures, Warner Bros., DC Entertai...",49026,"[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...",[Walt Disney Pictures],49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton


In [83]:
# Remove the spaces inside each entry so that multi-word values become single tokens
# (e.g. "Science Fiction" -> "ScienceFiction"). Title, overview and movie_id are left alone.
for column in movies.columns:
    if column in ("title", "overview", "movie_id"):
        continue
    if column == "crew":
        # crew holds one director-name string per row at this point
        movies[column] = movies[column].apply(lambda name: name.replace(" ", ""))
    else:
        # every other remaining column holds a list of strings per row
        movies[column] = movies[column].apply(
            lambda entries: [entry.replace(" ", "") for entry in entries]
        )

movies.head()

Unnamed: 0,title,genres,keywords,overview,production_companies,movie_id,cast,crew
0,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",19995,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",JamesCameron
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...",285,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",GoreVerbinski
2,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[ColumbiaPictures, Danjaq, B24]",206647,"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",SamMendes
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[LegendaryPictures, WarnerBros., DCEntertainme...",49026,"[ChristianBale, MichaelCaine, GaryOldman, Anne...",ChristopherNolan
4,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",[WaltDisneyPictures],49529,"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",AndrewStanton


In [84]:
movies.crew = movies.crew.apply(lambda x: [x])

In [85]:
movies["tags"] = movies.genres + movies.keywords + movies.overview + movies.cast + movies.crew + movies.production_companies
movies.head(1)

Unnamed: 0,title,genres,keywords,overview,production_companies,movie_id,cast,crew,tags
0,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",19995,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [86]:
data = movies[["title", "movie_id", "tags"]].copy()
data

Unnamed: 0,title,movie_id,tags
0,Avatar,19995,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,Pirates of the Caribbean: At World's End,285,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,Spectre,206647,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,The Dark Knight Rises,49026,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,John Carter,49529,"[Action, Adventure, ScienceFiction, basedonnov..."
...,...,...,...
4804,El Mariachi,9367,"[Action, Crime, Thriller, unitedstates–mexicob..."
4805,Newlyweds,72766,"[Comedy, Romance, A, newlywed, couple's, honey..."
4806,"Signed, Sealed, Delivered",231617,"[Comedy, Drama, Romance, TVMovie, date, loveat..."
4807,Shanghai Calling,126186,"[When, ambitious, New, York, attorney, Sam, is..."


In [87]:
data.tags = data.tags.apply(lambda x : " ".join(x))

In [88]:
data.head()

Unnamed: 0,title,movie_id,tags
0,Avatar,19995,Action Adventure Fantasy ScienceFiction cultur...
1,Pirates of the Caribbean: At World's End,285,Adventure Fantasy Action ocean drugabuse exoti...
2,Spectre,206647,Action Adventure Crime spy basedonnovel secret...
3,The Dark Knight Rises,49026,Action Crime Drama Thriller dccomics crimefigh...
4,John Carter,49529,Action Adventure ScienceFiction basedonnovel m...


In [89]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [90]:
def fun_stem(text):
    """Stem each space-separated token of `text` with `ps` and rejoin with single spaces."""
    return " ".join(ps.stem(token) for token in text.split(" "))

In [91]:
data.tags = data.tags.apply(fun_stem)

# Model Training

In [92]:
data.tags[0]

'action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. samworthington zoesaldana sigourneyweav stephenlang michellerodriguez jamescameron ingeniousfilmpartn twentiethcenturyfoxfilmcorpor duneentertain lightstormentertain'

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words="english")

In [94]:
vectors = cv.fit_transform(data.tags).toarray()

In [95]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [96]:
cv.get_feature_names_out()[:100]

array(['000', '007', '10', '100', '11', '12', '13', '14', '1492pictur',
       '15', '16', '17', '18', '18th', '19', '1910', '1920', '1930',
       '1940', '1950', '1950s', '1960', '1960s', '1970', '1970s', '1980',
       '1985', '1990', '19th', '19thcenturi', '20', '200', '2009', '20th',
       '21lapsentertain', '24', '25', '2929product', '30', '300',
       '3artsentertain', '3d', '40', '40acres', '50', '500', '60', '70',
       'a24', 'aaron', 'aaroneckhart', 'aarontaylor', 'abandon', 'abduct',
       'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'abus',
       'academi', 'accept', 'access', 'accid', 'accident', 'acclaim',
       'accompani', 'accomplish', 'account', 'accus', 'ace', 'achiev',
       'act', 'action', 'actionhero', 'activ', 'activist', 'activities',
       'actor', 'actress', 'actual', 'adam', 'adambrodi', 'adamsandl',
       'adamscott', 'adamshankman', 'adapt', 'add', 'addict', 'adjust',
       'admir', 'admit', 'adolesc', 'adopt', 'ador', 'adrienbrodi',
     

In [97]:
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.07694838, 0.08134892, ..., 0.04472136, 0.        ,
        0.        ],
       [0.07694838, 1.        , 0.05564149, ..., 0.02294157, 0.        ,
        0.02473853],
       [0.08134892, 0.05564149, 1.        , ..., 0.02425356, 0.        ,
        0.        ],
       ...,
       [0.04472136, 0.02294157, 0.02425356, ..., 1.        , 0.04264014,
        0.04313311],
       [0.        , 0.        , 0.        , ..., 0.04264014, 1.        ,
        0.0919601 ],
       [0.        , 0.02473853, 0.        , ..., 0.04313311, 0.0919601 ,
        1.        ]])

In [99]:
similarity

array([[1.        , 0.07694838, 0.08134892, ..., 0.04472136, 0.        ,
        0.        ],
       [0.07694838, 1.        , 0.05564149, ..., 0.02294157, 0.        ,
        0.02473853],
       [0.08134892, 0.05564149, 1.        , ..., 0.02425356, 0.        ,
        0.        ],
       ...,
       [0.04472136, 0.02294157, 0.02425356, ..., 1.        , 0.04264014,
        0.04313311],
       [0.        , 0.        , 0.        , ..., 0.04264014, 1.        ,
        0.0919601 ],
       [0.        , 0.02473853, 0.        , ..., 0.04313311, 0.0919601 ,
        1.        ]])

In [100]:
similarity[0]

array([1.        , 0.07694838, 0.08134892, ..., 0.04472136, 0.        ,
       0.        ])

In [101]:
similarity.shape

(4809, 4809)

In [102]:
sorted(list(enumerate(similarity[0])), key= lambda X : X[1], reverse=True)[:20]

[(0, 1.0000000000000002),
 (1214, 0.31246621438986827),
 (507, 0.267600765117601),
 (539, 0.25993762245501817),
 (778, 0.25819888974716115),
 (2405, 0.25643882226608633),
 (61, 0.2480694691784169),
 (3729, 0.24748737341529164),
 (582, 0.23241742005034208),
 (1202, 0.23063280200722128),
 (4047, 0.21622499104693413),
 (2329, 0.21516574145596762),
 (1087, 0.21483446221182986),
 (1916, 0.2145290825802583),
 (220, 0.21380899352993948),
 (1192, 0.21300321680756462),
 (74, 0.21128856368212912),
 (2783, 0.20916500663351886),
 (577, 0.20519567041703082),
 (942, 0.20124611797498107)]

In [104]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``data.title``; the first matching row is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``data`` has this title.
    """
    # Work with row positions throughout so rows of `similarity` line up with rows of
    # `data` even if the frame's index labels are not 0..n-1.
    matches = np.flatnonzero((data.title == movie).to_numpy())
    if matches.size == 0:
        # Same exception type the bare ``.index[0]`` lookup raised, with a clearer message.
        raise IndexError(f"movie {movie!r} not found in data.title")
    idx = matches[0]

    scores = similarity[idx]
    ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])
    # Exclude the queried movie by position rather than assuming it sorts first:
    # a tie at the top score (or a zero self-score) would otherwise let the movie
    # appear in its own recommendations.
    movies_idx_list = [pair for pair in ranked if pair[0] != idx][:top_n]
    for position, _score in movies_idx_list:
        print(data.title.iloc[position])

In [105]:
values = sorted(enumerate(similarity[0]), reverse=True, key=lambda x:x[1])[1:6]
values

[(1214, 0.31246621438986827),
 (507, 0.267600765117601),
 (539, 0.25993762245501817),
 (778, 0.25819888974716115),
 (2405, 0.25643882226608633)]

In [106]:
n = data.title[0]
print(n)
recommend(n)

Avatar
Aliens vs Predator: Requiem
Independence Day
Titan A.E.
Meet Dave
Aliens


In [107]:
def similar_movie_list(index):
    """Return the positions of the 20 highest scores in ``similarity[index]``, best first.

    The row's own position is not filtered out, so it is normally part of the
    result (the stored lists shown in this notebook start with it).
    """
    scored = enumerate(similarity[index])
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [position for position, _score in ranked[:20]]

In [108]:
optimized_data = data.copy()

In [109]:
# Seed the column with each row's own position (0..n-1), one value per row of `data`
optimized_data["recommend_index"] = list(range(len(data)))

In [111]:
optimized_data['recommend_index'] = optimized_data['recommend_index'].apply(similar_movie_list)

In [126]:
def new_recommend(movie):
    """Print recommendations for `movie` from the precomputed ``recommend_index`` column.

    Uses the first row of ``optimized_data`` whose title equals `movie` and prints
    the titles for entries 1 through 7 of its stored index list (entry 0 is skipped).
    Raises IndexError when no row has this title.
    """
    title_matches = optimized_data.title == movie
    row_label = optimized_data.title[title_matches].index[0]
    neighbour_indices = optimized_data.recommend_index[row_label]
    for neighbour in neighbour_indices[1:8]:
        print(optimized_data['title'][neighbour])

In [127]:
new_recommend('Avatar')

Aliens vs Predator: Requiem
Independence Day
Titan A.E.
Meet Dave
Aliens
Jupiter Ascending
Falcon Rising


In [112]:
optimized_data.head()

Unnamed: 0,title,movie_id,tags,recommend_index
0,Avatar,19995,action adventur fantasi sciencefict culturecla...,"[0, 1214, 507, 539, 778, 2405, 61, 3729, 582, ..."
1,Pirates of the Caribbean: At World's End,285,adventur fantasi action ocean drugabus exotici...,"[1, 12, 199, 17, 3571, 187, 216, 2131, 848, 57..."
2,Spectre,206647,action adventur crime spi basedonnovel secreta...,"[2, 11, 29, 1344, 4076, 3339, 1745, 1719, 4345..."
3,The Dark Knight Rises,49026,action crime drama thriller dccomic crimefight...,"[3, 65, 119, 428, 1361, 299, 1360, 210, 3594, ..."
4,John Carter,49529,action adventur sciencefict basedonnovel mar m...,"[4, 3092, 1320, 3376, 610, 1255, 27, 939, 2429..."


In [113]:
data.head()

Unnamed: 0,title,movie_id,tags
0,Avatar,19995,action adventur fantasi sciencefict culturecla...
1,Pirates of the Caribbean: At World's End,285,adventur fantasi action ocean drugabus exotici...
2,Spectre,206647,action adventur crime spi basedonnovel secreta...
3,The Dark Knight Rises,49026,action crime drama thriller dccomic crimefight...
4,John Carter,49529,action adventur sciencefict basedonnovel mar m...


In [114]:
model = pd.DataFrame(similarity)
model

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4799,4800,4801,4802,4803,4804,4805,4806,4807,4808
0,1.000000,0.076948,0.081349,0.069007,0.182574,0.104713,0.038633,0.136931,0.055048,0.106600,...,0.000000,0.000000,0.041169,0.051988,0.000000,0.018634,0.045644,0.044721,0.000000,0.000000
1,0.076948,1.000000,0.055641,0.035400,0.093659,0.107434,0.039637,0.117073,0.056478,0.087496,...,0.000000,0.000000,0.021119,0.026669,0.000000,0.038236,0.000000,0.022942,0.000000,0.024739
2,0.081349,0.055641,1.000000,0.056136,0.074261,0.090862,0.020952,0.123768,0.059708,0.092499,...,0.083189,0.000000,0.000000,0.000000,0.017065,0.060634,0.000000,0.024254,0.000000,0.000000
3,0.069007,0.035400,0.056136,1.000000,0.031497,0.057807,0.053319,0.062994,0.056980,0.220684,...,0.026463,0.027277,0.056819,0.053812,0.021713,0.064293,0.000000,0.030861,0.049346,0.083195
4,0.182574,0.093659,0.074261,0.031497,1.000000,0.114708,0.070535,0.187500,0.075378,0.097312,...,0.035007,0.000000,0.075165,0.023729,0.143621,0.153093,0.000000,0.020412,0.087039,0.044023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,0.018634,0.038236,0.060634,0.064293,0.153093,0.093659,0.028796,0.136083,0.123091,0.031782,...,0.000000,0.058926,0.138086,0.058124,0.257986,1.000000,0.000000,0.000000,0.159901,0.125805
4805,0.045644,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.072169,0.000000,0.000000,0.028724,0.000000,1.000000,0.122474,0.000000,0.000000
4806,0.044721,0.022942,0.024254,0.030861,0.020412,0.018732,0.000000,0.000000,0.049237,0.019069,...,0.068599,0.035355,0.018411,0.046499,0.014072,0.000000,0.122474,1.000000,0.042640,0.043133
4807,0.000000,0.000000,0.000000,0.049346,0.087039,0.059904,0.036835,0.043519,0.104973,0.020328,...,0.000000,0.037689,0.098134,0.000000,0.150008,0.159901,0.000000,0.042640,1.000000,0.091960


In [115]:
#data.to_csv("movie_data.csv")
#model.to_csv("movie_model.csv")

In [116]:
import pickle

In [120]:
# pickle.dump(data.to_dict(), open('movie_dict.pkl', 'wb'))
# pickle.dump(model.to_dict(), open('model_dict.pkl', 'wb'))
# pickle.dump(optimized_data.to_dict(), open('optimized_movie_dict.pkl', 'wb'))

In [118]:
# api = https://api.themoviedb.org/3/movie/19995?api_key=<TMDB_API_KEY>&language=en-US  (key redacted: load it from an environment variable instead of committing it)

In [119]:
data.movie_id

0        19995
1          285
2       206647
3        49026
4        49529
         ...  
4804      9367
4805     72766
4806    231617
4807    126186
4808     25975
Name: movie_id, Length: 4809, dtype: int64