# KNN

## Import Libraries

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from pickle import dump
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
import pandas as pd
import json
import sqlite3

## Import Data

In [63]:
tmdb_5000_movies = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv")

tmdb_5000_credits = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv")

In [64]:
tmdb_5000_movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [65]:
tmdb_5000_credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Creation of a database

In [66]:
con = sqlite3.connect("movies.db")
cur = con.cursor()

tmdb_5000_movies.to_sql('movies', con, if_exists='replace', index=False)
tmdb_5000_credits.to_sql('credits', con, if_exists='replace', index=False)

4803

In [67]:
query = pd.read_sql_query('''
        SELECT *
        FROM movies
        JOIN credits ON movies.title = credits.title
        ''', con)
con.close()

In [68]:
total_data = query[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [69]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   title     4809 non-null   object
 3   overview  4806 non-null   object
 4   genres    4809 non-null   object
 5   keywords  4809 non-null   object
 6   cast      4809 non-null   object
 7   crew      4809 non-null   object
dtypes: int64(1), object(7)
memory usage: 300.7+ KB


In [70]:
total_data = total_data.T.drop_duplicates().T

In [71]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   object
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: object(7)
memory usage: 263.1+ KB


In [72]:
total_data.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [73]:
def json_value(json_str, default_value = None):
    try:
        return json.loads(json_str) 
    except json.JSONDecodeError:
        return None

In [74]:
total_data["genres"] = total_data["genres"].apply(lambda x: [item["name"] for item in json.loads(x)] if pd.notna(x) else None)
total_data["keywords"] = total_data["keywords"].apply(lambda x: [item["name"] for item in json.loads(x)] if pd.notna(x) else None)
total_data["cast"] = total_data["cast"].apply(lambda x: [item["name"] for item in json.loads(x)[:3]] if pd.notna(x) else None)
total_data["crew"] = total_data["crew"].apply(lambda x: [item["name"] for item in json_value(x) if item["job"] == "Director"] if pd.notna(x) else None)
total_data["overview"] = total_data["overview"].apply(lambda x: [x])


In [75]:
total_data.head(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In the 22nd century, a paraplegic Marine is d...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain Barbossa, long believed to be dead, h...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,[A cryptic message from Bond’s past sends him ...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,[Following the death of District Attorney Harv...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John Carter is a war-weary, former military c...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [76]:
remove_spaces = lambda items: [item.replace(" ", "") for item in items] if isinstance(items, list) else items

In [77]:
total_data["genres"] = total_data["genres"].apply(remove_spaces)
total_data["cast"] = total_data["cast"].apply(remove_spaces)
total_data["crew"] = total_data["crew"].apply(remove_spaces)
total_data["keywords"] = total_data["keywords"].apply(remove_spaces)

In [78]:
total_data["tags"] = total_data["overview"] + total_data["genres"] + total_data["keywords"] + total_data["cast"] + total_data["crew"] 

In [79]:
def concatenate_list(lists):
    cleaned_list = [x for x in lists if isinstance(x, str) and x is not None]
    return ' '.join(cleaned_list)

In [80]:
total_data["tags"] = total_data["tags"].apply(concatenate_list)

In [81]:
total_data["tags"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [82]:
vectorizer = TfidfVectorizer()

vectors_movies = vectorizer.fit_transform(total_data["tags"])


In [83]:
model = NearestNeighbors(n_neighbors=6, algorithm="brute", metric="cosine")
model.fit(vectors_movies)

In [105]:
def movie_recommend(movie):
    movie_index = total_data[total_data["title"] == movie].index
    if not movie_index.empty:
        movie_index = movie_index[0]
        distances, indices = model.kneighbors(vectors_movies[movie_index])
        movie_list = [(total_data["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
        return movie_list[1:]
    else:
        print(f"The movie '{movie}' is not found in the dataset.")
        return None

In [110]:
recommended_movies = movie_recommend("Interstellar")

if recommended_movies:
    for movie, distance in recommended_movies:
        print(f"- film: {movie}")

- film: Transformers: Revenge of the Fallen
- film: Transformers: Age of Extinction
- film: Transformers: Dark of the Moon
- film: I Am Sam
- film: Brothers


In [115]:
recommended_movies = movie_recommend("The Flash")

if recommended_movies:
    for movie, distance in recommended_movies:
        print(f"- film: {movie}")

The movie 'The Flash' is not found in the dataset.
