# K VECINOS CERCANOS

In [1]:
import pandas as pd

# Remote locations of the two TMDB 5000 CSV files
movies_url = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv"
credits_url = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv"

# Read both files into DataFrames (movies first, then credits)
movies_df, credits_df = (pd.read_csv(url) for url in (movies_url, credits_url))

# Preview the first rows of each frame
for frame in (movies_df, credits_df):
    print(frame.head())


      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [2]:
# Join each movie with its credits on the TMDB id rather than on the title.
# Titles are not guaranteed to be unique (remakes can share a name), and a
# title join would pair such films with each other's cast/crew and add
# spurious rows. The credits copy of 'title' is dropped so the merged frame
# keeps a single, unsuffixed 'title' column.
merged_df = pd.merge(
    movies_df,
    credits_df.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Keep only the columns used to build the recommender
columns_to_keep = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
merged_df = merged_df[columns_to_keep]

print(merged_df.head())


   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  [{"id": 28, "name": "Action"}, {"id":

In [4]:
import ast
import json

import pandas as pd

# Remote locations of the two TMDB 5000 CSV files
movies_url = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv"
credits_url = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv"

# Load both datasets into DataFrames
movies_df = pd.read_csv(movies_url)
credits_df = pd.read_csv(credits_url)

# Join each movie with its credits on the TMDB id rather than on the title.
# Titles are not guaranteed to be unique (remakes can share a name), and a
# title join would pair such films with each other's cast/crew and add
# spurious rows. The credits copy of 'title' is dropped so the merged frame
# keeps a single, unsuffixed 'title' column.
merged_df = pd.merge(
    movies_df,
    credits_df.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Keep only the columns used to build the recommender
columns_to_keep = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
merged_df = merged_df[columns_to_keep]

def extract_names(json_str):
    """Return the ``name`` field of every object in a JSON-encoded list.

    Args:
        json_str: Text such as ``'[{"id": 28, "name": "Action"}]'``.
            Missing values (``None``/``NaN``) and blank strings yield ``[]``.

    Returns:
        list: The names, in the order they appear in the input.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    if pd.isna(json_str) or not json_str.strip():
        return []
    try:
        # json.loads understands null/true/false, which ast.literal_eval rejects
        items = json.loads(json_str)
    except ValueError:
        # Fall back for Python-literal text (e.g. single-quoted keys)
        items = ast.literal_eval(json_str)
    return [item['name'] for item in items]

def extract_director(crew_str):
    """Return the name of the first crew member whose job is ``'Director'``.

    Args:
        crew_str: JSON-encoded list of crew objects carrying ``job`` and
            ``name`` keys. Missing values (``None``/``NaN``) and blank
            strings yield ``''``.

    Returns:
        str: The director's name, or ``''`` when no entry has that job.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            Python literal.
    """
    if pd.isna(crew_str) or not crew_str.strip():
        return ''
    try:
        # json.loads understands null/true/false, which ast.literal_eval rejects
        crew = json.loads(crew_str)
    except ValueError:
        # Fall back for Python-literal text (e.g. single-quoted keys)
        crew = ast.literal_eval(crew_str)
    # .get() lets entries without a 'job' key be skipped instead of raising
    return next((member['name'] for member in crew if member.get('job') == 'Director'), '')

def extract_top_cast(cast_str, top_n=3):
    """Return the names of the first ``top_n`` entries of a cast list.

    Args:
        cast_str: JSON-encoded list of cast objects carrying a ``name`` key.
            Missing values (``None``/``NaN``) and blank strings yield ``[]``.
        top_n: How many leading entries to keep (default 3). Entries are
            taken in the order they appear in the input.

    Returns:
        list: Up to ``top_n`` names.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            Python literal.
        KeyError: If a kept entry has no ``'name'`` key.
    """
    if pd.isna(cast_str) or not cast_str.strip():
        return []
    try:
        # json.loads understands null/true/false, which ast.literal_eval rejects
        cast = json.loads(cast_str)
    except ValueError:
        # Fall back for Python-literal text (e.g. single-quoted keys)
        cast = ast.literal_eval(cast_str)
    return [member['name'] for member in cast[:top_n]]

# Parse the JSON-encoded columns into plain Python values
for json_column in ('genres', 'keywords'):
    merged_df[json_column] = merged_df[json_column].apply(extract_names)
merged_df['cast'] = merged_df['cast'].apply(extract_top_cast, top_n=3)
merged_df['crew'] = merged_df['crew'].apply(extract_director)


def _overview_words(text):
    """Split an overview into words; a missing overview gives an empty list."""
    if pd.notna(text):
        return text.split()
    return []


def _strip_spaces(value):
    """Remove spaces from a string, or from every string of a list."""
    if isinstance(value, list):
        return [entry.replace(" ", "") for entry in value]
    return value.replace(" ", "")


def _row_tags(row):
    """Concatenate overview, genres, keywords, cast and director into one string."""
    pieces = [
        ' '.join(row['overview']),
        ' '.join(row['genres']),
        ' '.join(row['keywords']),
        ' '.join(row['cast']),
        row['crew'].replace(" ", ""),
    ]
    return ' '.join(pieces)


# Turn the overview text into a list of words
merged_df['overview'] = merged_df['overview'].apply(_overview_words)

# Collapse multi-word values so each one becomes a single token
for column in ['genres', 'cast', 'crew', 'keywords']:
    merged_df[column] = merged_df[column].apply(_strip_spaces)

# Build the 'tags' column used as the text representation of each movie
merged_df['tags'] = merged_df.apply(_row_tags, axis=1)

print(merged_df[['title', 'tags']].head())


                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                                tags  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words encoding of the tags: at most 5000 terms, English stop words removed
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(merged_df['tags'])
vectors = tag_counts.toarray()

# Cosine similarity between every pair of rows of `vectors`
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Reads the module-level ``merged_df`` and ``similarity`` objects.

    Args:
        movie: Exact title to look up in ``merged_df['title']``. When several
            rows share the title, the first one is used.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``merged_df`` has that title.
    """
    # Look the movie up by row position: `similarity` is indexed by position,
    # which only coincides with the index labels for a default RangeIndex.
    matches = (merged_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Movie not found in merged_df: {movie!r}")
    movie_index = int(matches[0])
    distances = similarity[movie_index]

    # Rank every *other* row by descending similarity. The query row is
    # excluded by position instead of by dropping the first sorted entry,
    # which is not guaranteed to be the query itself when scores tie.
    ranked = sorted(
        (i for i in range(len(distances)) if i != movie_index),
        key=lambda i: distances[i],
        reverse=True,
    )

    for i in ranked[:max(top_n, 0)]:
        print(merged_df['title'].iloc[i])

# Example usage: print the recommendations for "Avatar"
recommend("Avatar")


Titan A.E.
Small Soldiers
Independence Day
Aliens vs Predator: Requiem
Battle: Los Angeles
