In [1]:
# 0. Importacion de librerias
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score


In [2]:
# 1. Carga de datos
ratings_df = pd.read_csv('data/ratings.csv')
movies_df = pd.read_csv('data/movies.csv')

In [3]:
# 2. Explorar y limpiar los datos
print(ratings_df.info())
print(movies_df.info())

# Imprimir cabeceras

print(ratings_df.head())
print(movies_df.head())

# Imprimir columnas

print(ratings_df.columns)
print(movies_df.columns)

# Verificar valores nulos
print(ratings_df.isnull().sum())
print(movies_df.isnull().sum())

# Eliminar filas con valores nulos si existen
ratings_df = ratings_df.dropna()
movies_df = movies_df.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId   

In [4]:
# 3. Implementar algoritmo de filtrado colaborativo basado en ítems

# Crear matriz de usuarios-películas
user_movie_ratings = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Calcular la similitud entre películas
movie_similarity = cosine_similarity(user_movie_ratings.T)

In [5]:
# 4. Evaluar el rendimiento del sistema

# Dividir los datos en conjuntos de entrenamiento y prueba
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Función para obtener recomendaciones
def get_recommendations(user_id, n=5):
  user_ratings = user_movie_ratings.loc[user_id]
  similar_scores = pd.DataFrame(movie_similarity, columns=user_movie_ratings.columns, index=user_movie_ratings.columns)
  similar_movies = similar_scores.mul(user_ratings, axis=1).sum().sort_values(ascending=False)
  return similar_movies.index[:n].tolist()

# Calcular precisión y exhaustividad
def calculate_metrics(test_data):
  true_positives = 0
  false_positives = 0
  false_negatives = 0

  for user_id in test_data['userId'].unique():
      user_test_ratings = test_data[test_data['userId'] == user_id]
      recommended_movies = get_recommendations(user_id)

      for movie in recommended_movies:
          if movie in user_test_ratings['movieId'].values:
              true_positives += 1
          else:
              false_positives += 1

      false_negatives += len(set(user_test_ratings['movieId']) - set(recommended_movies))

  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)

  return precision, recall

precision, recall = calculate_metrics(test_data)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Precision: 0.19
Recall: 0.03


In [6]:
# 5. Función de recomendación
def recommend_movies(user_id, n=5):
  recommended_movie_ids = get_recommendations(user_id, n)
  recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
  return recommended_movies[['title', 'genres']].to_dict('records')

In [7]:
# 6. Ejemplo de uso
user_id = 1
recommendations = recommend_movies(user_id)
print(f"Recomendaciones para el usuario {user_id}:")
for movie in recommendations:
  print(f"- {movie['title']} ({movie['genres']})")

# Created/Modified files during execution:
print("No se crearon ni modificaron archivos durante la ejecución.")

Recomendaciones para el usuario 1:
- Reservoir Dogs (1992) (Crime|Mystery|Thriller)
- Big Lebowski, The (1998) (Comedy|Crime)
- Ghostbusters (a.k.a. Ghost Busters) (1984) (Action|Comedy|Sci-Fi)
- Who Framed Roger Rabbit? (1988) (Adventure|Animation|Children|Comedy|Crime|Fantasy|Mystery)
- Spaceballs (1987) (Comedy|Sci-Fi)
No se crearon ni modificaron archivos durante la ejecución.


Resolucion mejorada implementando SVD

In [8]:
# 0. Importacion de librerias

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from scipy.sparse.linalg import svds

In [9]:
# 1. Carga de datos
ratings_df = pd.read_csv('data/ratings.csv')
movies_df = pd.read_csv('data/movies.csv')

In [10]:
# 2. Explorar y limpiar los datos
print(ratings_df.info())
print(movies_df.info())

# Imprimir cabeceras

print(ratings_df.head())
print(movies_df.head())

# Imprimir columnas

print(ratings_df.columns)
print(movies_df.columns)

# Verificar valores nulos
print(ratings_df.isnull().sum())
print(movies_df.isnull().sum())

# Eliminar filas con valores nulos si existen
ratings_df = ratings_df.dropna()
movies_df = movies_df.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId   

In [11]:
# 3. Crear matriz de usuarios-películas
user_movie_ratings = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [12]:
# 4. Normalizar las calificaciones
user_ratings_mean = np.mean(user_movie_ratings.values, axis=1)
user_movie_ratings_demeaned = user_movie_ratings.values - user_ratings_mean.reshape(-1, 1)

In [13]:
# 5. Realizar SVD
U, sigma, Vt = svds(user_movie_ratings_demeaned, k=50)

In [14]:
# 6. Reconstruir la matriz de predicciones
sigma_diag_matrix = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma_diag_matrix), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(predicted_ratings, columns=user_movie_ratings.columns, index=user_movie_ratings.index)

def recommend_movies(user_id, num_recommendations=5):
  user_row = preds_df.loc[user_id].sort_values(ascending=False)
  already_rated = set(ratings_df[ratings_df['userId'] == user_id]['movieId'])
  recommendations = []
  for movie_id in user_row.index:
      if movie_id not in already_rated:
          recommendations.append(movie_id)
          if len(recommendations) == num_recommendations:
              break
  return recommendations

In [15]:
# 7. Calcular precision y exahustividad del modelo
def evaluate_model(test_data, threshold=3.5):
  true_positives = 0
  false_positives = 0
  false_negatives = 0

  for _, row in test_data.iterrows():
      user_id = row['userId']
      movie_id = row['movieId']
      actual_rating = row['rating']
      predicted_rating = preds_df.loc[user_id, movie_id]

      if predicted_rating >= threshold and actual_rating >= threshold:
          true_positives += 1
      elif predicted_rating >= threshold and actual_rating < threshold:
          false_positives += 1
      elif predicted_rating < threshold and actual_rating >= threshold:
          false_negatives += 1

  precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
  recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

  return precision, recall

# Dividir datos en entrenamiento y prueba
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Evaluar el modelo
precision, recall = evaluate_model(test_data)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.96
Recall: 0.28


In [16]:
# 8. Ejemplo de uso
user_id = 1
recommended_movie_ids = recommend_movies(user_id)
recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
print(f"\nRecomendaciones para el usuario {user_id}:")
for _, movie in recommended_movies.iterrows():
  print(f"- {movie['title']} ({movie['genres']})")

# Created/Modified files during execution:
print("No se crearon ni modificaron archivos durante la ejecución.")


Recomendaciones para el usuario 1:
- Godfather, The (1972) (Crime|Drama)
- Die Hard (1988) (Action|Crime|Thriller)
- Godfather: Part II, The (1974) (Crime|Drama)
- Jaws (1975) (Action|Horror)
- Breakfast Club, The (1985) (Comedy|Drama)
No se crearon ni modificaron archivos durante la ejecución.
