## Filtragem Baseada em conteúdo (cosseno similaridade)

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import matplotlib.pyplot as plt


In [2]:
# Load movies data
movies = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item',
    sep='|', encoding='latin-1', header=None)

# Assign column names
movies.columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
                  'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                  'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [3]:
# List of genre columns
genre_cols = ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
              'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Combine genres into a single string
movies['genres'] = movies[genre_cols].apply(lambda x: ' '.join([genre for genre in genre_cols if x[genre] == 1]), axis=1)

# Combine title and genres into a single content feature
movies['content'] = movies['title'] + ' ' + movies['genres']

movies[['movie_id', 'title', 'genres', 'content']].head()


Unnamed: 0,movie_id,title,genres,content
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story (1995) Animation Children's Comedy
1,2,GoldenEye (1995),Action Adventure Thriller,GoldenEye (1995) Action Adventure Thriller
2,3,Four Rooms (1995),Thriller,Four Rooms (1995) Thriller
3,4,Get Shorty (1995),Action Comedy Drama,Get Shorty (1995) Action Comedy Drama
4,5,Copycat (1995),Crime Drama Thriller,Copycat (1995) Crime Drama Thriller


In [8]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

In [9]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def recommend_movies(title, cosine_sim=cosine_sim):
    if title not in indices:
        return "Title not found in the dataset."
    
    idx = indices[title]
    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get scores of the top 5 similar movies
    sim_scores = sim_scores[1:6]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 5 most similar movies
    return movies['title'].iloc[movie_indices]


In [16]:
recommendations = recommend_movies('Toy Story (1995)')
print("Recommendations for 'Toy Story (1995)':")
print(recommendations)

Recommendations for 'Toy Story (1995)':
421    Aladdin and the King of Thieves (1996)
101                    Aristocats, The (1970)
403                          Pinocchio (1940)
624            Sword in the Stone, The (1963)
945             Fox and the Hound, The (1981)
Name: title, dtype: object


In [None]:
# Get similarity scores for the test movie
test_movie_idx = indices['Toy Story (1995)']
sim_scores = list(enumerate(cosine_sim[test_movie_idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]

# Plot
scores = [score for idx, score in sim_scores]
movie_titles = movies['title'].iloc[[idx for idx, score in sim_scores]]

plt.barh(movie_titles, scores)
plt.xlabel('Similarity Score')
plt.title('Top 10 Similar Movies to Toy Story (1995)')
plt.gca().invert_yaxis()
plt.show()


## Filtragem Colaborativa (cosseno similaridade)

In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt


In [18]:
# Load ratings data
ratings = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.data', sep='\t', header=None)
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Load movies data
movies = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item', sep='|', encoding='latin-1', header=None)
movies = movies[[0, 1]]
movies.columns = ['movie_id', 'title']

# Merge datasets
data = pd.merge(ratings, movies, on='movie_id')
data.head()


Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [19]:
user_item_matrix = data.pivot_table(index='user_id', columns='title', values='rating')
user_item_matrix.fillna(0, inplace=True)
user_item_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [20]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [21]:
def recommend_movies(user_id, num_recommendations=5):
    # Get user's ratings
    user_ratings = user_item_matrix.loc[user_id]
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]
    # Reindex user_item_matrix.T columns to match similar_users index
    user_item_matrix_T = user_item_matrix.T.reindex(columns=similar_users.index)
    # Compute weighted ratings
    weighted_ratings = user_item_matrix_T.dot(similar_users) / similar_users.sum()
    # Recommend movies not already rated by the user
    recommendations = weighted_ratings[~user_ratings.index.isin(user_ratings[user_ratings > 0].index)]
    # Return top N recommendations
    return recommendations.sort_values(ascending=False).head(num_recommendations)



In [22]:
user_id = 1
recommendations = recommend_movies(user_id)
print(f"Recommendations for User {user_id}:")
print(recommendations)

Recommendations for User 1:
title
Schindler's List (1993)                   2.046538
E.T. the Extra-Terrestrial (1982)         1.881429
One Flew Over the Cuckoo's Nest (1975)    1.802253
English Patient, The (1996)               1.749848
Scream (1996)                             1.701985
dtype: float64


## Redes Neurais (Híbrido)

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Keras modules
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from keras.optimizers import Adam


2024-12-03 00:17:16.799791: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-03 00:17:16.802564: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-03 00:17:16.809063: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733195836.820006    4025 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733195836.823059    4025 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 00:17:16.836315: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [35]:
# Load ratings data
ratings = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.data',
    sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load users data
users = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user',
    sep='|', header=None, names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

# Load movies data
movies = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item',
    sep='|', encoding='latin-1', header=None,
    names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
           'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
           'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
           'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])


In [50]:
# Convert user_id and movie_id to strings for consistency
ratings['user_id'] = ratings['user_id'].astype(str)
users['user_id'] = users['user_id'].astype(str)

ratings['movie_id'] = ratings['movie_id'].astype(str)
movies['movie_id'] = movies['movie_id'].astype(str)

# Merge ratings with users and movies
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

# Encode gender
gender_encoder = LabelEncoder()
data['gender_enc'] = gender_encoder.fit_transform(data['gender'])

# Encode occupation
occupation_encoder = LabelEncoder()
data['occupation_enc'] = occupation_encoder.fit_transform(data['occupation'])

# Scale age between 0 and 1
age_scaler = MinMaxScaler()
data['age_scaled'] = age_scaler.fit_transform(data[['age']])

# Use the genre columns as movie features
genre_cols = ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
              'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movie_features = genre_cols  # List of genre columns

# Encode user_id and movie_id
user_id_encoder = LabelEncoder()
data['user_id_enc'] = user_id_encoder.fit_transform(data['user_id'])

movie_id_encoder = LabelEncoder()
data['movie_id_enc'] = movie_id_encoder.fit_transform(data['movie_id'])

# Features
user_features = ['gender_enc', 'occupation_enc', 'age_scaled']

# Target variable
target = 'rating'

# Final feature set
feature_columns = ['user_id_enc', 'movie_id_enc'] + user_features + movie_features
X = data[feature_columns]
y = data[target]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)




In [51]:
num_users = data['user_id_enc'].nunique()
num_movies = data['movie_id_enc'].nunique()
num_genders = data['gender_enc'].nunique()
num_occupations = data['occupation_enc'].nunique()

# User ID Input
user_id_input = Input(shape=(1,), name='user_id_input')
user_embedding = Embedding(input_dim=num_users, output_dim=50, input_length=1, name='user_embedding')(user_id_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

# User Features Input
gender_input = Input(shape=(1,), name='gender_input')
occupation_input = Input(shape=(1,), name='occupation_input')
age_input = Input(shape=(1,), name='age_input')

# Concatenate User Features
user_features_input = Concatenate(name='user_features')([user_vec, gender_input, occupation_input, age_input])

# Movie ID Input
movie_id_input = Input(shape=(1,), name='movie_id_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=50, input_length=1, name='movie_embedding')(movie_id_input)
movie_vec = Flatten(name='movie_flatten')(movie_embedding)

# Movie Genres Input
genre_input = Input(shape=(len(movie_features),), name='genre_input')

# Concatenate Movie Features
movie_features_input = Concatenate(name='movie_features')([movie_vec, genre_input])

# Concatenate all features
combined_features = Concatenate(name='combined_features')([user_features_input, movie_features_input])

dense = Dense(128, activation='relu')(combined_features)
dense = Dropout(0.5)(dense)
dense = Dense(64, activation='relu')(dense)
dense = Dropout(0.5)(dense)
dense = Dense(32, activation='relu')(dense)
output = Dense(1)(dense)

model = Model(inputs=[user_id_input, gender_input, occupation_input, age_input, movie_id_input, genre_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')






In [56]:
train_inputs = {
    'user_id_input': X_train['user_id_enc'],
    'gender_input': X_train['gender_enc'],
    'occupation_input': X_train['occupation_enc'],
    'age_input': X_train['age_scaled'],
    'movie_id_input': X_train['movie_id_enc'],
    'genre_input': X_train[movie_features].values
}

test_inputs = {
    'user_id_input': X_test['user_id_enc'],
    'gender_input': X_test['gender_enc'],
    'occupation_input': X_test['occupation_enc'],
    'age_input': X_test['age_scaled'],
    'movie_id_input': X_test['movie_id_enc'],
    'genre_input': X_test[movie_features].values
}

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=5,  # Reduced epochs for brevity
    batch_size=256,
    verbose=1
)


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.9183 - val_loss: 1.5227
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8975 - val_loss: 1.3758
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8698 - val_loss: 1.1928
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8580 - val_loss: 1.1390
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8497 - val_loss: 1.0856


In [57]:
def recommend_movies(user_id, num_recommendations=5):
    # Ensure user_id is a string (consistent with data)
    user_id = str(user_id)
    
    # Check if user_id exists in the data
    if user_id not in users['user_id'].values:
        print("User ID not found.")
        return
    
    # Get the encoded user ID
    user_idx = user_id_encoder.transform([user_id])[0]
    
    # Get user features
    user_data = users[users['user_id'] == user_id]
    gender_enc = gender_encoder.transform(user_data['gender'])[0]
    occupation_enc = occupation_encoder.transform(user_data['occupation'])[0]
    age_scaled = age_scaler.transform([[user_data['age'].values[0]]])[0][0]
    
    # Get movies the user hasn't rated yet
    user_rated_movies = ratings[ratings['user_id'] == user_id]['movie_id'].unique()
    all_movies = movies['movie_id'].unique()
    unrated_movies = np.setdiff1d(all_movies, user_rated_movies)
    
    # Prepare inputs for prediction
    movie_ids = unrated_movies
    movie_idxs = movie_id_encoder.transform(movie_ids)
    genre_data = movies[movies['movie_id'].isin(movie_ids)][movie_features]
    
    # Create model inputs
    num_unrated_movies = len(movie_ids)
    user_id_input = np.array([user_idx] * num_unrated_movies)
    gender_input = np.array([gender_enc] * num_unrated_movies)
    occupation_input = np.array([occupation_enc] * num_unrated_movies)
    age_input = np.array([age_scaled] * num_unrated_movies)
    movie_id_input = movie_idxs
    genre_input = genre_data.values
    
    pred_inputs = {
        'user_id_input': user_id_input,
        'gender_input': gender_input,
        'occupation_input': occupation_input,
        'age_input': age_input,
        'movie_id_input': movie_id_input,
        'genre_input': genre_input
    }
    
    # Predict ratings
    preds = model.predict(pred_inputs).flatten()
    
    # Create a DataFrame with predictions
    pred_df = pd.DataFrame({
        'movie_id': movie_ids,
        'pred_rating': preds
    })
    
    # Merge with movie titles
    recommendations = pred_df.merge(movies[['movie_id', 'title']], on='movie_id')
    
    # Return top N recommendations
    recommendations = recommendations.sort_values('pred_rating', ascending=False).head(num_recommendations)
    return recommendations[['title', 'pred_rating']]


In [61]:
user_id = '1'  # Ensure user_id is a string
recommendations = recommend_movies(user_id, num_recommendations=5)
print(f"Top 5 movie recommendations for User {user_id}:")
print(recommendations)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Top 5 movie recommendations for User 1:
                         title  pred_rating
500  Santa with Muscles (1996)     3.798571
367               Faust (1994)     3.786911
642   Some Mother's Son (1996)     3.783856
368     Mina Tannenbaum (1994)     3.753992
623       Cérémonie, La (1995)     3.730782


