In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from ast import literal_eval

tmdb_data = pd.read_csv("data_files/TMDB_SomewhatCleaned.csv")
movie_col = tmdb_data.columns

In [2]:
tmdb_data.shape

(4805, 12)

In [3]:
tmdb_data.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,popularity,runtime,vote_average,vote_count,release_date
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","['Action', 'Adventure', 'Fantasy', 'ScienceFic...","['cultureclash', 'future', 'spacewar', 'spacec...","['SamWorthington', 'ZoeSaldana', 'SigourneyWea...",['JamesCameron'],150.437577,162.0,7.2,11800,2009-12-10
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","['Adventure', 'Fantasy', 'Action']","['ocean', 'drugabuse', 'exoticisland', 'eastin...","['JohnnyDepp', 'OrlandoBloom', 'KeiraKnightley']",['GoreVerbinski'],139.082615,169.0,6.9,4500,2007-05-19
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"['Action', 'Adventure', 'Crime']","['spy', 'basedonnovel', 'secretagent', 'sequel...","['DanielCraig', 'ChristophWaltz', 'LéaSeydoux']",['SamMendes'],107.376788,148.0,6.3,4466,2015-10-26
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"['Action', 'Crime', 'Drama', 'Thriller']","['dccomics', 'crimefighter', 'terrorist', 'sec...","['ChristianBale', 'MichaelCaine', 'GaryOldman']",['ChristopherNolan'],112.31295,165.0,7.6,9106,2012-07-16
4,49529,John Carter,"John Carter is a war-weary, former military ca...","['Action', 'Adventure', 'ScienceFiction']","['basedonnovel', 'mars', 'medallion', 'spacetr...","['TaylorKitsch', 'LynnCollins', 'SamanthaMorton']",['AndrewStanton'],43.926995,132.0,6.1,2124,2012-03-07


In [13]:
rate_data = pd.read_csv("../data_files/10000_rate_with_right_id.csv")
rate_data.columns

Index(['Unnamed: 0', 'DTG', 'cast', 'crew', 'data_or_rate', 'genres',
       'keywords', 'movie_id', 'overview', 'popularity', 'release_date',
       'runtime', 'title', 'user_age', 'user_gender', 'user_id',
       'user_occupation', 'user_rating', 'vote_count', 'vote_average',
       'kafka_id'],
      dtype='object')

In [14]:
rate_data['user_age'].isna().sum()

71

In [5]:
[col for col in rate_data.columns if col not in tmdb_data.columns]

['Unnamed: 0',
 'DTG',
 'data_or_rate',
 'user_age',
 'user_gender',
 'user_id',
 'user_occupation',
 'user_rating',
 'kafka_id']

In [6]:
movie_col = list(movie_col)
movie_col.append('kafka_id')

In [7]:
movie_data = rate_data[movie_col]
movie_data_nonduplicate = movie_data.drop_duplicates(ignore_index=True)
movie_data.shape, movie_data_nonduplicate.shape

((10000, 13), (5878, 13))

In [8]:
movie_data_nonduplicate['genres'][0]

"['Documentary', 'History', 'Music']"

In [9]:
movie_data_nonduplicate.index

RangeIndex(start=0, stop=5878, step=1)

In [14]:
# movie_ids = movie_data['movie_id'].unique()
movie_idx_id = dict([(k, v) for k,v in zip(movie_data_nonduplicate.index, movie_data_nonduplicate['kafka_id'])])
movie_id_idx = dict([(k, v) for k,v in zip(movie_data_nonduplicate['kafka_id'], movie_data_nonduplicate.index)])

In [11]:
movie_data_nonduplicate['overview'] = movie_data_nonduplicate['overview'].fillna('')
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movie_data_nonduplicate['overview'])
tfidf_matrix.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['overview'] = movie_data_nonduplicate['overview'].fillna('')


(5878, 172215)

In [1]:
import pickle
with open("content_recommend/tfidf_matrix.pkl", "rb") as f:
    tfidf = pickle.load(f)

In [2]:
tfidf.shape

(5878, 172215)

In [14]:
movie_data_nonduplicate['cast'] = movie_data_nonduplicate['cast'].apply(literal_eval)
movie_data_nonduplicate['crew'] = movie_data_nonduplicate['crew'].apply(literal_eval)
movie_data_nonduplicate['keywords'] = movie_data_nonduplicate['keywords'].apply(literal_eval)
movie_data_nonduplicate['genres'] = movie_data_nonduplicate['genres'].apply(literal_eval)
movie_data_nonduplicate['cast_size'] = movie_data_nonduplicate['cast'].apply(lambda x: len(x))
movie_data_nonduplicate['crew_size'] = movie_data_nonduplicate['crew'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['cast'] = movie_data_nonduplicate['cast'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['crew'] = movie_data_nonduplicate['crew'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['keywords

In [18]:
s = movie_data_nonduplicate.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

  s = movie_data_nonduplicate.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)


In [22]:
s = s.value_counts()

In [27]:
s.shape

(4796,)

In [23]:
s = s[s > 1]
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
movie_data_nonduplicate['keywords'] = movie_data_nonduplicate['keywords'].apply(filter_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['keywords'] = movie_data_nonduplicate['keywords'].apply(filter_keywords)


In [24]:
movie_data_nonduplicate['soup'] = movie_data_nonduplicate['keywords'] + movie_data_nonduplicate['cast'] + movie_data_nonduplicate['crew'] + movie_data_nonduplicate['genres']
movie_data_nonduplicate['soup'] = movie_data_nonduplicate['soup'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['soup'] = movie_data_nonduplicate['keywords'] + movie_data_nonduplicate['cast'] + movie_data_nonduplicate['crew'] + movie_data_nonduplicate['genres']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_nonduplicate['soup'] = movie_data_nonduplicate['soup'].apply(lambda x: ' '.join(x))


In [25]:
movie_data_nonduplicate['soup'][0]

'LarsUlrich DeathAngel TomAngelripper RickErnst Documentary History Music'

In [28]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
count_matrix = count.fit_transform(movie_data_nonduplicate['soup'])
count_matrix.shape

(5870, 70892)

In [61]:
count_matrix[[0,1]].sum(0).shape

(1, 70892)

In [29]:
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [39]:
movie_id_rate = rate_data[rate_data['user_id'] == 797760][['kafka_id', 'user_rating']]
feat = []
for x, y in zip(movie_id_rate['kafka_id'], movie_id_rate['user_rating']):
    rate_norm = (y-3.0)/2.0
    movie_idx = movie_id_idx[x]
    feat.append(tfidf_matrix[movie_idx] * rate_norm)
feat = sum(feat)
cos1 = linear_kernel(tfidf_matrix, feat)
sim_scores = list(enumerate(cos1))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:21]
movie_ids = [movie_idx_id[i[0]] for i in sim_scores]

29124.0 4


In [None]:
from sklearn.metrics import mean_squared_error
def get_y_actual(user_id, rate_data, movie_id_idx, movie_idx_id, tfidf_matrix):
    movie_id_rate = rate_data[rate_data['user_id'] == user_id][['kafka_id', 'user_rating']]
    feat = []
    for x, y in zip(movie_id_rate['kafka_id'], movie_id_rate['user_rating']):
        rate_norm = (y-3.0)/2.0
        movie_idx = movie_id_idx[x]
rms = mean_squared_error(y_actual, y_predicted, squared=False)

In [12]:
def get_recommendations(user_id):
    movie_id_rate = rate_data[rate_data['user_id'] == user_id][['kafka_id', 'user_rating']]
    feat = []
    for x, y in zip(movie_id_rate['kafka_id'], movie_id_rate['user_rating']):
        rate_norm = (y-3.0)/2.0
        movie_idx = movie_id_idx[x]
        feat.append(tfidf_matrix[movie_idx] * rate_norm)
    feat = sum(feat)
    cos1 = linear_kernel(tfidf_matrix, feat)
    sim_scores = list(enumerate(cos1))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_ids = [movie_idx_id[i[0]] for i in sim_scores]
    return movie_ids

In [21]:
movie_id_rate = rate_data[rate_data['user_id'] == user_ids[361]][['movie_id', 'user_rating']]
for x, y in zip(movie_id_rate['movie_id'], movie_id_rate['user_rating']):
    print(movie_id_rate)
    print(movie_id_idx[x])

     movie_id  user_rating
361       NaN            4


KeyError: nan

In [15]:
user_ids = list(rate_data['user_id'].unique())
content_recommendation = dict()
for user_id in user_ids:
    content_recommendation[user_id] = get_recommendations(user_id)

In [17]:
import pickle
with open("contant_recommend.pkl", "wb") as f:
    pickle.dump(content_recommendation, f)

In [20]:
# base_recommendation
base_rate = rate_data[['kafka_id', 'vote_average']].sort_values(by='vote_average', ascending=False)
base_recommendation = list(base_rate[:20]['kafka_id'])
with open("base_recommend.pkl", "wb") as f:
    pickle.dump(base_recommendation, f)

In [18]:
with open("user_id.pkl", "wb") as f:
    pickle.dump(user_ids, f)

In [20]:
import pandas as pd

val = pd.read_csv("train_data.csv")
val.head(5)

Unnamed: 0,user_id,title_year,user_rating,tmdb_id,title,overview,genres,cast,crew,keywords,popularity,release_date,runtime,vote_average,vote_count,age,occupation,gender
0,545008,harry+potter+and+the+half-blood+prince+2009,2,767.0,Harry Potter and the Half-Blood Prince,"As Harry begins his sixth year at Hogwarts, he...","['Adventure', 'Fantasy', 'Family']","['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...",['David Yates'],"['london,england', 'witch', 'magic', 'dyingand...",19.083723,2009-07-07,153.0,7.4,5435.0,19.0,scientist,M
1,323296,leon+the+professional+1994,4,101.0,Leon: The Professional,"Leon, the top hit man in New York, has earned ...","['Thriller', 'Crime', 'Drama']","['Jean Reno', 'Natalie Portman', 'Gary Oldman']",['Luc Besson'],"['newyorkcity', 'corruption', 'lossoflovedone'...",20.477329,1994-09-14,110.0,8.2,4293.0,28.0,college/grad student,M
2,426278,armageddon+1998,4,95.0,Armageddon,When an asteroid threatens to collide with Ear...,"['Action', 'Thriller', 'ScienceFiction', 'Adve...","['Bruce Willis', 'Billy Bob Thornton', 'Ben Af...",['Michael Bay'],"['savingtheworld', 'paris,france', 'moon', 'wa...",13.235112,1998-07-01,151.0,6.5,2540.0,28.0,scientist,M
3,358548,the+parallax+view+1974,4,17365.0,The Parallax View,An ambitious reporter gets in way-over-his-hea...,"['Crime', 'Drama', 'Thriller']","['Warren Beatty', 'Paula Prentiss', 'William D...",['Alan J. Pakula'],"['corruption', 'assassination', 'basedonnovelo...",4.43233,1974-06-14,102.0,6.9,70.0,34.0,executive/managerial,M
4,64177,x-men+days+of+future+past+2014,5,127585.0,X-Men: Days of Future Past,The ultimate X-Men ensemble fights a war for t...,"['Action', 'Adventure', 'Fantasy', 'ScienceFic...","['Hugh Jackman', 'James McAvoy', 'Michael Fass...",['Bryan Singer'],"['1970s', 'mutant', 'timetravel', 'basedoncomi...",26.058586,2014-05-15,131.0,7.5,6155.0,32.0,executive/managerial,M


In [21]:
val.dtypes

user_id           int64
title_year       object
user_rating       int64
tmdb_id         float64
title            object
overview         object
genres           object
cast             object
crew             object
keywords         object
popularity      float64
release_date     object
runtime         float64
vote_average    float64
vote_count      float64
age             float64
occupation       object
gender           object
dtype: object

In [22]:
val.shape

(127719, 18)

In [23]:
val['age'].isna().sum()

1

In [25]:
val.columns

Index(['user_id', 'title_year', 'user_rating', 'tmdb_id', 'title', 'overview',
       'genres', 'cast', 'crew', 'keywords', 'popularity', 'release_date',
       'runtime', 'vote_average', 'vote_count', 'age', 'occupation', 'gender'],
      dtype='object')

In [24]:
tmdb = pd.read_csv("../data_files/TMDB_SomewhatCleaned.csv")
tmdb.columns

Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
       'popularity', 'runtime', 'vote_average', 'vote_count', 'release_date'],
      dtype='object')

In [11]:
import numpy as np

def find_least_euclidean_distance_indices(A, B):
    # Compute the Euclidean distance between each row in A and each row in B
    distances = np.linalg.norm(A[:, np.newaxis, :] - B, axis=2)
    # Find the indices of the rows in B with the minimum distances for each row in A
    min_indices = np.argmin(distances, axis=1)
    return min_indices


A = np.array([[1, 2], [3, 4], [5, 6]])
B = np.array([[1, 2], [2, 3], [4, 5], [5, 7]])

indices = find_least_euclidean_distance_indices(A, B)

print(indices)  # Output: [0 0 2]


[0 1 3]


In [13]:
np.array([1,2,3])[[0,2]]

array([1, 3])