In [1]:
import pandas as pd
import numpy as np

import ast
import nltk
import ssl
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import jaccard_score

from create_sample import SampleData



from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, expr, split
from pyspark.sql.types import FloatType, StringType
from pyspark.sql.functions import col

In [2]:
# Fetch the NLTK resources (tokenizer, stop words, WordNet) used further down.
# When the ssl module exposes an unverified-context factory, it is installed as
# the default HTTPS context before downloading.
# NOTE(review): this switches off HTTPS certificate verification for the whole
# kernel process, not only for the NLTK downloads - confirm this is intended.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andresrivera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andresrivera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andresrivera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the full movie table and draw the working sample from it.
MOVIES_CSV_PATH = "../data/final_movie.csv"
SAMPLE_SIZE = 2500

movies_df = pd.read_csv(MOVIES_CSV_PATH)
sample_obj = SampleData(movies_df)
sample_obj.create_sample(sample_size=SAMPLE_SIZE)

# The processed sample is the DataFrame every later cell works on.
sample = sample_obj.process_sample()

In [7]:
# Summarize the sample: column names, non-null counts and dtypes.
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2500 non-null   int64  
 1   keywords           2500 non-null   object 
 2   cast               2500 non-null   object 
 3   genres             2500 non-null   object 
 4   imdb_id            2500 non-null   object 
 5   original_language  2500 non-null   object 
 6   revenue            2500 non-null   float64
 7   release_date       2500 non-null   int64  
 8   spoken_languages   2500 non-null   object 
 9   title              2500 non-null   object 
 10  vote_average       2500 non-null   float64
 11  vote_count         2500 non-null   float64
 12  overview           2500 non-null   object 
 13  poster_path        2500 non-null   object 
 14  popularity         2500 non-null   float64
 15  Director           2500 non-null   object 
dtypes: float64(4), int64(2),

In [29]:


# Replace missing text fields with empty strings so the string handling in
# later cells never sees NaN.
# The filled columns are assigned back instead of calling
# `sample[col].fillna(..., inplace=True)`: that form modifies a column selection
# in place, which pandas warns about and which does not update `sample` when
# Copy-on-Write is enabled. Each column is filled once ('cast' used to be
# filled twice).
text_columns = ['cast', 'keywords', 'overview']
sample[text_columns] = sample[text_columns].fillna("")



In [7]:
# Renumber the rows 0..n-1. `drop=True` discards the old index directly instead
# of materialising it as an 'index' column and dropping that column afterwards,
# which would fail if the index were named or an 'index' column already existed.
sample.reset_index(drop=True, inplace=True)

In [9]:
# Processing text data
# NLTK resources shared by every preprocess_overview call; built on first use so
# the stop-word list and the lemmatizer are not re-created for every row.
_OVERVIEW_STOP_WORDS = None
_OVERVIEW_LEMMATIZER = None


def preprocess_overview(text):
    """Turn a free-text overview into a list of normalized word tokens.

    The text is tokenized with NLTK, tokens that are not purely alphanumeric are
    discarded, the rest are lower-cased, English stop words are removed and the
    remaining words are lemmatized with WordNet.

    Args:
        text: The overview text. Anything that is not a ``str`` (for example a
            NaN from a missing value) is treated as "no text".

    Returns:
        list[str]: The lemmatized tokens in their original order; an empty list
        when ``text`` is not a string.
    """
    global _OVERVIEW_STOP_WORDS, _OVERVIEW_LEMMATIZER

    if not isinstance(text, str):
        return []

    # Build the shared resources once; later calls reuse them.
    if _OVERVIEW_STOP_WORDS is None:
        _OVERVIEW_STOP_WORDS = frozenset(stopwords.words('english'))
    if _OVERVIEW_LEMMATIZER is None:
        _OVERVIEW_LEMMATIZER = WordNetLemmatizer()

    # Keep alphanumeric tokens only, lower-cased
    words = (token.lower() for token in word_tokenize(text) if token.isalnum())

    # Drop stop words, then lemmatize what is left
    return [
        _OVERVIEW_LEMMATIZER.lemmatize(word)
        for word in words
        if word not in _OVERVIEW_STOP_WORDS
    ]

In [8]:
def _text_or_empty(value):
    """Return ``str(value)``, or an empty string when the value is None/NaN."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def extract_keywords(row):
    """Build the combined text for one movie that is later fed to TF-IDF.

    The result is a single space-separated string made of, in order: the
    movie's keywords, its release year, the preprocessed overview tokens, the
    director names and the cast string.

    Genres are not part of this text. The previous version parsed them into a
    local variable that was never used; that dead computation is removed.

    Args:
        row: A row of the sample frame providing 'keywords', 'release_date',
            'overview', 'Director' (a string holding a Python list literal)
            and 'cast'.

    Returns:
        str: The combined keyword string.

    Raises:
        ValueError, SyntaxError: If 'Director' is not a valid Python literal
            (propagated from ``ast.literal_eval``).
    """
    # Missing keywords/cast become "" rather than the literal token "nan"
    # (keywords) or a TypeError inside join (cast).
    keywords = _text_or_empty(row['keywords'])
    cast = _text_or_empty(row['cast'])

    # 'Director' is stored as the string form of a list, e.g. "['Dany Boon']"
    directors = ' '.join(ast.literal_eval(row['Director']))

    overview_keywords = ' '.join(preprocess_overview(row['overview']))

    year = str(row['release_date'])

    return ' '.join([keywords, year, overview_keywords, directors, cast])


In [10]:
# Display the sample frame (the notebook renders the cell's last expression).
sample

Unnamed: 0,id,keywords,cast,genres,imdb_id,original_language,revenue,release_date,spoken_languages,title,vote_average,vote_count,overview,poster_path,popularity,Director
0,382597,,AlicePol DanyBoon MichelBlanc YvanAttal Patric...,"['Action', 'Comedy']",tt5736696,fr,0.0,2017,"[{'iso_639_1': 'fr', 'name': 'Français'}]",R.A.I.D. Special Unit,5.8,207.0,The story of a woman who dreams to join an int...,/vkIWKRIr5zTMFH0zEi2Y0ZyWbfO.jpg,7.538272,['Dany Boon']
1,11347,temple vampire dangerous worm derbyshire archa...,AmandaDonohoe HughGrant CatherineOxenberg Pete...,"['Comedy', 'Horror']",tt0095488,en,1189315.0,1988,"[{'iso_639_1': 'en', 'name': 'English'}]",The Lair of the White Worm,5.8,59.0,"In a remote corner of England's Peak District,...",/sSNQFYOHkey0Yke2qR3y1Y7KX6e.jpg,9.974660,['Ken Russell']
2,8902,brotherbrotherrelationship sociallydeprivedfam...,SandraCorveloni ViníciusdeOliveira AnaLuizaGar...,['Drama'],tt0803029,pt,0.0,2008,"[{'iso_639_1': 'pt', 'name': 'Português'}]",Linha de Passe,6.3,10.0,"In the periphery of São Paulo, the pregnant si...",/xKx5msZOhp71R7w9LPudDfrFQsz.jpg,0.384534,"['Walter Salles', 'Daniela Thomas']"
3,77946,rescue dog animalactor boyanddog heartwarming,ZacharyBrowne MichaelMoriarty AnnDowd ScottWilson,"['Drama', 'Family']",tt0175159,en,0.0,1999,"[{'iso_639_1': 'en', 'name': 'English'}]",Shiloh 2: Shiloh Season,6.2,6.0,After young Marty Peterson rescued Shiloh from...,/kCf8KXFZzICCPXwx7v1IivJqXz5.jpg,0.530506,['Sandy Tung']
4,50715,allnighter wigancasino,MartinCompston FelicityJones AlfieAllen Nichol...,"['Comedy', 'Drama', 'Music']",tt1259227,en,0.0,2010,"[{'iso_639_1': 'en', 'name': 'English'}]",SoulBoy,6.3,13.0,"1974. Amidst power cuts, strikes and boot-boy ...",/qrWxs48zYkNXtTJhZne61Bnk9l6.jpg,3.569699,['Shimmy Marcus']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,27554,drugaddiction love betrayal deathofpet needle,AlPacino KittyWinn AlanVint RichardBright Kiel...,"['Drama', 'Romance']",tt0067549,en,0.0,1971,"[{'iso_639_1': 'en', 'name': 'English'}]",The Panic in Needle Park,6.5,74.0,This movie is a stark portrayal of life among ...,/7k45YTnFJXVX12LRaewjWVoBGUa.jpg,8.802676,['Jerry Schatzberg']
2496,64832,,JamesFranco JulianneNicholson JoshLucas NinaAr...,"['Drama', 'Action', 'Romance']",tt1453403,en,0.0,2010,"[{'iso_639_1': 'en', 'name': 'English'}]",Shadows & Lies,4.8,7.0,The story of William Vincent as he recounts th...,/AsV9EhrvB2qoRl24YgmLqQQtCbr.jpg,1.408602,['Jay Anania']
2497,55135,smokingmarijuana neighborhood belgrade goalkeeper,SergejTrifunović MarijaKaran NebojšaGlogovac B...,['Comedy'],tt0383846,sr,0.0,2004,"[{'iso_639_1': 'sr', 'name': 'Srpski'}]","When I Grow Up, I'll Be a Kangaroo",9.0,6.0,The film consists of three parallel stories th...,/boZZ0uKdvyhobtSlec5v8TPBc6r.jpg,0.508196,"[""Radivoje 'Raša' Andrić""]"
2498,9404,helicopter martialarts australia,JackieChan BillTung JacksonLiu AnnieWu AilenSi...,"['Action', 'Adventure', 'Comedy', 'Crime', 'Dr...",tt0116704,cn,21890845.0,1996,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",First Strike,6.5,101.0,Jackie Chan reprises his role as Chan Ka-Kui (...,/9i6bhYbxe2g02e3GhljtktuyDMj.jpg,9.206356,['Stanley Tong']


In [11]:

# NOTE(review): `df_movies` is not referenced by any other cell visible here -
# confirm before removing it.
df_movies = pd.DataFrame()

# Build the per-movie text that the TF-IDF vectorizer consumes later.
# The former `sample['title'] = sample['title']` and
# `sample['genres'] = sample['genres']` lines assigned columns to themselves
# and have been dropped.
sample['combined_keywords'] = sample.apply(extract_keywords, axis=1)

In [12]:
# Parse each genres string (a Python list literal) and turn it into a set.
sample['genres_set'] = sample['genres'].map(ast.literal_eval).map(set)

In [13]:
# Serialize each genre set as a space-separated string for Spark.
# The Jaccard UDF in a later cell rebuilds the set with str.split(), so a genre
# whose name contains whitespace would be counted as several genres. Whitespace
# inside each name is therefore removed before joining. Sorting makes the string
# (and the exported CSV) reproducible, since set iteration order is not.
sample["genres_set_str"] = sample["genres_set"].apply(
    lambda genre_set: ' '.join(sorted(''.join(genre.split()) for genre in genre_set))
)
sample.drop(['genres_set'], axis=1, inplace=True)

In [14]:
# Keep the row index as an explicit column; the Spark cross join below selects
# it as movie1_index / movie2_index.
sample["movie_index"] = sample.index


In [12]:
# Start (or reuse) a local Spark session on all cores and hand the pandas
# sample over to Spark.
spark_builder = SparkSession.builder.master("local[*]").appName("MovieRecommendation")
spark = spark_builder.getOrCreate()
spark_movies = spark.createDataFrame(sample)



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/19 11:45:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/19 11:45:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [13]:
# Shut down the Spark session.
# NOTE(review): in top-to-bottom order this runs before the cells that
# crossJoin / show / toPandas `spark_movies`, which need a live session -
# confirm whether this cell belongs at the end of the notebook.
spark.stop()

In [14]:
# Diagnostic: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)

/usr/local/bin/python3


In [16]:
# Define the Jaccard similarity function as a UDF
@udf(returnType=FloatType())
def jaccard_similarity_udf(set1_str, set2_str):
    """Jaccard similarity of two whitespace-separated token strings.

    Args:
        set1_str: Space-separated tokens of the first set (None counts as empty).
        set2_str: Space-separated tokens of the second set (None counts as empty).

    Returns:
        float: |intersection| / |union|, or 0.0 when both sets are empty.
    """
    set1 = set((set1_str or "").split())
    set2 = set((set2_str or "").split())

    union_size = len(set1 | set2)
    if union_size == 0:
        # Must be a float: a Python int returned from a FloatType UDF does not
        # match the declared return type and comes back from Spark as null.
        return 0.0
    return len(set1 & set2) / union_size




# Pair every movie with every movie (Cartesian join) and score each pair with
# the Jaccard UDF on the genre strings.
movies_left = spark_movies.alias("movies1")
movies_right = spark_movies.alias("movies2")
all_movie_pairs = movies_left.crossJoin(movies_right)

spark_movies_with_similarity = all_movie_pairs.select(
    col("movies1.movie_index").alias("movie1_index"),
    col("movies2.movie_index").alias("movie2_index"),
    jaccard_similarity_udf("movies1.genres_set_str", "movies2.genres_set_str").alias("jaccard_similarity"),
)


In [17]:
# Preview the first rows of the pairwise Jaccard scores.
spark_movies_with_similarity.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------------+------------+------------------+
|movie1_index|movie2_index|jaccard_similarity|
+------------+------------+------------------+
|           0|           0|               1.0|
|           0|           1|               0.0|
|           0|           2|               0.0|
|           0|           3|             0.125|
|           0|           4|               0.0|
|           0|           5|              0.25|
|           0|           6|               0.0|
|           0|           7|               0.0|
|           0|           8|               0.0|
|           0|           9|               0.2|
|           0|          10|               0.0|
|           0|          11|               0.0|
|           0|          12|               0.0|
|           0|          13|        0.16666667|
|           0|          14|             0.125|
|           0|          15|               0.0|
|           0|          16|               0.0|
|           0|          17|               0.0|
|           0

                                                                                

In [18]:

# Collect the pairwise scores into pandas, then reshape the long
# (movie1, movie2, score) table into a matrix with one row per movie1_index
# and one column per movie2_index.
pd_movies_with_similarity = spark_movies_with_similarity.toPandas()

jaccard_pivot = pd_movies_with_similarity.pivot_table(
    values="jaccard_similarity",
    index="movie1_index",
    columns="movie2_index",
)
jaccard_similarity_matrix = jaccard_pivot.values


                                                                                

In [79]:
# Sanity check: the matrix comes from a cross join of the movies with
# themselves, so it should be square.
jaccard_similarity_matrix.shape

(1000, 1000)

In [19]:
# Vectorize the combined keyword text with TF-IDF (English stop words removed)
# and compute the pairwise cosine similarity between all movies.
stop = list(stopwords.words('english'))
unique_stop_words = list(set(stop))

tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=unique_stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(sample['combined_keywords'])

cosine_sim = cosine_similarity(tfidf_matrix)

In [38]:
# Sanity check: must match the shape of jaccard_similarity_matrix, since the
# two matrices are added element-wise in the next step.
cosine_sim.shape

(1000, 1000)

In [20]:
# Blend the two similarity matrices: the Jaccard (genre) score gets
# `jaccard_weight`, the cosine (text) score gets the remainder.
jaccard_weight = 0.25
cosine_weight = 1 - jaccard_weight

weighted_jaccard = jaccard_weight * jaccard_similarity_matrix
weighted_cosine = cosine_weight * cosine_sim
combined_matrix = weighted_jaccard + weighted_cosine

In [21]:
# Display the blended similarity matrix.
combined_matrix

array([[1.        , 0.        , 0.        , ..., 0.04166667, 0.04166667,
        0.04166667],
       [0.        , 1.        , 0.00262346, ..., 0.01747618, 0.00607356,
        0.09456949],
       [0.        , 0.00262346, 1.        , ..., 0.05      , 0.        ,
        0.00287455],
       ...,
       [0.04166667, 0.01747618, 0.05      , ..., 1.        , 0.00900217,
        0.        ],
       [0.04166667, 0.00607356, 0.        , ..., 0.00900217, 1.        ,
        0.05784787],
       [0.04166667, 0.09456949, 0.00287455, ..., 0.        , 0.05784787,
        1.        ]])

In [23]:
# List the titles in the sample (useful for picking a title to query).
sample.title

0           The Singing Detective
1                 Dear Lemon Lima
2               The Expendables 2
3                  Action Jackson
4        The Forgiveness of Blood
                  ...            
1995                      2000 AD
1996                          DNA
1997           Rustlers' Rhapsody
1998    Scooby-Doo! Frankencreepy
1999                         Boys
Name: title, Length: 2000, dtype: object

In [22]:
def recommend_movies(title, num_recommendations=10):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the global ``combined_matrix`` and titles from the
    global ``sample`` frame; row ``i`` of the matrix is taken to belong to row
    ``i`` of ``sample``.

    Args:
        title: Exact title of the movie to query. If several rows share the
            title, the first one is used.
        num_recommendations: Maximum number of titles to return.

    Returns:
        pandas.Series: Titles of the most similar movies, best match first,
        never including the queried row itself.

    Raises:
        IndexError: If no movie in ``sample`` has the given title.
    """
    # Positional row number of the queried movie (independent of index labels)
    matches = (sample['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No movie titled {title!r} in the sample")
    movie_pos = int(matches[0])

    # (position, score) pairs sorted by score, highest first
    ranked = sorted(
        enumerate(combined_matrix[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the queried movie by position rather than assuming it sorts
    # first: another movie can tie with it at the top score.
    similar_positions = [pos for pos, _ in ranked if pos != movie_pos]
    similar_positions = similar_positions[:max(num_recommendations, 0)]

    return sample['title'].iloc[similar_positions]

In [47]:
# Spot check: index label of one specific title.
is_hannibal_rising = sample['title'] == "Hannibal Rising"
movie_index = sample.loc[is_hannibal_rising].index[0]

movie_index

995

In [None]:
# Movies released in 2013. `release_date` holds the year as an integer (int64
# in sample.info()), so it must be compared with the number 2013; comparing
# with the string '2013' matches no rows.
sample[sample['release_date'] == 2013]

In [25]:
# Example query: the 10 movies most similar to one title from the sample.
recommended_movies = recommend_movies("Scooby-Doo! Frankencreepy", 10)
print(recommended_movies)

1539                    Scooby-Doo! Pirates Ahoy!
276           Barbie & Her Sisters in A Pony Tale
216                                          Ozzy
170                                  The Gruffalo
283                      Barbie in the Nutcracker
669                                     Air Mater
1421                     Tom and Jerry: The Movie
1556                       Pooh's Heffalump Movie
442     The Land Before Time VIII: The Big Freeze
471                         Monster High: Haunted
Name: title, dtype: object


In [154]:
# Wrap the blended similarity matrix in a DataFrame for export.
# A plain float ndarray replaces np.matrix, which NumPy no longer recommends;
# the resulting DataFrame is the same. Note `mat` is now an ndarray.
mat = np.asarray(combined_matrix, dtype=float)

mat_df = pd.DataFrame(data=mat)

In [156]:
# Display the similarity matrix as a DataFrame.
mat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.000000,0.045370,0.046169,0.00000,0.004542,0.000000,0.041667,0.050000,0.006731,0.007784,...,0.055803,0.050000,0.035714,0.045236,0.074890,0.000000,0.006462,0.044794,0.000000,0.000000
1,0.045370,1.000000,0.056569,0.00000,0.000000,0.000000,0.054084,0.062500,0.009307,0.050000,...,0.175092,0.076976,0.041667,0.125000,0.083333,0.007322,0.000000,0.125000,0.077099,0.032494
2,0.046169,0.056569,1.000000,0.00000,0.000000,0.007144,0.003006,0.000000,0.000000,0.129733,...,0.062500,0.005018,0.100000,0.010722,0.000000,0.008301,0.000000,0.125000,0.062500,0.009268
3,0.000000,0.000000,0.000000,1.00000,0.250000,0.000000,0.086890,0.004279,0.000000,0.083333,...,0.029948,0.000000,0.062500,0.000000,0.000000,0.125000,0.250000,0.000000,0.125000,0.088540
4,0.004542,0.000000,0.000000,0.25000,1.000000,0.000000,0.083333,0.000000,0.000000,0.083333,...,0.000000,0.000000,0.062500,0.000000,0.000000,0.132516,0.250000,0.000000,0.130717,0.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.007322,0.008301,0.12500,0.132516,0.000000,0.071638,0.000000,0.000000,0.062500,...,0.010550,0.009993,0.058309,0.000000,0.000000,1.000000,0.147623,0.000000,0.088772,0.132897
1996,0.006462,0.000000,0.000000,0.25000,0.250000,0.000000,0.088472,0.000000,0.000000,0.083333,...,0.014703,0.000000,0.067903,0.000000,0.000000,0.147623,1.000000,0.000000,0.125000,0.062500
1997,0.044794,0.125000,0.125000,0.00000,0.000000,0.000000,0.050000,0.062500,0.005808,0.131098,...,0.166667,0.065475,0.041667,0.085107,0.087883,0.000000,0.000000,1.000000,0.062500,0.000000
1998,0.000000,0.077099,0.062500,0.12500,0.130717,0.000000,0.064963,0.010671,0.000000,0.166667,...,0.089592,0.000000,0.125000,0.000000,0.005415,0.088772,0.125000,0.062500,1.000000,0.057734


In [166]:
# Write the score matrix (space-separated, no index column) and the sample
# (default CSV settings) to the graph_db data directory, and also add the same
# data to the movie_app directory.
for scores_csv_path, sample_csv_path in [
    ('graph_db/data/scores_matrix.csv', "graph_db/data/sample.csv"),
    ('movie_app/data/scores_matrix.csv', "movie_app/data/sample.csv"),
]:
    mat_df.to_csv(scores_csv_path, sep=' ', header=True, index=False)
    sample.to_csv(sample_csv_path)
