In [2]:
import pandas as pd
import numpy as np
import os
import sklearn 
import requests

In [3]:
df_details = pd.read_csv("movies_details.csv")
df_credits = pd.read_csv("movies_credits.csv")

In [4]:
df_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10003 entries, 0 to 10002
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  10003 non-null  bool   
 1   backdrop_path          9992 non-null   object 
 2   belongs_to_collection  2495 non-null   object 
 3   budget                 10003 non-null  int64  
 4   genres                 10003 non-null  object 
 5   homepage               4335 non-null   object 
 6   id                     10003 non-null  int64  
 7   imdb_id                9996 non-null   object 
 8   origin_country         10003 non-null  object 
 9   original_language      10003 non-null  object 
 10  original_title         10003 non-null  object 
 11  overview               10001 non-null  object 
 12  popularity             10003 non-null  float64
 13  poster_path            9997 non-null   object 
 14  production_companies   10003 non-null  object 
 15  pr

In [5]:
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10003 entries, 0 to 10002
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10003 non-null  int64 
 1   cast    10003 non-null  object
 2   crew    10003 non-null  object
dtypes: int64(1), object(2)
memory usage: 234.6+ KB


### Creating new dataframe from important columns

In [6]:
# combining two dataframe together
new_df = pd.merge(df_details, df_credits, on = 'id')

In [7]:
new_df.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'origin_country', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'video', 'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [8]:
# New dataframe from given columns
movies = new_df[['id', 'original_language', 'tagline', 'title', 'genres', 'overview', 'production_companies', 'production_countries', 'cast', 'crew']].copy()

In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    10055 non-null  int64 
 1   original_language     10055 non-null  object
 2   tagline               8446 non-null   object
 3   title                 10055 non-null  object
 4   genres                10055 non-null  object
 5   overview              10053 non-null  object
 6   production_companies  10055 non-null  object
 7   production_countries  10055 non-null  object
 8   cast                  10055 non-null  object
 9   crew                  10055 non-null  object
dtypes: int64(1), object(9)
memory usage: 785.7+ KB


## Cleaning The Data

In [10]:
movies.isnull().sum()

id                         0
original_language          0
tagline                 1609
title                      0
genres                     0
overview                   2
production_companies       0
production_countries       0
cast                       0
crew                       0
dtype: int64

In [11]:
movies.dropna(inplace=True)

In [12]:
movies = movies.reset_index(drop=True)

In [13]:
movies['id'].duplicated().sum()

75

In [14]:
movies['id'] = movies['id'].drop_duplicates()

In [15]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8445 entries, 0 to 8444
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    8370 non-null   float64
 1   original_language     8445 non-null   object 
 2   tagline               8445 non-null   object 
 3   title                 8445 non-null   object 
 4   genres                8445 non-null   object 
 5   overview              8445 non-null   object 
 6   production_companies  8445 non-null   object 
 7   production_countries  8445 non-null   object 
 8   cast                  8445 non-null   object 
 9   crew                  8445 non-null   object 
dtypes: float64(1), object(9)
memory usage: 659.9+ KB


## Now Fetching data from string list and dict

In [16]:
# This function will fetch the data from some coulmns 
import ast

def fetchname(obj):
    try:
        list = []
        data = ast.literal_eval(obj)
        #if isinstance(data, dict) and "name" in data:
        for i in data:
            list.append(i.get("name"))
            #list.append(i["name"])
    except (KeyError, TypeError):
        return None
    return list


#### Fetching Genres of each movies

In [17]:
fetchname("[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 53, 'name': 'Thriller'}]")

['Drama', 'Action', 'Crime', 'Thriller']

In [18]:
movies['genres'] = movies['genres'].apply(fetchname)

#### Fetching Production Companies Names

In [19]:
movies['production_countries'][0]

"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]"

In [20]:
movies['production_companies'].apply(fetchname)

0       [Legendary Pictures, Syncopy, Warner Bros. Pic...
1       [Legendary Pictures, Syncopy, Lynda Obst Produ...
2       [DC, Legendary Pictures, Syncopy, Isobel Griff...
3       [Dune Entertainment, Lightstorm Entertainment,...
4       [20th Century Fox, The Donners' Company, Genre...
                              ...                        
8440      [Gaumont, Kairos, Canal+, TF1 Films Production]
8441    [Pathé Entertainment, Ufland, Metro-Goldwyn-Ma...
8442    [Relativity Media, Phoenix Pictures, Rising St...
8443    [Watchout Studio, TVN Group, Next Film, ITI Ne...
8444                [Indigo Film, France 2 Cinéma, Pathé]
Name: production_companies, Length: 8445, dtype: object

In [21]:
fetchname(movies['production_countries'][0])

['United Kingdom', 'United States of America']

In [22]:
movies['production_countries'] = movies['production_countries'].apply(fetchname)



In [23]:
movies['production_companies'] = movies['production_companies'].apply(fetchname)

In [24]:

def director_name(crew):
    list = []
    for crew in ast.literal_eval(crew):
        if crew['job'] == 'Director':
            dict_of_director = crew['name']
            #print(i['job'] == 'Director')
            list.append(dict_of_director)
    return list

In [25]:
director_name(movies['crew'][0])

['Christopher Nolan']

In [26]:
movies['crew'] = movies['crew'].apply(director_name)

### Fetching Actors Names

In [27]:
def fetch_actor(cast):
    name = []
    count = 0
    for actor in ast.literal_eval(cast):
        if count <= 3:
            name.append(actor['name'])
            count += 1
        else:
            break
    return name

In [28]:
fetch_actor(movies['cast'][0])

['Leonardo DiCaprio', 'Joseph Gordon-Levitt', 'Ken Watanabe', 'Tom Hardy']

In [29]:
movies['cast'] = movies['cast'].apply(fetch_actor)

### Transformation of columns to lists

In [30]:
# Transform columns from string to lists
split_columns = ['original_language', 'tagline', 'overview']
for i in split_columns:
    movies[i] = movies[i].apply(lambda x: x.split())

### Removing Spaces Between Each Words

In [31]:
# Remove all spaces between words
spaces_columns = ['crew', 'genres', 'production_companies', 'cast']
for col in spaces_columns:
    movies[col] = movies[col].map(lambda x: [i.replace(' ', '') for i in x])

***Making new columns to combine all the columns in one single column***

In [32]:
movies['tags'] = movies['tagline'] + movies['genres'] + movies['overview'] + movies['original_language'] + movies['production_companies'] + movies['production_countries'] + movies['cast'] + movies['crew']

In [33]:
# removing extra columns
movies = movies[['id', 'title', 'tags']]

In [34]:
movies.head()

Unnamed: 0,id,title,tags
0,27205.0,Inception,"[Your, mind, is, the, scene, of, the, crime., ..."
1,157336.0,Interstellar,"[Mankind, was, born, on, Earth., It, was, neve..."
2,155.0,The Dark Knight,"[Welcome, to, a, world, without, rules., Drama..."
3,19995.0,Avatar,"[Enter, the, world, of, Pandora., Action, Adve..."
4,293660.0,Deadpool,"[Feel, the, love., Action, Adventure, Comedy, ..."


***Now we are going to apply some other import methods such as stemming and victorization and so on to prepate it and change the text into lower case***

## Stemming

In [35]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def feature_engg(text):
    """
    This function takes a text as input, transforms it into lowercase, and applies stemming. 
    Returns the transformed text as a string.
    we can also use the stopwords function to but we can do this easily while vicotorization
    """
    
    # convert upper case letter to lowercase 
    text = [i.lower() for i in text]
    
    # Applying stemming to words
    word_stemming = []
    for word in text:
        word_stemming.append(stemmer.stem(word))
    
    # Joining the words back into a string
    return " ".join(word_stemming)

In [36]:
movies['tags'].apply(feature_engg)

0       your mind is the scene of the crime. action sc...
1       mankind wa born on earth. it wa never meant to...
2       welcom to a world without rules. drama action ...
3       enter the world of pandora. action adventur fa...
4       feel the love. action adventur comedi the orig...
                              ...                        
8440    if you can't rememb your past...you can't save...
8441    in 1984, betti mahmoody' husband took hi wife ...
8442    time bring all thing to light. drama thriller ...
8443    time change. the stake remain the same drama t...
8444    everyth is not enough. drama intern releas dir...
Name: tags, Length: 8445, dtype: object

In [37]:
movies['tags'] = movies['tags'].apply(feature_engg)

## Victoriazations

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=7000, stop_words='english')

In [39]:
vectors = cv.fit_transform(movies['tags']).toarray()

In [40]:
cv.get_feature_names_out()[10:50]

array(['15', '16', '17', '17th', '18', '18th', '19', '1920', '1920s',
       '1930', '1940', '1942', '1943', '1944', '1945', '1950', '1950s',
       '1959', '1960', '1960s', '1962', '1964', '1967', '1970', '1970s',
       '1972', '1973', '1974', '1976', '1978', '1980', '1980s', '1984',
       '1987', '1988', '1990', '1991', '1999', '19th', '20'], dtype=object)

In [41]:
vectors.shape

(8445, 7000)

## Cosine similarity

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [43]:
similarity.shape

(8445, 8445)

### Functiona for similarity Score

In [44]:
indx  = movies[movies['title'] == 'Avatar'].index
movies.iloc[indx]

Unnamed: 0,id,title,tags
3,19995.0,Avatar,enter the world of pandora. action adventur fa...


In [45]:
sorted(list(enumerate(similarity[1])), reverse = True, key = lambda x: x[1])[1:6]

[(6695, 0.36329950727537746),
 (132, 0.3583914681524164),
 (648, 0.34806170919108287),
 (2711, 0.3442651863295482),
 (6648, 0.3431001569771103)]

In [46]:
movies.iloc[6695]['title']

'Space Chimps'

In [47]:
def similarity_score(title):
    indx = movies[movies['title'] ==  title].index
    similar_array = similarity[indx][0]
    fivesimilar_array = sorted(list(enumerate(similar_array)), reverse = True, key = lambda x: x[1])[1:6]
    
    for i in fivesimilar_array:
        five_movies = movies.iloc[i[0]]['title']
        print(five_movies)
    

In [48]:
similarity_score('Avatar')

Avatar: The Way of Water
Skylines
Krull
MirrorMask
Aliens in the Attic


## Saving Model in pickle

In [52]:
import pickle
with open('recommendation_system.pkl', 'wb') as f:
    pickle.dump(similarity, f)

In [51]:
with open("recommendation_data.pkl", 'wb') as f:
    pickle.dump(movies, f)