In [9]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [4]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
credits.columns = ['id', 'title', 'cast', 'crew']
movies = movies.merge(credits, on='id')

In [6]:
movies['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [7]:
movies['overview'] = movies['overview'].fillna('')

In [8]:
def create_join(x):
    return ''.join(x['keywords']) + '' + ''.join(x['genres']) + '' + ''.join(x['overview'])
movies['join'] = movies.apply(create_join, axis=1)

In [9]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrices = tfidf.fit_transform(movies['join'])
tfidf_matrices.shape

(4803, 32768)

In [10]:
cosine_similarity = linear_kernel(tfidf_matrices,tfidf_matrices)

In [11]:
indices = pd.Series(movies.index, index=credits['title']).drop_duplicates()

In [25]:
def Recommendation_get(title, cosine_similarity = cosine_similarity):
    idx = indices[title]
    similar_scores = list(enumerate(cosine_similarity[idx]))
    similar_scores = sorted(similar_scores, key = lambda x: x[1], reverse = True)
    similar_scores = similar_scores[1:21]
    movie_indices = [i[0] for i in similar_scores]
    return credits['title'].iloc[movie_indices]

In [24]:
title = input(str('Enter a movie title that you watched recently: '))
print("\t")
print("If you like that then you might like these .........")
Recommendation_get(title, cosine_similarity)

Enter a movie title that you watched recently: Skyfall
	
If you like that then you might like these .........


2                               Spectre
1343              Never Say Never Again
4339                             Dr. No
4071              From Russia with Love
3143                You Only Live Twice
3285                           Restless
3343                   Live and Let Die
2675               The Spy Who Loved Me
11                    Quantum of Solace
3251    On Her Majesty's Secret Service
1743                          Octopussy
147                     Die Another Day
1200               The Living Daylights
3336               Diamonds Are Forever
1713                 For Your Eyes Only
425                 Mission: Impossible
2428                  Brooklyn's Finest
139             Mission: Impossible III
1077                     Johnny English
3351        The Man with the Golden Gun
Name: title, dtype: object

In [1]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nRow, nCol = df.shape
    columnNames = list(df)
    nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [2]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [3]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna('columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [15]:
#plot the top twenty (you can top whatever you want) movies as bar plot 
topten_scores=m_movies['score'].head(20) 
#get the titles 
topten_titles=m_movies['title'].head(20) 
#plot the top twenty movies 
sb.set_style('whitegrid')
plt.figure(figsize=(12,8))
plt.barh(topten_titles,topten_scores, align='center',color='#C6870A')
plt.gca().invert_yaxis()
plt.xlabel(" Movie Scores")
plt.title("Top Ten Movies")

NameError: name 'm_movies' is not defined