# **MOVIE RECOMMENDATION SYSTEM** 

**SIMILAR MOVIES:**

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB




def get_similar_movies(movie_df, title, indices, cos_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with a 'title' column whose row positions line up with the
        rows and columns of ``cos_sim``.
    title : str
        Title to look up in ``indices``.
    indices : mapping (e.g. pandas.Series indexed by title)
        Maps a title to its row position in ``movie_df`` / ``cos_sim``.
    cos_sim : 2-D array-like
        Pairwise similarity matrix; ``cos_sim[i][j]`` is the similarity
        between rows ``i`` and ``j``.
    top_n : int, default 10
        Maximum number of similar titles to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices:
        raise KeyError('Movie title not found: {!r}'.format(title))

    curr_index = indices[title]
    # When several movies share a title, a Series lookup returns every
    # matching position; use the first one so we index a single row.
    if isinstance(curr_index, pd.Series):
        curr_index = curr_index.iloc[0]

    # Exclude the queried movie by position. Relying on it sorting first is
    # wrong when scores tie or when its similarity row is all zeros.
    sim_scores = [
        (position, similarity)
        for position, similarity in enumerate(cos_sim[curr_index])
        if position != curr_index
    ]

    # Stable sort: equally similar movies keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]

    return movie_df['title'].iloc[movie_indices]


def main(csv_path='/content/tmdb_5000_movies.csv', search_movie_title='Kids'):
    """Print the movies whose overviews are most similar to one title.

    Loads the movies CSV, builds a TF-IDF matrix from the 'overview' column,
    computes pairwise cosine similarity and prints the result of
    ``get_similar_movies`` for ``search_movie_title``.

    Parameters
    ----------
    csv_path : str
        Location of the movies CSV; it must contain 'overview' and 'title'
        columns.
    search_movie_title : str
        Title to find similar movies for.
    """
    movies_df = pd.read_csv(csv_path)

    # Replace missing overviews with empty strings before vectorizing
    movies_df['overview'] = movies_df['overview'].fillna('')

    # Build the TF-IDF matrix, ignoring English stop words in the overviews
    tf_idf_vectorizer_object = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tf_idf_vectorizer_object.fit_transform(movies_df['overview'])

    # Calculate the cosine similarity between every pair of overviews
    cos_sim = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

    # Construct a map from title to row index. De-duplicate on the index
    # (the titles): Series.drop_duplicates() would compare the values, i.e.
    # the row numbers, which are already unique, and so would keep repeated
    # titles in the map.
    title_to_index = pd.Series(movies_df.index, index=movies_df['title'])
    title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]

    print('Movies similar to ' + search_movie_title + ' are:')
    print(get_similar_movies(movies_df, search_movie_title, title_to_index, cos_sim))

    

# Entry point: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Movies similar to Raising Cain are:
1254                      Get Carter
4217                            Kids
4161     The Marine 4: Moving Target
1626          My Super Ex-Girlfriend
4                        John Carter
3563           Paranormal Activity 4
231                   Monsters, Inc.
2854    Def Jam's How to Be a Player
2375                Midnight Special
1307                   The Hurricane
Name: title, dtype: object


**TOP 10 MOVIES:**

In [None]:
import pandas as pd
import numpy as np

# Module-level defaults bound into score()'s signature when it is defined.
# main() computes the real values from the data and passes them explicitly.
minimum_votes = 0
mean_rating = 0


def score(df, m=minimum_votes, c=mean_rating):
    """Weighted rating that pulls movies with few votes toward the mean.

    Computes ``v / (v + m) * r + m / (v + m) * c`` where ``v`` is
    ``df['vote_count']`` and ``r`` is ``df['vote_average']``.

    Parameters
    ----------
    df : pandas.DataFrame, pandas.Series or mapping
        Anything indexable by 'vote_count' and 'vote_average'. A whole
        DataFrame yields a Series of scores; a single row yields a scalar.
    m : number
        Vote-count weight given to the mean rating ``c``.
    c : number
        Mean rating that scores are pulled toward.

    Returns
    -------
    Scalar or pandas.Series of weighted ratings.
    """
    v = df['vote_count']
    r = df['vote_average']

    # The parentheses matter: without them `v / v + m` evaluates to `1 + m`.
    total_weight = v + m
    return (v / total_weight) * r + (m / total_weight) * c


def main(csv_path='/content/tmdb_5000_movies.csv'):
    """Print the ten movies with the highest weighted rating.

    Parameters
    ----------
    csv_path : str
        Location of the movies CSV; it must contain 'vote_average',
        'vote_count' and 'original_title' columns.
    """
    movies_df = pd.read_csv(csv_path)

    # Mean rating of all movies
    overall_mean = movies_df['vote_average'].mean()

    # Minimum votes should be at least the 75th percentile
    vote_threshold = movies_df['vote_count'].quantile(0.75)

    top_movies = movies_df.loc[movies_df['vote_count'] >= vote_threshold].copy()

    # Pass the computed threshold and mean explicitly; score()'s defaults
    # are the module-level zeros. One vectorised call scores every row.
    top_movies['score'] = score(top_movies, m=vote_threshold, c=overall_mean)

    top_movies = top_movies.sort_values('score', ascending=False)

    print("The list of top 10 movies are:"+"\n"+str(top_movies[['original_title', 'score', ]].head(10)))


# Entry point: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()


The list of top 10 movies are:
                original_title  score
1881  The Shawshank Redemption    8.5
3337             The Godfather    8.4
2294                  千と千尋の神隠し    8.3
3865                  Whiplash    8.3
2731    The Godfather: Part II    8.3
3232              Pulp Fiction    8.3
1818          Schindler's List    8.3
662                 Fight Club    8.3
2170                    Psycho    8.2
1847                GoodFellas    8.2


**MODEL FITTING:**

In [None]:
    # Fit and evaluate two classifiers (random forest, then Gaussian naive Bayes)
    # on a single positional feature column of the movies CSV.
    # NOTE(review): this cell uses pd, train_test_split, StandardScaler,
    # RandomForestClassifier, GaussianNB and the metric functions imported in the
    # first code cell, so that cell must have been run first.
    movies_df = pd.read_csv('/content/tmdb_5000_movies.csv')
    #movies_df.head(10)
    # Recode two language codes as the strings "0" and "1"; any other value in
    # the column is left as it is.
    movies_df ["original_language"]= movies_df ["original_language"].replace("en", "0")
    movies_df ["original_language"]= movies_df ["original_language"].replace("fr", "1")
    #print(movies_df)

    # Select the feature and the target by column position (no scaling happens
    # on these two lines). X is a one-column DataFrame (position 18); y is a
    # Series (position 19).
    # NOTE(review): which columns sit at positions 18 and 19 depends on the CSV
    # layout -- confirm they are the intended feature and label.
    X = movies_df.iloc[:,18:19]
    y = movies_df.iloc[:,19]

    # Split into train and test sets: 25% of the rows are held out for testing,
    # and random_state fixes the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
    
    # Standardize the feature: the scaler is fitted on the training split only
    # and the same transform is then applied to the test split.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    #print(X_train)
    #print(X_test)

    
    # Fit the models.
    # RANDOM FOREST: 10 trees, entropy split criterion, fixed seed.
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # The confusion matrix is computed but only the commented-out print uses it.
    c_matrix = confusion_matrix(y_test, y_pred)
    #print(c_matrix)
    print('Classification Report Of Random Forest:\n', classification_report(y_test, y_pred))
    #print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    # accuracy_score(y_test, y_pred) and classifier.score(X_test, y_test) are both
    # test-set accuracy, so the "Model" and "Testing" figures are the same number.
    print('\n Model Accuracy for Random Forest: \n \t', accuracy_score(y_test, y_pred)*100)
    print("\n Training Accuracy: \n \t", classifier.score(X_train, y_train)*100)
    print("\n Testing Accuracy: \n \t", classifier.score(X_test, y_test)*100)
  
    # NAIVE BAYES
    # Re-split with a 30% test share (the random forest above used 25%), so the
    # two models are scored on different test sets. X_train, X_test, y_train,
    # y_test, sc and classifier are rebound from this point on.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
 
    #print('\nTotal number of examples are: ', len(X))
    #print('\nOut of these, training examples are: ', len(X_train))
    #print("\nTest examples are: ", len(X_test))
 
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    #y_train = y_train.reshape(-1,1)
    y_pred = classifier.predict(X_test)
    
    # As above: the matrix is computed but not printed.
    c_matrix = confusion_matrix(y_test, y_pred)
    #print(c_matrix)
    print('\n Classification Report Of Naive Bayes:\n', classification_report(y_test, y_pred))
    #print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print('\n Model Accuracy for Naive Bayes:\n \t', accuracy_score(y_test, y_pred)*100)
    print("\n Training Accuracy:\n \t", classifier.score(X_train, y_train)*100)
    print("\n Testing Accuracy:\n \t", classifier.score(X_test, y_test)*100)

Classification Report Of Random Forest:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97      1139
           1       0.05      0.02      0.02        62

    accuracy                           0.93      1201
   macro avg       0.50      0.50      0.49      1201
weighted avg       0.90      0.93      0.92      1201


 Model Accuracy for Random Forest: 
 	 93.255620316403

 Training Accuracy: 
 	 93.75347029428096

 Testing Accuracy: 
 	 93.255620316403

 Classification Report Of Naive Bayes:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1363
           1       0.00      0.00      0.00        78

    accuracy                           0.95      1441
   macro avg       0.47      0.50      0.49      1441
weighted avg       0.89      0.95      0.92      1441


 Model Accuracy for Naive Bayes:
 	 94.58709229701596

 Training Accuracy:
 	 93.48602022605593

 Testing Accuracy:
 	 94.58

  _warn_prf(average, modifier, msg_start, len(result))
