In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_location = "/content/drive/MyDrive/Copy of combinedNetflixData.txt"

In [None]:
df = pd.read_csv(data_location, names = ["CustID", "Ratings"], usecols = [0, 1], )

In [None]:
df.head()

Unnamed: 0,CustID,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [None]:
print(f"Rows: {df.shape[0]} and Columns: {df.shape[1]}")

Rows: 24058263 and Columns: 2


**How many movies we are dealing with in total?**

In [None]:
total_count_of_movies = df.isnull().sum()["Ratings"]
print("Total_number_of_movie are:",total_count_of_movies)

Total_number_of_movie are: 4499


In [None]:
df_copy = df.copy()

In [None]:
curr_movie = None

movie_ids=[]

**Separating movie id from Cust_ID**

In [None]:
#Loop for iterating over CustID column, to replace ":" with numbers
for cust_id in df_copy["CustID"]:
  # We will put the condition for checking the "x:"
  if ":" in cust_id:
    # We will try to replace : with nothing so that we can get a particular integer values
    curr_movie = int(cust_id.replace(":", ""))
  # We will take this movie and map with the data
  movie_ids.append(curr_movie)

# This lines will help us mapping data to ratings
df_copy["MovieID"] = movie_ids

# Removal of the unneccsary data
df_copy = df_copy[df_copy["Ratings"].notna()]

In [None]:
df_copy.tail()

Unnamed: 0,CustID,Ratings,MovieID
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499
24058262,1704416,3.0,4499


In [None]:
df_copy.shape

(24053764, 3)

Division of Rating

In [None]:
df_copy["Ratings"].value_counts()

Unnamed: 0_level_0,count
Ratings,Unnamed: 1_level_1
4.0,8085741
3.0,6904181
5.0,5506583
2.0,2439073
1.0,1118186


# **Preparation for Collabrative Filtering**

* **We cannot remove duplicates for any column since the requirement is completely based on the ratings of the users given, hence we will go ahead with the benchmarks**

* **Here there is a need of two benchmarks in the dataset**
  * **Customers who are not frequently giving rating (non active users or maybe fake or dummy users). These are users that watch movies but don't give ratings, so we can remove them**
  * **Those movies that has less ratings are possibly not much popular so will not recommend and remove them from the list**

In [None]:
df=df_copy

In [None]:
df.head()

Unnamed: 0,CustID,Ratings,MovieID
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [None]:
Movie_list=df.groupby("MovieID")["Ratings"].agg(["count"]).reset_index()

In [None]:
Movie_list

Unnamed: 0,MovieID,count
0,1,547
1,2,145
2,3,2012
3,4,142
4,5,1140
...,...,...
4494,4495,614
4495,4496,9519
4496,4497,714
4497,4498,269


**Threshold for setting the movie as good movie or to be considered**

In [None]:
Movie_list["count"].quantile(0.7)

1798.6

**Benchmark for movies**

In [None]:
benchmark_movie=round(Movie_list["count"].quantile(0.7),0)
benchmark_movie

1799.0

In [None]:
drop_Movie_list=Movie_list[Movie_list["count"]<benchmark_movie].index

In [None]:
drop_Movie_list

Index([   0,    1,    3,    4,    5,    6,    8,    9,   10,   11,
       ...
       4483, 4485, 4486, 4488, 4490, 4493, 4494, 4496, 4497, 4498],
      dtype='int64', length=3149)

In [None]:
Cust_list=df.groupby("CustID")["Ratings"].agg(["count"]).reset_index()

**BenchMark for Customers**

In [None]:
benchmark_cust=round(Cust_list["count"].quantile(0.7),0)
benchmark_cust

52.0

In [None]:
drop_cust_list=Cust_list[Cust_list["count"]<benchmark_cust].index

In [None]:
drop_cust_list

Index([     0,      1,      2,      4,      5,      6,      7,      8,     10,
           11,
       ...
       470747, 470748, 470749, 470751, 470752, 470753, 470754, 470755, 470756,
       470757],
      dtype='int64', length=327300)

In [None]:
len(drop_cust_list)

327300

# **Removal of data**

* **To remove the movies and customer from the existing data using the drop_list that we have created**

In [None]:
df = df[~df["MovieID"].isin(drop_Movie_list)] #Data - > isin(check) -> True -> False (remove)


df = df[~df["CustID"].isin(drop_cust_list)]

In [None]:
print(f"Rows: {df.shape[0]} and Columns: {df.shape[1]}")

Rows: 7017289 and Columns: 3


# **Working with recommendation**

In [None]:
movies_df = pd.read_csv("/content/drive/MyDrive/NetflixMovieData.csv", names = ["MovieID", "Year", "Name"], usecols = [0, 1, 2], header = None)

In [None]:
movies_df.head()

Unnamed: 0,MovieID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


# **Recommendation System with SVD**

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357262 sha256=7c37c92976d3e7808f4e670d3943b291bd7afc8bea937caeccf54e3a2785f3be
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

**Import some elements from SVD**

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [None]:
reader = Reader()

In [None]:
data = Dataset.load_from_df(df[["CustID", "MovieID", "Ratings"]][:100000], reader)

### **Model Building**


In [None]:
model = SVD()

In [None]:
cross_validate(model, data, measures = ["RMSE"], cv = 4)

{'test_rmse': array([1.02454296, 1.02289072, 1.02338211, 1.02280449]),
 'fit_time': (2.309157609939575,
  2.1315159797668457,
  1.7669403553009033,
  1.7993714809417725),
 'test_time': (0.2562386989593506,
  0.13940954208374023,
  0.1564028263092041,
  0.13360214233398438)}

# **Creating filter for recommendation**

In [None]:
data_1331154 = df[(df["CustID"] == "1331154") & (df["Ratings"] == 5.0)]

In [None]:
data_1331154

Unnamed: 0,CustID,Ratings,MovieID
1991774,1331154,5.0,361
2600328,1331154,5.0,482
3417458,1331154,5.0,658
5646194,1331154,5.0,1144
10165725,1331154,5.0,1974
10919877,1331154,5.0,2128
14525287,1331154,5.0,2795
19992284,1331154,5.0,3825
20774457,1331154,5.0,3925


In [None]:
movies_df

Unnamed: 0,MovieID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17764,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17765,17767,2004.0,Fidel Castro: American Experience
17766,17768,2000.0,Epoch
17767,17769,2003.0,The Company


In [None]:
df.head()

Unnamed: 0,CustID,Ratings,MovieID
549,2059652,4.0,2
550,1666394,3.0,2
551,1759415,4.0,2
552,1959936,5.0,2
553,998862,4.0,2


In [None]:
list_of_infinite_possibility = movies_df.copy()

In [None]:
list_of_infinite_possibility.reset_index(inplace = True)

In [None]:
list_of_infinite_possibility = list_of_infinite_possibility[~list_of_infinite_possibility["MovieID"].isin(drop_Movie_list)]

In [None]:
list_of_infinite_possibility["Estimate Score"] = list_of_infinite_possibility["MovieID"].apply(lambda x : model.predict("44937", x).est) #predict(cust_id, movie_id)

In [None]:
list_of_infinite_possibility

Unnamed: 0,index,MovieID,Year,Name,Estimate Score
1,1,2,2004.0,Isle of Man TT 2004 Review,3.559378
6,6,7,1992.0,8 Man,2.368120
14,14,15,1988.0,Neil Diamond: Greatest Hits Live,3.316117
15,15,16,1996.0,Screamers,3.227942
16,16,17,2005.0,7 Seconds,2.949668
...,...,...,...,...,...
17764,17764,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.395387
17765,17765,17767,2004.0,Fidel Castro: American Experience,3.395387
17766,17766,17768,2000.0,Epoch,3.395387
17767,17767,17769,2003.0,The Company,3.395387


In [None]:
list_of_infinite_possibility = list_of_infinite_possibility.sort_values('Estimate Score', ascending=False)

**Top 10 recommendation movie to user**

In [None]:
list_of_infinite_possibility.head(10)

Unnamed: 0,index,MovieID,Year,Name,Estimate Score
31,31,32,2004.0,ABC Primetime: Mel Gibson's The Passion of the...,4.113013
75,75,76,1952.0,I Love Lucy: Season 2,4.101488
24,24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,3.956402
45,45,46,1964.0,Rudolph the Red-Nosed Reindeer,3.933148
164,164,165,1982.0,Richard Pryor: Live on the Sunset Strip,3.893136
82,82,83,1983.0,Silkwood,3.828089
43,43,44,1996.0,Spitfire Grill,3.70679
102,102,103,1976.0,Sanford and Son: Season 6,3.697524
28,28,29,2001.0,Boycott,3.687981
120,120,121,2003.0,Beyonce: Live at Wembley,3.687579
