### Importing all the important Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from dotenv import dotenv_values
from sqlalchemy import create_engine
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise
from sklearn.metrics import accuracy_score
import statsmodels.formula.api as sm
from scipy.sparse import csr_matrix
import os
import streamlit as sl

### Reading different datasets 

### Filtering Movie titles

### Reading sorted Data

In [None]:
n= pd.read_csv("../data/movie_sorted.csv")

In [None]:
n.tail()

### Reading the rating files for movieid 1 and cleaning the file 

In [None]:
df = pd.read_csv("../data/mv_0000001.txt", names=["customerid","rating","date"] )

In [None]:
df.head()

### checking the first cleaned rating file 

In [None]:
df= pd.read_csv("../data/movieid1_rating.csv")

In [None]:
df.head()

### Creating a function to read and clean all the rating files

### Creating and checking a test version of concatinated rating files

In [None]:
df_s= pd.read_csv("../data/ratings_small.csv", names=["customerid","rating","date", "movieid"])

In [None]:
df_s["date"] = pd.to_datetime(df_s["date"], format = "%Y-%m-%d")

In [None]:
df_s.info()

### Creating the large file containing all the ratings

### Reading and transforming the Qualify  file to add the Machine Learing results

### Reading the cleaned qualify file 

In [None]:
qualifier= pd.read_csv("../data/qualify.csv")

In [None]:
qualifier

### Cleaning the probe file to compare the results of qualify dataset created with help of machine learning Algorithms

### Reading the cleaned probe file

In [None]:
probability = pd.read_csv("../data/probe.csv")

In [None]:
probability

## NEXT TASKS

#### Filter the data to create a better machine learning program

In [None]:
small= pd.read_csv("../data/ratings_small.csv" )
small.head()

In [3]:
movies = pd.read_csv("../data/movie_sorted.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  17770 non-null  int64 
 1   year     17770 non-null  int64 
 2   title    17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB


In [4]:
ratings = pd.read_csv("../../thymestamps-working-folder/data/filtered-ratings.csv")
ratings.head()

Unnamed: 0,customerid,rating,movieid,title
0,2407893,4.0,6974,The Usual Suspects
1,1673319,5.0,6974,The Usual Suspects
2,602,5.0,6974,The Usual Suspects
3,2604811,4.0,6974,The Usual Suspects
4,1329723,5.0,6974,The Usual Suspects


In [None]:
ratings.info()

In [5]:
ratings["movieid"].value_counts()

1905     120787
11283    113216
15124    113000
15107    109851
5317     108621
          ...  
14021      1283
17096      1234
761        1230
9044       1219
11819      1165
Name: movieid, Length: 5264, dtype: int64

In [6]:
ratings['customerid'].nunique(), ratings['movieid'].nunique(), ratings['movieid'].max()

(150245, 5264, 17769)

In [21]:
user = csr_matrix((ratings['rating'], (ratings['customerid'], ratings['movieid'])))

#### like pivot_table but much more memory efficient

In [22]:
user.shape

(2649430, 17770)

In [23]:
# customerid vector
user[1,:].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [24]:
# movieid vector
user[:,1].todense().shape

(2649430, 1)

In [25]:
# collect the movie ids for later

# for calculating recommendations
liked_items = [12748,4979, 12785, 12918,13031,13313,16265,14240,14214]
# for testing the recommender after getting some recommendations
relevant_items = []

### Deciding a Machine learning technique
#### Neighborhood-based Collaborative Filtering( Nearest Neighbors)
#### Neighborhood Components Analysis
#### nearest neighbors classifier

In [26]:
# which metrics can we use
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [27]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')


In [28]:
# fit it to the user-item matrix
model.fit(user)

NearestNeighbors(metric='cosine')

In [29]:
# top-10 most similar users for the user with the id 1 (id1 is included in the neighborhood!)
model.kneighbors(user[1,:], n_neighbors=20)

(array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.]]),
 array([[1766282, 1766284, 1766285, 1766286, 1766287, 1766288, 1766289,
         1766290, 1766291, 1766292, 1766293, 1766294, 1766295, 1766296,
         1766297, 1766298, 1766299, 1766281, 1766283, 1766301]],
       dtype=int64))

In [30]:
user[1,:].shape

(1, 17770)

In [33]:
# new user vector
user_vec = np.repeat(0, 17770)
user_vec.shape

# fill in some ratings
user_vec[liked_items] = 5
user_vec.shape

(17770,)

In [34]:
# find the neighborhood
distances, user_ids = model.kneighbors([user_vec], n_neighbors=10)

In [35]:
user_ids

array([[2067162, 1962029,  801872, 1514115, 1131551, 2294599, 1584657,
         238987, 2329643,  462165]], dtype=int64)

In [37]:
# find the ratings for the neighbors
neighborhood = ratings.set_index('customerid').loc[user_ids[0]]
neighborhood

Unnamed: 0_level_0,rating,movieid,title
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2067162,5.0,7057,Lord of the Rings: The Two Towers: Extended Ed...
2067162,5.0,7145,Star Trek III: The Search for Spock
2067162,4.0,7193,The Princess Bride
2067162,5.0,7230,The Lord of the Rings: The Fellowship of the R...
2067162,2.0,7234,Men of Honor
...,...,...,...
462165,3.0,6859,Cheaper by the Dozen
462165,4.0,6908,Star Trek V: The Final Frontier
462165,3.0,6911,Little Shop of Horrors
462165,4.0,6971,Ferris Bueller's Day Off


In [38]:
recommendations = neighborhood.groupby('movieid')['rating'].sum().sort_values(ascending=False)
recommendations

movieid
4979     50.0
6655     50.0
14214    50.0
5582     49.0
9628     49.0
         ... 
9590      1.0
14940     1.0
290       1.0
17171     1.0
17387     1.0
Name: rating, Length: 1122, dtype: float64

In [39]:
liked_items

[12748, 4979, 12785, 12918, 13031, 13313, 16265, 14240, 14214]

In [40]:
# filter out movies that the user allready watched
item_filter = ~recommendations.index.isin(liked_items)
recommendations = recommendations.loc[item_filter]

In [41]:
# top 10 recommendations
movies.loc[recommendations.head(10).index]

Unnamed: 0_level_0,movieid,year,title
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6655,6656,2004,10.5
5582,5583,2005,The Marksman
9628,9629,2002,Stranded
7984,7985,1968,Star Trek: The Original Series: Vols. 29-40
138,139,2001,Allergies: A Natural Approach
209,210,2001,Onmyoji
15727,15728,2000,Dead Creatures
15699,15700,1988,The Land Before Time
10764,10765,2001,Rod Steele 0014: You Only Live Until You Die
16937,16938,1992,Inspector Morse 22: Happy Families


In [None]:
recommended_movie_ids = recommendations.head(10).index
recommended_movie_ids

In [None]:
relevant_items

#### Test the algorithm

#### maybe create some visualisations to check the data distribution 

#### Add ratings in Qualify dataset and compare with probe to check the accuracy

#### Create a Predictor system which takes 3 inputs from user and provides list of related movies 

### Create a GUI using Streamlit