### Importing all the important Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from dotenv import dotenv_values
from sqlalchemy import create_engine
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise
from sklearn.metrics import accuracy_score
import statsmodels.formula.api as sm
from scipy.sparse import csr_matrix
import os
import streamlit as sl

### Reading different datasets 

### Filtering Movie titles

### Reading sorted Data

In [None]:
n= pd.read_csv("../data/movie_sorted.csv")

In [None]:
n.tail()

### Reading the rating files for movieid 1 and cleaning the file 

In [None]:
df = pd.read_csv("../data/mv_0000001.txt", names=["customerid","rating","date"] )

In [None]:
df.head()

### checking the first cleaned rating file 

In [None]:
df= pd.read_csv("../data/movieid1_rating.csv")

In [None]:
df.head()

### Creating a function to read and clean all the rating files

### Creating and checking a test version of concatinated rating files

In [None]:
df_s= pd.read_csv("../data/ratings_small.csv", names=["customerid","rating","date", "movieid"])

In [None]:
df_s["date"] = pd.to_datetime(df_s["date"], format = "%Y-%m-%d")

In [None]:
df_s.info()

### Creating the large file containing all the ratings

### Reading and transforming the Qualify  file to add the Machine Learing results

### Reading the cleaned qualify file 

In [None]:
qualifier= pd.read_csv("../data/qualify.csv")

In [None]:
qualifier

### Cleaning the probe file to compare the results of qualify dataset created with help of machine learning Algorithms

### Reading the cleaned probe file

In [None]:
probability = pd.read_csv("../data/probe.csv")

In [None]:
probability

## NEXT TASKS

#### Filter the data to create a better machine learning program

In [None]:
small= pd.read_csv("../data/ratings_small.csv" )
small.head()

In [3]:
movies = pd.read_csv("../data/movie_sorted.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  17770 non-null  int64 
 1   year     17770 non-null  int64 
 2   title    17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB


In [4]:
ratings = pd.read_csv("../../thymestamps-working-folder/data/filtered-ratings.csv")
ratings.head()

Unnamed: 0,customerid,rating,movieid,title
0,2407893,4.0,6974,The Usual Suspects
1,1673319,5.0,6974,The Usual Suspects
2,602,5.0,6974,The Usual Suspects
3,2604811,4.0,6974,The Usual Suspects
4,1329723,5.0,6974,The Usual Suspects


In [None]:
ratings.info()

In [5]:
ratings["movieid"].value_counts()

1905     120787
11283    113216
15124    113000
15107    109851
5317     108621
          ...  
14021      1283
17096      1234
761        1230
9044       1219
11819      1165
Name: movieid, Length: 5264, dtype: int64

In [6]:
ratings['customerid'].nunique(), ratings['movieid'].nunique(), ratings['movieid'].max()

(150245, 5264, 17769)

In [7]:
user = csr_matrix((ratings['rating'], (ratings['movieid'], ratings['customerid'])))

#### like pivot_table but much more memory efficient

In [8]:
user.shape

(17770, 2649430)

In [9]:
# customerid vector
user[1,:].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# movieid vector
user[:,1].todense().shape

(17770, 1)

In [11]:
# collect the movie ids for later

# for calculating recommendations
liked_items = [12748,4979, 12785, 12918,13031,13313,16265,14240,14214]
# for testing the recommender after getting some recommendations
relevant_items = []

### Deciding a Machine learning technique
#### Neighborhood-based Collaborative Filtering( Nearest Neighbors)
#### Neighborhood Components Analysis
#### nearest neighbors classifier

In [12]:
# which metrics can we use
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [13]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')


In [14]:
# fit it to the user-item matrix
model.fit(user)

NearestNeighbors(metric='cosine')

In [15]:
# top-10 most similar users for the user with the id 1 (id1 is included in the neighborhood!)
model.kneighbors(user[1,:], n_neighbors=20)

(array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.]]),
 array([[11852, 11866, 11836, 11851, 11850, 11849, 11848, 11847, 11846,
         11845, 11844, 11843, 11842, 11841, 11840, 11839, 11838, 11853,
         11865, 11864]], dtype=int64))

In [16]:
user[1,:].shape

(1, 2649430)

In [17]:
# new user vector
user_vec = np.repeat(0, 2649430)
user_vec.shape

# fill in some ratings
user_vec[liked_items] = 5
user_vec.shape

(2649430,)

In [18]:
# find the neighborhood
distances, user_ids = model.kneighbors([user_vec], n_neighbors=10)

In [19]:
user_ids

array([[11845, 11843, 11850, 11849, 11848, 11847, 11846, 11852, 11844,
        11840]], dtype=int64)

In [20]:
# find the ratings for the neighbors
neighborhood = ratings.set_index('movieid').loc[user_ids[0]]
neighborhood

KeyError: '[11845, 11849, 11847, 11852] not in index'

In [None]:
recommendations = neighborhood.groupby('movieid')['rating'].sum().sort_values(ascending=False)
recommendations

In [None]:
liked_items

In [None]:
# filter out movies that the user allready watched
item_filter = ~recommendations.index.isin(liked_items)
recommendations = recommendations.loc[item_filter]

In [None]:
# top 10 recommendations
movies.loc[recommendations.head(10).index]

In [None]:
recommended_movie_ids = recommendations.head(10).index
recommended_movie_ids

In [None]:
relevant_items

#### Test the algorithm

#### maybe create some visualisations to check the data distribution 

#### Add ratings in Qualify dataset and compare with probe to check the accuracy

#### Create a Predictor system which takes 3 inputs from user and provides list of related movies 

### Create a GUI using Streamlit