In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv("5 ratings_mutual_information_optimized.csv")
movies = pd.read_csv("5 movies_bicluster.csv", low_memory=False)

In [3]:
def bi_clustering_recommender(uid, ratings, movies):
    """
    Input:
    uid - ID of the user
    ratings - DataFrame that contains the ratings in a special structure
    movies - DataFrame that contains the movies in a special structure
    
    Output:
    suggested_movies - The most popular 'k' suggested movies
    
    Error codes:
    -1 - The user has no ratings
    """
    
    k = 20 # Number of suggested movies
    
    if sum(ratings['userId'] == uid) == 0:
        return -1
        
    # Joint distribution
    nmovieClusters = ratings['movieCluster'].nunique()
    nuserClusters = ratings['userCluster'].nunique()
    relationMatrix = np.zeros((nmovieClusters,nuserClusters))
    
    for i in range(nmovieClusters):
        for j in range (nuserClusters):
            relationMatrix[i,j] = ratings[(ratings['movieCluster'] == i) & 
                                      (ratings['userCluster'] == j)].shape[0] / ratings.shape[0]
    
    # Relation's of the given user's cluster
    relations = relationMatrix[:, ratings[ratings['userId'] == uid]['userCluster'].iloc[0] ]
    best_movie_cluster = np.argmax(relations)
    
    indexes_in_best_cluster = ratings[ratings['movieCluster'] == best_movie_cluster]['movieId'].unique()
    watched_movies_indexes = ratings[ratings['userId'] == uid]['movieId']
    
    suggested_movies = movies[['id', 'popularity', 'title']]
    suggested_movies = suggested_movies[suggested_movies['id'].isin(indexes_in_best_cluster)]
    suggested_movies = suggested_movies[suggested_movies['id'].isin(watched_movies_indexes) == False]
    suggested_movies = suggested_movies.sort_values('popularity', ascending=False)
    
    return suggested_movies.head(k)['title']

In [20]:
bi_clustering_recommender(1, ratings, movies)

78                                           Pulp Fiction
3089                                      The Dark Knight
146                                          Blade Runner
4785                       Dawn of the Planet of the Apes
824                                            Fight Club
4796                              Guardians of the Galaxy
86                               The Shawshank Redemption
95                                           Forrest Gump
1640    Pirates of the Caribbean: The Curse of the Bla...
67                                              Star Wars
140                                      Schindler's List
203                                         The Godfather
1454                                        Spirited Away
650                                     Life Is Beautiful
1298             Harry Potter and the Philosopher's Stone
5148                              Avengers: Age of Ultron
334                                The Godfather: Part II
716           

In [21]:
bi_clustering_recommender(897234, ratings, movies)

-1

# Coverage

In [38]:
suggested_movies = []
users = np.random.choice(ratings['userId'].unique(), size=100)
for i in users:
    suggested_movies.append(bi_clustering_recommender(i, ratings, movies))

In [39]:
list = []
for i in range(len(suggested_movies)):
    if type(suggested_movies[i]) == pd.core.series.Series:
        for j in suggested_movies[i].index:
            list.append(j)

In [40]:
len(np.unique(list))

24

In [42]:
ratings.movieId.nunique()

7082

# Personalization

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
suggested_movies = []
users = np.random.choice(ratings['userId'].unique(), size=100)
for i in users:
    tmp = bi_clustering_recommender(i, ratings, movies)
    if type(tmp) != int:
        suggested_movies.append(tmp)

In [6]:
list = []
for i in range(len(suggested_movies)):
    for j in suggested_movies[i]:
        list.append(j)
        
list = np.unique(list)

In [7]:
data = np.zeros(shape=(len(suggested_movies), len(list)))
seen_movies = pd.DataFrame(data=data, columns=list)

In [8]:
for i in range(len(suggested_movies)):
    for j in suggested_movies[i]:
        if j in seen_movies.columns:
            seen_movies.loc[i,j] = 1

In [9]:
matrix = pd.DataFrame(cosine_similarity(seen_movies))

In [10]:
matrix = 1-matrix

In [11]:
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,0.000000e+00,0.000000e+00,5.000000e-02,0.00
1,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,0.000000e+00,0.000000e+00,5.000000e-02,0.00
2,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,0.000000e+00,0.000000e+00,5.000000e-02,0.00
3,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,0.000000e+00,0.000000e+00,5.000000e-02,0.00
4,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,0.000000e+00,0.000000e+00,5.000000e-02,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,0.00
96,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,0.00
97,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-2.220446e-16,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,-2.220446e-16,-2.220446e-16,-2.220446e-16,5.000000e-02,0.00
98,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,...,5.000000e-02,5.000000e-02,5.000000e-02,5.000000e-02,-2.220446e-16,5.000000e-02,5.000000e-02,5.000000e-02,-2.220446e-16,0.05


In [12]:
personalization = []
for i in range(matrix.shape[0]):
    for j in range(matrix.shape[1]):
        if i < j:
            personalization.append(matrix.loc[i,j])

In [13]:
np.mean(personalization)

0.017747474747474512

# Intra-list similarity

In [24]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [25]:
movies = pd.read_csv('4 movies.csv')
country_codes = pd.read_csv('country_codes.csv')

In [26]:
def get_numbers(str):
    """Return the integer numbers from a string in an array format."""
    arr = str.replace(',','').replace('{','').replace('}','').replace('[','').replace(']','').split()
    arr = [int(s) for s in arr if s.isdigit()]
    return arr

def get_country_codes(str):
    """Return the codes of the countries from a string."""
    arr = str.replace(',','').replace('{','').replace('}','').replace('[','').replace(']','').replace("'", "").split()
    arr = [s for s in arr if country_codes['Code'].str.contains(s).any()]
    return arr

def prepare_dataframe(movies):
    
    # Conversion
    movies['genres'] = movies['genres'].apply(lambda str: get_numbers(str))
    movies['production_companies'] = movies['production_companies'].apply(lambda str: get_numbers(str))
    movies['production_countries'] = movies['production_countries'].apply(lambda str: get_country_codes(str)) # This takes time
    
    # Get dummies
    mlb = MultiLabelBinarizer()
    genres = pd.DataFrame(mlb.fit_transform(movies['genres']), columns=mlb.classes)

    mlb = MultiLabelBinarizer()
    production_companies = pd.DataFrame(mlb.fit_transform(movies['production_companies']), columns=mlb.classes)

    mlb = MultiLabelBinarizer()
    production_countries = pd.DataFrame(mlb.fit_transform(movies['production_countries']), columns=mlb.classes)

    original_language = pd.get_dummies(movies['original_language'])
    
    # Rename the columns to unique names
    for i in range(genres.shape[1]):
        genres.rename(mapper={i : str(i) + '. genre'}, axis=1, inplace=True)
    
    for i in range(production_countries.shape[1]):
        production_countries.rename(mapper={i : str(i) + '. country'}, axis=1, inplace=True)
    
    movieids = movies['id']
    movies = movies.drop(['id', 'title'], axis=1)
    
    # production_companies, original languages, production countries are not used, because of huge running time
    numeric = movies.drop(['adult', 'genres', 'original_language', 'production_companies', 'production_countries'], axis=1)
    categorical = movies[['adult']].join(genres)
    
    # Scale the numeric values to (0, 1) interval
    scaler = MinMaxScaler()
    numeric = scaler.fit_transform(numeric)
    
    movievectors = pd.DataFrame(numeric).join(categorical)
    movievectors.index = movieids
    
    return movievectors

def convert_title_to_index(x):
    row = unprepared_movies2[unprepared_movies2['title'] == x]
    return np.array(row['id'])[0]

In [27]:
movies = prepare_dataframe(movies)

In [29]:
movies.drop(movies.iloc[:, 0:58], axis=1, inplace=True)

In [31]:
# For all 100 suggestions
intra_list_100 = []
for i in suggested_movies:
    matrix = pd.DataFrame(cosine_similarity(movies[movies.index.isin(i.index)]))
    
    intra_list = []
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if i < j:
                intra_list.append(matrix.loc[i,j])
    
    intra_list_100.append(np.mean(intra_list))

In [32]:
np.mean(intra_list_100)

0.4249576538263387