In [1]:
#loading necessary libraries and changing some settings------------------------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from random import sample

# Render floats with five decimal places whenever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
df = pd.read_csv('imdb_top_1000.csv', header=0)

#Assuring that features are the right dtypes and dropping NAs------------------------------------------------------------------------------------------------
# Entries that cannot be parsed as numbers become NaN and are removed by the dropna below.
# Working on the columns directly (instead of rebuilding them from a list) keeps the row index aligned.
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce').astype(float)
# Runtime: only the first whitespace-separated token is kept and parsed.
df['Runtime'] = pd.to_numeric(df['Runtime'].str.split().str[0], errors='coerce').astype(float)
# Gross: values that are already floats (including NaN) pass through; strings lose their thousands separators.
df['Gross'] = df['Gross'].map(lambda x: x if isinstance(x, float) else float(x.replace(',', '')))
df = df.dropna(axis=0)

#Standardization and Adding Other Features--------------------------------------------------------------------------------------------------------------------
st = ['Released_Year', 'Runtime', 'No_of_Votes', 'Gross', 'IMDB_Rating']
scaler = StandardScaler().fit(df[st])
# One-hot encode the comma-separated genre string and keep only the genres used further down.
dfGenre = df['Genre'].str.replace(',','|').str.replace(' ','').str.get_dummies()[['Action', 'Adventure', 'Horror', 'Romance', 'Sci-Fi', 'Drama', 'Comedy', 'Family', 'Animation']]
dfRest = df[['Poster_Link', 'Series_Title']]
dfFinal = pd.merge(dfRest, dfGenre, left_index=True, right_index=True)
dfFinal[st] = scaler.transform(df[st])
# reset_index returns a new frame; the result has to be assigned, otherwise the
# gaps left by dropna stay in the index (and end up in the CSV written below).
dfFinal = dfFinal.reset_index(drop=True)

dfFinal.to_csv('dfFinal.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 0 to 997
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    713 non-null    object 
 1   Series_Title   713 non-null    object 
 2   Released_Year  713 non-null    float64
 3   Certificate    713 non-null    object 
 4   Runtime        713 non-null    float64
 5   Genre          713 non-null    object 
 6   IMDB_Rating    713 non-null    float64
 7   Overview       713 non-null    object 
 8   Meta_score     713 non-null    float64
 9   Director       713 non-null    object 
 10  Star1          713 non-null    object 
 11  Star2          713 non-null    object 
 12  Star3          713 non-null    object 
 13  Star4          713 non-null    object 
 14  No_of_Votes    713 non-null    int64  
 15  Gross          713 non-null    float64
dtypes: float64(5), int64(1), object(10)
memory usage: 110.9+ KB


In [3]:
#Exemplary Search Algorithm for the perfect Match------------------------------------------------------------------------------------------------------------
# Genres that take part in the pairwise elimination, in the order they are paired up.
genres = [
    "Action", "Romance",
    "Comedy", "Horror",
    "Drama", "Adventure",
    "Family", "Sci-Fi",
]

#Searches for the pair with the highest heterogeneity in a specific feature while maintaining the highest possible homogeneity across the other features.
#We do this to get as much information as possible out of each question. For example, the algorithm will pick two movies that are very similar in genre, gross, No_of_Votes etc. but drastically different
#in Released_Year. If the customer then chooses the more modern one, we can assume they are in the mood for a newer movie.

def questions(df, column):
    """Pick the two films that differ most in `column` while being similar otherwise.

    Every pair of rows is scored as
    ``cosine_similarity(other features) * |difference in column|`` and the
    first pair (row-major order) with the maximum score is selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Series_Title' and `column`. Every column other than
        'Poster_Link', 'Series_Title' and `column` is fed to
        `cosine_similarity`, so those columns have to be numeric.
    column : str
        Feature the two films should differ in.

    Returns
    -------
    tuple
        ``((row, position), (row, position))`` where `row` is a Series holding
        'Series_Title' and `column`, and `position` is the row's positional
        index in `df`. The film with the larger `column` value comes first.
    """
    features = df.loc[:, ~df.columns.isin(['Poster_Link', 'Series_Title', column])]
    css = np.round(cosine_similarity(features), 15)
    # Negative similarities are treated as "not similar at all". A (rounded)
    # similarity of exactly 1 is zeroed as well, which removes each film's
    # pairing with itself and any pair with identical feature direction.
    css = np.where((css < 0) | (css == 1), 0, css)

    # Pairwise absolute differences via broadcasting instead of a nested
    # Python loop over all n*n pairs.
    values = df[column].to_numpy()
    difference = np.abs(values[:, None] - values[None, :])

    value = css * difference
    first, second = np.argwhere(value == np.max(value))[0]

    # Report the film with the larger value first.
    if not values[first] > values[second]:
        first, second = second, first

    def describe(position):
        return df.iloc[position][['Series_Title', column]], position

    return describe(first), describe(second)

#Gets Films that represent the genres (we tried going for only the most popular ones, but it actually worsens the recommendations)
def get_representative_film_from_genre(df, genre, top_n=6):
    """Return the title of a randomly chosen film that represents `genre`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'Series_Title' column and a 0/1 indicator column named `genre`.
    genre : str
        Name of the genre indicator column.
    top_n : int, optional
        Only the first `top_n` films of the genre (in `df` row order) are
        candidates. Defaults to 6.

    Returns
    -------
    str
        One title drawn at random from the candidates.

    Raises
    ------
    ValueError
        If no film in `df` is flagged with `genre`.
    """
    candidates = df.loc[df[genre] == 1, "Series_Title"].iloc[:top_n].tolist()
    if not candidates:
        raise ValueError(f"no films flagged with genre {genre!r}")
    return sample(candidates, 1)[0]

#Compares the different genres iteratively, until we can narrow the dataset down to just one category
def genre_ranking(arr):
    """Run a knockout tournament between genres and return the winner.

    In every round neighbouring genres are paired up. For each pair the user
    is shown one representative film per genre (drawn from the module-level
    `dfFinal`) and answers "Y" to keep the first genre; any other answer keeps
    the second. With an odd number of genres the last one advances without a
    question. Rounds repeat until a single genre is left.

    Parameters
    ----------
    arr : sequence of str
        Genre column names. The sequence is not modified.

    Returns
    -------
    str
        The genre that survived all rounds. A single-element `arr` is
        returned as-is without asking anything.

    Raises
    ------
    IndexError
        If `arr` is empty.
    """
    if len(arr) == 0:
        raise IndexError("genre_ranking needs at least one genre")

    remaining = list(arr)
    while len(remaining) > 1:
        winners = []
        for i in range(0, len(remaining) - 1, 2):
            first, second = remaining[i], remaining[i + 1]
            prompt = f"{get_representative_film_from_genre(dfFinal, first)} > {get_representative_film_from_genre(dfFinal, second)}"
            winners.append(first if input(prompt) == "Y" else second)
        if len(remaining) % 2 == 1:
            # Unpaired genre gets a bye into the next round.
            winners.append(remaining[-1])
        remaining = winners
    return remaining[0]

#Lets the User decide between two films, then uses only the 50% of the data that were above/below the median (depending on the decision).
def decision(df, column, column_answer1, column_answer2):
    """Ask the user to choose between two films and keep the matching half of `df`.

    The prompt shows the first entry of each answer's row (the title, for the
    rows produced by `questions`). Answering "Y" keeps the rows whose `column`
    value is strictly above the median; any other answer keeps the rows
    strictly below it. Rows exactly equal to the median are dropped either way.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate films.
    column : str
        Column whose median splits the candidates.
    column_answer1, column_answer2 : tuple
        ``(row, position)`` pairs as returned by `questions`; `row` is a
        pandas Series.

    Returns
    -------
    pandas.DataFrame
        The selected subset of `df`.
    """
    # .iloc makes the positional lookup explicit; plain [0] on a
    # label-indexed Series is deprecated in pandas.
    first_title = column_answer1[0].iloc[0]
    second_title = column_answer2[0].iloc[0]
    answer = input(f"{first_title} > {second_title}")

    median = df[column].median()
    if answer == 'Y':
        return df[df[column] > median]
    return df[df[column] < median]

def program(df, genres):
    """Interactively narrow `df` down and return up to three recommendations.

    Steps: let the user pick a genre via `genre_ranking`, keep only films of
    that genre, then for 'Released_Year' and 'Gross' in turn ask one
    `questions`/`decision` round that halves the remaining films. The
    survivors are ranked by 'IMDB_Rating' (highest first) and the top three
    rows are returned.
    """
    genre = genre_ranking(genres)
    print(genre)
    pool = df[df[genre] == 1]

    # Note: an equivalent round on 'Animation' used to sit here and is disabled.
    for feature in ("Released_Year", "Gross"):
        higher, lower = questions(pool, feature)
        pool = decision(pool, feature, higher, lower)

    ranked = pool.sort_values(by='IMDB_Rating', ascending=False)
    return ranked.head(3)

# Run the interactive recommender on the prepared frame; it asks its questions via input().
program(dfFinal, genres)

Action


Unnamed: 0,Poster_Link,Series_Title,Action,Adventure,Horror,Romance,Sci-Fi,Drama,Comedy,Family,Animation,Released_Year,Runtime,No_of_Votes,Gross,IMDB_Rating
5,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,1,1,0,0,0,1,0,0,0,0.39083,2.9871,3.63535,2.60581,3.28462
10,https://m.media-amazon.com/images/M/MV5BN2EyZj...,The Lord of the Rings: The Fellowship of the Ring,1,1,0,0,0,1,0,0,0,0.28322,2.09839,3.68826,2.06369,2.94333
13,https://m.media-amazon.com/images/M/MV5BZGMxZT...,The Lord of the Rings: The Two Towers,1,1,0,0,0,1,0,0,0,0.33703,2.13703,3.19113,2.29869,2.60204
