# INFO

This is my solution for the third homework problem

# **SOLUTION**

# DATA READING

Let's start by uploading our dataset - remember to set the proper delimiter.

In [1]:
# Field separator passed to `read_file` for both input files.
delimiter = "::"

Also, set the names of the ratings and users files.

In [2]:
# Keys under which `read_file` looks the two files up in the upload result.
users_file_name = "users.dat"
ratings_file_name = "ratings.dat"

In [3]:
from google.colab import files
from io import StringIO

import pandas as pd
import numpy as np

Function to read and parse csv files

In [4]:
def read_file(uploaded_files, file_name, delimiter, columns, columns_names):
  """Parse one uploaded delimited text file into a DataFrame.

  Progress information and the first 10 parsed rows are printed.

  Args:
    uploaded_files: Mapping from file name to the raw file content (bytes).
    file_name: Key of the file to parse in `uploaded_files`.
    delimiter: Field separator; may be longer than one character.
    columns: Positions of the columns to keep.
    columns_names: Names assigned to the kept columns.

  Returns:
    DataFrame holding the selected columns; the file is read without a
    header row.

  Raises:
    KeyError: If `file_name` is not present in `uploaded_files`.
  """
  file_csv = uploaded_files[file_name]

  print('\nFile {name} uploaded ({length} bytes)! Extracting required data...\n'.format(
      name=file_name, length=len(file_csv)))

  # pandas treats a separator longer than one character as a regular
  # expression, which only the python engine supports. Selecting that engine
  # explicitly avoids the ParserWarning emitted by the silent fallback.
  engine = "python" if len(delimiter) > 1 else None

  file_csv_string = StringIO(file_csv.decode('utf-8'))
  file = pd.read_csv(file_csv_string,
                     sep=delimiter,
                     header=None,
                     usecols=columns,
                     names=columns_names,
                     engine=engine)

  print("Extracted {number_of_lines} lines!".format(
      number_of_lines=len(file)))
  print("First 10 lines of the file:\n ", file[:10])

  return file

Let's start with reading data

In [5]:
# Upload the input files through the google.colab helper.
uploaded_files = files.upload()

# Ratings: only the first three fields of every record are kept.
movie_rates = read_file(
    uploaded_files=uploaded_files,
    file_name=ratings_file_name,
    delimiter=delimiter,
    columns=[0, 1, 2],
    columns_names=["user_id", "movie_id", "rating"])

# Users: the first five fields of every record are kept.
users_info = read_file(
    uploaded_files=uploaded_files,
    file_name=users_file_name,
    delimiter=delimiter,
    columns=[0, 1, 2, 3, 4],
    columns_names=["user_id", "gender", "age", "occupation", "zip_code"])

Saving ratings.dat to ratings (2).dat
Saving users.dat to users (1).dat

File ratings.dat uploaded (24594131 bytes)! Extracting required data...



  if sys.path[0] == '':


Extracted 1000209 lines!
First 10 lines of the file:
     user_id  movie_id  rating
0        1      1193       5
1        1       661       3
2        1       914       3
3        1      3408       4
4        1      2355       5
5        1      1197       3
6        1      1287       5
7        1      2804       5
8        1       594       4
9        1       919       4

File users.dat uploaded (134368 bytes)! Extracting required data...

Extracted 6040 lines!
First 10 lines of the file:
     user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455
5        6      F   50           9    55117
6        7      M   35           1    06810
7        8      M   25          12    11413
8        9      M   25          17    61614
9       10      F   35           1    95370


  if sys.path[0] == '':


Now group ratings by age and gender

In [6]:
# Attach each user's attributes to every rating row (join on user_id).
joined_rates_with_user_info = movie_rates.join(
    other=users_info.set_index(keys="user_id"),
    on="user_id",
)
# One group of ratings per (gender, age) combination.
grouped_movie_ratings = joined_rates_with_user_info.groupby(by=["gender", "age"])

# ALGORITHM

*Generating itemsets*

Remember to set your `min_support` value (probably `0.2` is too big, but computing power limits it...)

In [28]:
# Smallest support (fraction of users, inclusive) a movie set needs to be kept.
min_support = 0.2

And minimal (inclusive) rating considered as high.

In [29]:
# Ratings greater than or equal to this value count as "high".
min_high_rating = 4

Function to filter movies with high ratings (rating >= `min_high_rating`)

In [30]:
def get_high_rated_movies(user_ratings):
  """Return the ids of the movies rated at least `min_high_rating`.

  Args:
    user_ratings: DataFrame with `rating` and `movie_id` columns.

  Returns:
    Set of the `movie_id` values whose rating reaches the threshold.
  """
  is_high = user_ratings["rating"] >= min_high_rating
  liked_ids = user_ratings.loc[is_high, "movie_id"]
  return set(liked_ids.tolist())

And function to get highly rated movies grouped by users

In [31]:
def get_users_high_rated_movies(ratings):
  """Collect, for every user, the set of movies that user rated highly.

  Args:
    ratings: DataFrame with `user_id`, `movie_id` and `rating` columns.

  Returns:
    DataFrame with one row per user and two columns: `user_id` and `movies`,
    where `movies` is the set produced by `get_high_rated_movies` for that
    user's ratings.
  """
  # Every user keeps a row, even when the resulting set is empty, so the
  # number of rows equals the number of users in `ratings`.
  movies_per_user = ratings.groupby("user_id", dropna=False) \
    .apply(get_high_rated_movies)

  # Turn the user_id index back into a column and name the value column.
  return movies_per_user.reset_index(name="movies")

Counts the users whose highly rated movies include every movie of the given set

In [32]:
def count_movies_set(users_movies, movies):
  """Count the users whose movie set contains every movie in `movies`.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    movies: Set (or frozenset) of movie ids to look for.

  Returns:
    Number of rows whose `movies` entry is a superset of `movies`; 0 when
    `users_movies` has no rows.
  """
  # Only the `movies` column is needed, so map over that column instead of
  # running a row-wise DataFrame.apply, which is far slower and does not
  # reduce to a scalar count on an empty frame.
  return users_movies["movies"].map(movies.issubset).sum()

In [33]:
def calculate_support(users_movies, movies, number_of_users):
  """Return the fraction of users whose movie set contains all of `movies`.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    movies: Set of movie ids whose support is measured.
    number_of_users: Denominator of the support ratio.

  Returns:
    Count of matching users divided by `number_of_users`.
  """
  matching_users = count_movies_set(users_movies, movies)
  return matching_users / number_of_users

Filters out movies with small support (support < `min_support`)

In [34]:
def filter_high_support_movies(users_movies, movies_set):
  """Keep only the movie sets whose support is at least `min_support`.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    movies_set: Iterable of candidate movie sets (frozensets).

  Returns:
    1-D numpy object array with the candidates that passed the threshold, in
    their original order; an empty array when there are no candidates or
    none passes.
  """
  number_of_users = len(users_movies)

  # Filter the candidates directly: no intermediate DataFrame is required,
  # and an empty candidate collection simply yields an empty result.
  high_support_movies = [
      movies for movies in movies_set
      if calculate_support(users_movies, movies, number_of_users) >= min_support
  ]

  # dtype=object stores one frozenset per element, also for an empty result.
  return np.array(high_support_movies, dtype=object)

Returns every movie highly rated by at least one user, each wrapped in a single-element `frozenset` (support is not checked here)

In [35]:
def get_all_movies(users_movies):
  """Return every movie found in any user's set as a one-element frozenset.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.

  Returns:
    1-D numpy object array of single-element frozensets, one per distinct
    movie id; an empty array when no user has any movie.
  """
  all_movies = set()
  for user_movies in users_movies["movies"]:
    all_movies.update(user_movies)

  # Built explicitly instead of through np.vectorize, which raises a
  # ValueError for size-0 input (i.e. when there is no movie at all).
  return np.array(
      [frozenset({int(movie)}) for movie in all_movies], dtype=object)

Generates every distinct union of two sets from the given collection that has exactly the given length

In [36]:
def generate_set_unions(movies_set, length):
  """Build the distinct pairwise unions that contain exactly `length` items.

  Args:
    movies_set: Re-iterable collection of frozensets.
    length: Required size of a union for it to be kept.

  Returns:
    1-D numpy object array of distinct frozensets (in no particular order);
    empty when no pair qualifies.
  """
  items = list(movies_set)
  unions = set()

  # Union is symmetric, so each unordered pair is visited once (a set is
  # still paired with itself, as before) and each union is computed once.
  for position, first in enumerate(items):
    for second in items[position:]:
      union = first.union(second)
      if len(union) == length:
        unions.add(union)

  return np.array(list(unions), dtype=object)

Generates all possible movie sets with high support

In [37]:
def generate_all_movies_sets(users_movies):
  """Collect the high-support movie sets of every size, level by level.

  Starts from the single-movie sets and repeatedly unions the surviving sets
  into candidates one item larger, until a level has no survivor.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.

  Returns:
    numpy array with all surviving sets, smallest sets first.
  """
  frequent = filter_high_support_movies(
      users_movies, get_all_movies(users_movies))
  collected = np.array([])
  size = 1

  while len(frequent) > 0:
    collected = np.append(collected, frequent)
    # Candidates for the next level are one item larger than this level.
    size += 1
    candidates = generate_set_unions(frequent, size)
    frequent = filter_high_support_movies(users_movies, candidates)

  return collected

*Generating rules*

Remember to set your `min_confidence` value

In [38]:
# Smallest confidence (inclusive) a rule needs to be generated.
min_confidence = 0.1

Returns all non-empty proper subsets of the given set, i.e. the powerset without the empty set and the set itself (modified: https://docs.python.org/3/library/itertools.html#itertools-recipes)

In [40]:
from itertools import chain, combinations

def powerset(base_set):
  """Return every non-empty proper subset of `base_set`.

  The empty set and `base_set` itself are left out, so a set with fewer than
  two elements yields no subsets.

  Args:
    base_set: Sized iterable of elements.

  Returns:
    1-D numpy object array in which each element is one subset, as a tuple.
  """
  subsets = list(chain.from_iterable(
      combinations(base_set, r) for r in range(1, len(base_set))))

  # The tuples have different lengths, so np.array(subsets) would be a ragged
  # array (deprecated, and a ValueError on NumPy >= 1.24). Filling an object
  # array element by element keeps each tuple intact.
  result = np.empty(len(subsets), dtype=object)
  for position, subset in enumerate(subsets):
    result[position] = subset

  return result

Generates all rules for single movie set with high confidence (confidence >= `min_confidence`)

In [41]:
def generate_rules_for_one_set(users_movies, movies_set):
  """Derive the rules `antecedent -> rest` from one movie set.

  A rule is kept when (users having the whole set) / (users having the
  antecedent) is at least `min_confidence`.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    movies_set: frozenset of movie ids the rules are derived from.

  Returns:
    List of `(antecedent, consequent)` frozenset pairs.
  """
  whole_set_count = count_movies_set(users_movies, movies_set)
  antecedents = [frozenset(candidate) for candidate in powerset(movies_set)]

  return [
      (antecedent, movies_set.difference(antecedent))
      for antecedent in antecedents
      if whole_set_count / count_movies_set(users_movies, antecedent)
      >= min_confidence
  ]

Generates all rules for all movie sets

In [42]:
def generate_rules(users_movies, all_movies_sets):
  """Concatenate the rules derived from every movie set.

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    all_movies_sets: Iterable of movie sets to derive rules from.

  Returns:
    Flat list of the rules produced by `generate_rules_for_one_set` for each
    set, in the order the sets are given.
  """
  return [
      rule
      for movies_set in all_movies_sets
      for rule in generate_rules_for_one_set(users_movies, movies_set)
  ]

*Lift*

Remember to set your `min_lift` value

In [43]:
# Rules are kept only when their lift is strictly greater than this value.
min_lift = 1

Filters out rules with lift <= `min_lift` (only rules with lift strictly greater than `min_lift` are kept)

In [44]:
def filter_rules_with_high_lift(users_movies, all_rules):
  """Keep the rules whose lift is strictly greater than `min_lift`.

  The lift of a rule `x -> y` is support(x | y) / (support(x) * support(y)).

  Args:
    users_movies: DataFrame with a `movies` column holding one set per user.
    all_rules: Iterable of `(x, y)` frozenset pairs.

  Returns:
    List of the `(x, y)` pairs that passed, in their original order.
  """
  total_users = len(users_movies)

  def support_of(movies):
    return calculate_support(users_movies, movies, total_users)

  def lift_of(antecedent, consequent):
    antecedent_support = support_of(antecedent)
    consequent_support = support_of(consequent)
    joint_support = support_of(antecedent.union(consequent))
    return joint_support / (antecedent_support * consequent_support)

  return [
      (antecedent, consequent)
      for antecedent, consequent in all_rules
      if lift_of(antecedent, consequent) > min_lift
  ]

*Apriori*

Main function, returns rules generated with apriori algorithm

In [45]:
def apriori(ratings):
  """Run the whole pipeline on one ratings table and return its rules.

  Steps: per-user sets of highly rated movies, high-support movie sets,
  rule generation, then the lift filter.

  Args:
    ratings: DataFrame with `user_id`, `movie_id` and `rating` columns.

  Returns:
    List of `(antecedent, consequent)` frozenset pairs.
  """
  liked_by_user = get_users_high_rated_movies(ratings)
  frequent_sets = generate_all_movies_sets(liked_by_user)
  candidate_rules = generate_rules(liked_by_user, frequent_sets)
  return filter_rules_with_high_lift(liked_by_user, candidate_rules)

*All groups* 

Printing rules for extracted groups (gender, age)

In [46]:
print("PRINTING RULES:")
print("==========================")

# Each item of the grouped ratings is a (group key, ratings of that group) pair.
for group_key, group_ratings in grouped_movie_ratings:
  print("Group: ", group_key)
  print("Rules: ", apriori(group_ratings))
  print("-----------------------------")

PRINTING RULES:
Group:  ('F', 1)


  after removing the cwd from sys.path.


Rules:  [(frozenset({1}), frozenset({2355})), (frozenset({2355}), frozenset({1})), (frozenset({588}), frozenset({919})), (frozenset({919}), frozenset({588})), (frozenset({1}), frozenset({588})), (frozenset({588}), frozenset({1})), (frozenset({3114}), frozenset({2355})), (frozenset({2355}), frozenset({3114})), (frozenset({1}), frozenset({3114})), (frozenset({3114}), frozenset({1})), (frozenset({3114}), frozenset({588})), (frozenset({588}), frozenset({3114})), (frozenset({595}), frozenset({588})), (frozenset({588}), frozenset({595})), (frozenset({1}), frozenset({3114, 2355})), (frozenset({3114}), frozenset({1, 2355})), (frozenset({2355}), frozenset({1, 3114})), (frozenset({1, 3114}), frozenset({2355})), (frozenset({1, 2355}), frozenset({3114})), (frozenset({3114, 2355}), frozenset({1}))]
-----------------------------
Group:  ('F', 18)
Rules:  [(frozenset({608}), frozenset({318})), (frozenset({318}), frozenset({608})), (frozenset({1}), frozenset({2858})), (frozenset({2858}), frozenset({1}

  after removing the cwd from sys.path.


Rules:  [(frozenset({608}), frozenset({318})), (frozenset({318}), frozenset({608})), (frozenset({608}), frozenset({2762})), (frozenset({2762}), frozenset({608})), (frozenset({260}), frozenset({2396})), (frozenset({2396}), frozenset({260})), (frozenset({593}), frozenset({2396})), (frozenset({2396}), frozenset({593})), (frozenset({1307}), frozenset({1197})), (frozenset({1197}), frozenset({1307})), (frozenset({260}), frozenset({1198})), (frozenset({1198}), frozenset({260})), (frozenset({1617}), frozenset({2858})), (frozenset({2858}), frozenset({1617})), (frozenset({2858}), frozenset({1198})), (frozenset({1198}), frozenset({2858})), (frozenset({318}), frozenset({527})), (frozenset({527}), frozenset({318})), (frozenset({593}), frozenset({1197})), (frozenset({1197}), frozenset({593})), (frozenset({2858}), frozenset({2396})), (frozenset({2396}), frozenset({2858})), (frozenset({2762}), frozenset({1198})), (frozenset({1198}), frozenset({2762})), (frozenset({1210}), frozenset({260})), (frozenset

  after removing the cwd from sys.path.


Rules:  [(frozenset({2571}), frozenset({1580})), (frozenset({1580}), frozenset({2571})), (frozenset({480}), frozenset({2028})), (frozenset({2028}), frozenset({480})), (frozenset({2571}), frozenset({1196})), (frozenset({1196}), frozenset({2571})), (frozenset({1210}), frozenset({1580})), (frozenset({1580}), frozenset({1210})), (frozenset({2571}), frozenset({780})), (frozenset({780}), frozenset({2571})), (frozenset({1196}), frozenset({260})), (frozenset({260}), frozenset({1196})), (frozenset({1}), frozenset({1196})), (frozenset({1196}), frozenset({1})), (frozenset({1210}), frozenset({2628})), (frozenset({2628}), frozenset({1210})), (frozenset({1210}), frozenset({1196})), (frozenset({1196}), frozenset({1210})), (frozenset({260}), frozenset({2628})), (frozenset({2628}), frozenset({260})), (frozenset({1210}), frozenset({260})), (frozenset({260}), frozenset({1210})), (frozenset({260}), frozenset({589})), (frozenset({589}), frozenset({260})), (frozenset({2571}), frozenset({2628})), (frozenset(

  after removing the cwd from sys.path.


Rules:  [(frozenset({1036}), frozenset({1198})), (frozenset({1198}), frozenset({1036})), (frozenset({2571}), frozenset({527})), (frozenset({527}), frozenset({2571})), (frozenset({1617}), frozenset({50})), (frozenset({50}), frozenset({1617})), (frozenset({1240}), frozenset({1198})), (frozenset({1198}), frozenset({1240})), (frozenset({1265}), frozenset({2571})), (frozenset({2571}), frozenset({1265})), (frozenset({2571}), frozenset({2916})), (frozenset({2916}), frozenset({2571})), (frozenset({457}), frozenset({110})), (frozenset({110}), frozenset({457})), (frozenset({296}), frozenset({1198})), (frozenset({1198}), frozenset({296})), (frozenset({480}), frozenset({2028})), (frozenset({2028}), frozenset({480})), (frozenset({2762}), frozenset({110})), (frozenset({110}), frozenset({2762})), (frozenset({2858}), frozenset({2700})), (frozenset({2700}), frozenset({2858})), (frozenset({608}), frozenset({318})), (frozenset({318}), frozenset({608})), (frozenset({1617}), frozenset({2028})), (frozenset(

  after removing the cwd from sys.path.


Rules:  [(frozenset({1036}), frozenset({1198})), (frozenset({1198}), frozenset({1036})), (frozenset({2571}), frozenset({527})), (frozenset({527}), frozenset({2571})), (frozenset({1617}), frozenset({50})), (frozenset({50}), frozenset({1617})), (frozenset({1240}), frozenset({1198})), (frozenset({1198}), frozenset({1240})), (frozenset({1265}), frozenset({2571})), (frozenset({2571}), frozenset({1265})), (frozenset({2571}), frozenset({2916})), (frozenset({2916}), frozenset({2571})), (frozenset({457}), frozenset({110})), (frozenset({110}), frozenset({457})), (frozenset({593}), frozenset({1214})), (frozenset({1214}), frozenset({593})), (frozenset({296}), frozenset({1198})), (frozenset({1198}), frozenset({296})), (frozenset({480}), frozenset({2028})), (frozenset({2028}), frozenset({480})), (frozenset({1265}), frozenset({589})), (frozenset({589}), frozenset({1265})), (frozenset({457}), frozenset({1610})), (frozenset({1610}), frozenset({457})), (frozenset({1036}), frozenset({1214})), (frozenset(

  after removing the cwd from sys.path.


Rules:  [(frozenset({1240}), frozenset({1198})), (frozenset({1198}), frozenset({1240})), (frozenset({1196}), frozenset({1270})), (frozenset({1270}), frozenset({1196})), (frozenset({1240}), frozenset({589})), (frozenset({589}), frozenset({1240})), (frozenset({858}), frozenset({1196})), (frozenset({1196}), frozenset({858})), (frozenset({589}), frozenset({1198})), (frozenset({1198}), frozenset({589})), (frozenset({1196}), frozenset({1580})), (frozenset({1580}), frozenset({1196})), (frozenset({260}), frozenset({2028})), (frozenset({2028}), frozenset({260})), (frozenset({1210}), frozenset({1198})), (frozenset({1198}), frozenset({1210})), (frozenset({1200}), frozenset({1240})), (frozenset({1240}), frozenset({1200})), (frozenset({260}), frozenset({1198})), (frozenset({1198}), frozenset({260})), (frozenset({2571}), frozenset({589})), (frozenset({589}), frozenset({2571})), (frozenset({858}), frozenset({1214})), (frozenset({1214}), frozenset({858})), (frozenset({2571}), frozenset({1214})), (froz