In [173]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import apyori as ap

# Importing the dataset

In [174]:
# Load the transactions; header=None makes pandas treat the first line as data
# and number the columns instead of reading names from it.
movies = pd.read_csv("movie_dataset.csv", header=None)
# Row count of the loaded frame (one row per record).
nums_records = movies.shape[0]
print("Number of records: ", nums_records)


Number of records:  7501


### In this part we convert the DataFrame to a list of lists so that it can be passed as a parameter to the apriori function,
### and we remove NaN values so that no association is made with 'nan'

In [175]:
# Convert the DataFrame into a list of transactions: one list of movie-title
# strings per row, which is the shape handed to apriori afterwards.
# The underlying array is materialised once and every column is scanned, so the
# conversion does not depend on the file having exactly 20 columns (the previous
# hard-coded range raised IndexError on narrower files and ignored extra columns).
movie_rows = movies.to_numpy()
movies_list = [
    # Skip missing cells (NaN) so that 'nan' never appears as a movie title
    # in an association.
    [str(title) for title in row if pd.notna(title)]
    for row in movie_rows
]
print(movies_list[0])


['The Revenant', '13 Hours', 'Allied', 'Zootopia', 'Jigsaw', 'Achorman', 'Grinch', 'Fast and Furious', 'Ghostbusters', 'Wolverine', 'Mad Max', 'John Wick', 'La La Land', 'The Good Dunosaur', 'Ninja Turtles', 'The Good Dunosaur Bad Moms', '2 Guns', 'Inside Out', 'Valerian', 'Spiderman 3']


### Use the apriori algorithm to find the most frequent movie associations in the dataset

In [176]:
# Mine association rules from the transactions with apriori.
# The thresholds are grouped in one mapping so they can be read and tuned together.
apriori_thresholds = {
    "min_support": 0.0053,
    "min_confidence": 0.2,
    "min_lift": 3,
    "max_length": 2,
}
association_rules = ap.apriori(movies_list, **apriori_thresholds)

# Materialise the rules as a list so they can be iterated more than once.
association_results = [*association_rules]

print(association_results)
    

[RelationRecord(items=frozenset({'Green Lantern', 'Red Sparrow'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Red Sparrow'}), items_add=frozenset({'Green Lantern'}), confidence=0.3006993006993007, lift=3.790832696715049)]), RelationRecord(items=frozenset({'Green Lantern', 'Star Wars'}), support=0.005865884548726837, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Star Wars'}), items_add=frozenset({'Green Lantern'}), confidence=0.3728813559322034, lift=4.700811850163794)]), RelationRecord(items=frozenset({'Kung Fu Panda', 'Jumanji'}), support=0.015997866951073192, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Kung Fu Panda'}), items_add=frozenset({'Jumanji'}), confidence=0.3234501347708895, lift=3.2919938411349285)]), RelationRecord(items=frozenset({'Wonder Woman', 'Jumanji'}), support=0.005332622317024397, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Wonder Woman'}), items_add=frozenset({'Jumanji'})

### Extract the rules and their confidence, support, and lift from the apriori result

In [177]:
# Flatten the apriori output into one row per rule (base -> add), carrying the
# itemset's support together with the rule's confidence and lift.
results = []
for record in association_results:
    # One itemset can yield several rules; each becomes its own row.
    for rule in record.ordered_statistics:
        results.append(
            {
                # Join every item (sorted for a stable order) instead of taking
                # element [0]: the text is identical for single-movie sides, but
                # multi-item sides are no longer truncated in arbitrary set order
                # and an empty side yields "" rather than raising IndexError.
                "base_items": ", ".join(sorted(rule.items_base)),
                "add_items": ", ".join(sorted(rule.items_add)),
                "support": record.support,
                "confidence": rule.confidence,
                "lift": rule.lift,
            }
        )

# Explicit columns keep the frame's schema stable even when no rule passes the thresholds.
results_df = pd.DataFrame(
    results, columns=["base_items", "add_items", "support", "confidence", "lift"]
)
print(results_df)

              base_items      add_items   support  confidence      lift
0            Red Sparrow  Green Lantern  0.005733    0.300699  3.790833
1              Star Wars  Green Lantern  0.005866    0.372881  4.700812
2          Kung Fu Panda        Jumanji  0.015998    0.323450  3.291994
3           Wonder Woman        Jumanji  0.005333    0.377358  3.840659
4  The Spy Who Dumped Me    Spiderman 3  0.007999    0.271493  4.122410


### Second method to extract information without using a nested loop

In [178]:
# second method without second for loop
results2 = []
for record in association_results:
    items_base = list(record.ordered_statistics[0].items_base)[0]
    items_add = list(record.ordered_statistics[0].items_add)[0]
    confidence = record.ordered_statistics[0].confidence
    lift = record.ordered_statistics[0].lift
    support = record.support
    result_dict = {
        "base_items": items_base,
        "add_items": items_add,
        "support": support,
        "confidence": confidence,
        "lift": lift,
    }
    results2.append(result_dict)
results2 = pd.DataFrame(results2)
print(results2)

              base_items      add_items   support  confidence      lift
0            Red Sparrow  Green Lantern  0.005733    0.300699  3.790833
1              Star Wars  Green Lantern  0.005866    0.372881  4.700812
2          Kung Fu Panda        Jumanji  0.015998    0.323450  3.291994
3           Wonder Woman        Jumanji  0.005333    0.377358  3.840659
4  The Spy Who Dumped Me    Spiderman 3  0.007999    0.271493  4.122410
