
> Format Of An Apriori Result Demo 

    #These are the items that are included in this OrderedStatistic
    items=frozenset({'Certification Course: Yes', 'daily studing time: 1 - 2 Hour', 'Department: Commerce', 'Gender: Female'})

    #How often all of these items appear together: the fraction of all transactions that contain the whole itemset
    support=0.05531914893617021

The Rules
 
ordered_statistics=[
  OrderedStatistic(

    #items_base => The primary item/s.
    items_base = frozenset({'daily studing time: 1 - 2 Hour', 'Department: Commerce'}), 
    
    #items_add => The secondary item/s. If I get items_base, I am likely to also get items_add
    items_add=frozenset({'Certification Course: Yes', 'Gender: Female'}), 
    
    #How likely items_add are to appear in a transaction that already contains items_base
    confidence=0.7222222222222222, 

    #How much more often items_base and items_add appear together than expected if they were independent (greater than one: they tend to come together; less than one: they tend to come separately)
    lift=3.0858585858585856
    )
  ]

In [6]:
### Data preprocessing

import pandas as pd

# Load the survey and remove the continuous numeric columns; only the
# categorical / label-like columns are kept as Apriori items.
dataframe = pd.read_csv("Student_Behaviour.csv")
dataframe = dataframe.drop(columns = ["Height(CM)", "Weight(KG)", "10th Mark", "12th Mark", "college mark"])

# Change salary (numeric) column to string values.
# The bins are left-closed (right = False): [20000, 50000) and [50000, inf),
# i.e. exactly 20000 is "Between ..." and exactly 50000 is "Greater Than ...".
# A missing salary stays missing here and becomes the item
# "salary expectation: nan" in the prefixing step below.
salary = pd.to_numeric(dataframe["salary expectation"])
dataframe["salary expectation"] = pd.cut(
  salary,
  bins = [float("-inf"), 20000, 50000, float("inf")],
  labels = ["Less Than 20,000$", "Between 20,000$ and 50,000$", "Greater Than 50,000$"],
  right = False,
)

# Add column name before the value, to make the interpretation in the end easier.
# Every column is converted to strings here, which also covers the percentage
# ("willingness to pursue a career ...") column, so it needs no separate step.
# Whole-column assignment avoids chained indexing and per-cell dtype upcasts.
for column_name in dataframe.columns:
  dataframe[column_name] = f"{column_name}: " + dataframe[column_name].astype(str)


In [7]:
### Build Apriori Model

def visualize_rules_2(list_of_results):
    """Summarise pairwise apriori results as one DataFrame row per record.

    Each record is read positionally as ``(items, support, ordered_statistics)``
    and each ordered statistic as ``(items_base, items_add, confidence, lift)``.

    Only the FIRST ordered statistic of a record is reported, and only the
    first item of its base and of its add set, so any further rules attached
    to the same record are not included in the table.

    Parameters
    ----------
    list_of_results : iterable
        Apriori result records, indexable as described above.

    Returns
    -------
    pandas.DataFrame
        Columns "Primary Behavior", "Consequent Behavior", "Support",
        "Confidence" and "Lift", one row per record.
    """
    column_names = ["Primary Behavior", "Consequent Behavior", "Support", "Confidence", "Lift"]

    rows = []
    for record in list_of_results:
        first_rule = record[2][0]
        rows.append([
            list(first_rule[0])[0],  # primary behavior (first item of the base set)
            list(first_rule[1])[0],  # consequent behavior (first item of the add set)
            record[1],               # support of the whole itemset
            first_rule[2],           # confidence
            first_rule[3],           # lift
        ])

    return pd.DataFrame(rows, columns = column_names)

from apyori import apriori

# Thresholds for mining pairwise (two-item) rules.
# NOTE(review): apyori's documented keyword arguments are min_support,
# min_confidence, min_lift and max_length - confirm that min_length has any effect.
apriori_options_2_items = {
  "min_support": 0.01,
  "min_confidence": 0.2,
  "min_lift": 3,
  "min_length": 2,
  "max_length": 2,
}

# Each row of the preprocessed frame is one transaction of "column: value" items
rules = apriori(transactions = dataframe.values, **apriori_options_2_items)

# apriori returns a lazy generator; materialise it so it can be reused
results_2_items = [record for record in rules]

# Tabulate the rules and show the ten with the highest lift
dataframe_2_items = visualize_rules_2(results_2_items)
dataframe_2_items.nlargest(10, "Lift")

Unnamed: 0,Primary Behavior,Consequent Behavior,Support,Confidence,Lift
0,Department: B.com Accounting and Finance,daily studing time: More Than 4 hour,0.012766,0.2,5.875
3,Do you like your degree?: No,willingness to pursue a career based on their degree : 25%,0.025532,0.3,5.035714
8,Travelling Time : more than 3 hour,daily studing time: 3 - 4 hour,0.012766,0.3,4.7
7,Financial Status: Fabulous,prefer to study in: Night,0.012766,0.75,3.75
5,Financial Status: Awful,Travelling Time : 1.30 - 2 hour,0.017021,0.285714,3.730159
4,Financial Status: Awful,Stress Level : Awful,0.017021,0.285714,3.533835
1,Department: B.com Accounting and Finance,"salary expectation: Greater Than 50,000$",0.017021,0.266667,3.481481
2,Do you like your degree?: No,Financial Status: Awful,0.017021,0.2,3.357143
6,Financial Status: Fabulous,Travelling Time : 0 - 30 minutes,0.012766,0.75,3.204545
9,social medai & video: 0 Minute,willingness to pursue a career based on their degree : 100%,0.012766,0.6,3.204545


In [11]:
### Build model to analyze relationship of 3 factors
### Then visualize rules in a dataframe

# NOTE: I used a different approach from the one used for min_length/max_length = 2, because with larger relations the number of items on each side of a rule varies. For example, some relations will have 2 items, some 3... making it more complex to capture all of them in separate columns.

def visualize_rules_3(list_of_results):
    """Flatten apriori results into a DataFrame with one row per rule.

    Each record is read positionally as ``(items, support, ordered_statistics)``
    and each ordered statistic as ``(items_base, items_add, confidence, lift)``.
    Every ordered statistic of every record produces its own row; a record
    without any ordered statistics contributes no rows.

    Parameters
    ----------
    list_of_results : iterable
        Apriori result records, indexable as described above.

    Returns
    -------
    pandas.DataFrame
        Columns "Primary Behavior/s" and "Consequent Behavior/s" (lists, because
        a rule may have one base item and two added items or the other way
        round), "Support", "Confidence" and "Lift". Confidence and Lift are
        floats rounded to two significant digits.

    Side effects
    ------------
    Sets the pandas option ``display.max_colwidth`` to ``None`` and prints the
    number of rows in the returned frame.
    """
    values = []
    for result in list_of_results:
        support = result[1]

        # Loop over the ordered statistics because a record can carry more than
        # one rule; each rule is appended inside the loop so none is dropped.
        for rule in result[2]:
            values.append([
                list(rule[0]),           # items_base
                list(rule[1]),           # items_add
                support,
                # ".2" keeps two significant digits; converting back to float
                # keeps both columns numeric so the frame can be sorted by them.
                float(f"{rule[2]:.2}"),  # confidence
                float(f"{rule[3]:.2}"),  # lift
            ])

    dataframe = pd.DataFrame(values, columns = ["Primary Behavior/s", "Consequent Behavior/s", "Support", "Confidence", "Lift"])

    pd.set_option('display.max_colwidth', None)  # Ensures that individual columns are not truncated

    print(f"Length of Dataframe: {len(dataframe)}")
    return dataframe

# Mine rules over the preprocessed transactions (one row = one transaction).
# NOTE(review): apyori's documented keyword arguments are min_support,
# min_confidence, min_lift and max_length; min_length does not appear to be
# honoured, so the itemset size is enforced explicitly below - confirm.
rules_3_items = apriori(
  transactions = dataframe.values,
  min_support = 0.02,
  min_confidence = 0.6,
  min_lift = 3,
  min_length = 3,
  max_length = 3
)

# Keep only records whose itemset (position 0 of the record) has exactly three
# items. This is a no-op if the library already applied min_length.
results_3_items = [record for record in rules_3_items if len(record[0]) == 3]

# Store the rule table under its own name instead of rebinding `dataframe`:
# `dataframe` is the transactions input of the mining cells, so overwriting it
# would make re-running them mine the rule table instead of the survey data.
dataframe_3_items = visualize_rules_3( list_of_results = results_3_items )
dataframe_3_items.nlargest( n = 10, columns = ["Lift"])

Length of Dataframe: 18


Unnamed: 0,Primary Behavior/s,Consequent Behavior/s,Support,Confidence,Lift
15,"[daily studing time: 2 - 3 hour, salary expectation: Less Than 20,000$]",[willingness to pursue a career based on their degree : 100%],0.021277,0.83,4.5
3,"[prefer to study in: Morning, Department: B.com ISM]",[social medai & video: 1 - 30 Minute],0.029787,0.88,4.4
4,"[Department: B.com ISM, social medai & video: 30 - 60 Minute]",[prefer to study in: Night],0.029787,0.88,4.4
10,"[social medai & video: 1.30 - 2 hour, Gender: Female]",[hobbies: Reading books],0.021277,0.62,4.1
16,"[hobbies: Cinema, daily studing time: 3 - 4 hour]",[willingness to pursue a career based on their degree : 100%],0.021277,0.71,3.8
17,"[prefer to study in: Anytime, daily studing time: 3 - 4 hour]",[willingness to pursue a career based on their degree : 100%],0.021277,0.71,3.8
1,"[Travelling Time : 0 - 30 minutes, Department: B.com ISM]",[social medai & video: 1 - 30 Minute],0.021277,0.71,3.6
8,"[Financial Status: good, Stress Level : Awful]",[daily studing time: 0 - 30 minute],0.021277,0.71,3.6
11,"[Gender: Male, daily studing time: 3 - 4 hour]",[part-time job: Yes],0.021277,0.62,3.6
9,"[social medai & video: 1.30 - 2 hour, Gender: Female]",[daily studing time: 1 - 2 Hour],0.029787,0.88,3.4
