<a href="https://colab.research.google.com/github/Kirtikaa25/redLight/blob/main/apriori2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd

# Load Excel
file_path = "/content/demographical_factors.xlsx"  # If you're in Google Colab
df = pd.read_excel(file_path)

df.head()


Unnamed: 0,Age Group,Gender,Occupation,Approximate monthly income of the person who violates traffic signals\n,Education level,Driving Experience,Have you ever crossed the street during a red light?
0,24.0,1,2,35000,4,6.0,0
1,24.0,1,1,18000,3,0.5,1
2,24.0,1,1,18000,3,3.0,1
3,24.0,1,1,18000,3,6.0,0
4,24.0,1,1,18000,3,3.0,1


In [8]:
# Mapping dictionaries
occupation_map = {1: 'Occupation_Student', 2: 'Occupation_Employed', 3: 'Occupation_Businessman', 4: 'Occupation_Retired'}
education_map = {1: 'Education level_Below_Metric', 2: 'Education level_Higher_Secondary', 3: 'Education level_Graduate', 4: 'Education level_Postgraduate'}
gender_map = {1: 'Gender_Male', 2: 'Gender_Female'}

# Apply mappings
df['Occupation'] = df['Occupation'].map(occupation_map)
df['Education level'] = df['Education level'].map(education_map)
df['Gender'] = df['Gender'].map(gender_map)

df.head()


Unnamed: 0,Age Group,Gender,Occupation,Approximate monthly income of the person who violates traffic signals\n,Education level,Driving Experience,Have you ever crossed the street during a red light?
0,24.0,Gender_Male,Occupation_Employed,35000,Education level_Postgraduate,6.0,0
1,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,0.5,1
2,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,3.0,1
3,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,6.0,0
4,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,3.0,1


In [9]:
# Bin Driving Experience
df['Driving_Exp_Bin'] = pd.cut(df['Driving Experience'],
                               bins=[0, 2, 4, 100],
                               labels=['Driving_Exp_0_2', 'Driving_Exp_3_4', 'Driving_Exp_5plus'])

# Rename income column for ease
df.rename(columns={'Approximate monthly income of the person who violates traffic signals\n': 'Income'}, inplace=True)

# Bin Income
df['Income_Bin'] = pd.cut(df['Income'],
                          bins=[0, 20000, 40000, 120000],
                          labels=['Income_upto20k', 'Income_21k_40k', 'Income_above_40k'])

# Bin Age
df['Age_Bin'] = pd.cut(df['Age Group'],
                       bins=[16, 25, 35, 45, 60],
                       labels=['Age_20_25', 'Age_26_35', 'Age_36_45', 'Age_46plus'])

df.head()


Unnamed: 0,Age Group,Gender,Occupation,Income,Education level,Driving Experience,Have you ever crossed the street during a red light?,Driving_Exp_Bin,Income_Bin,Age_Bin
0,24.0,Gender_Male,Occupation_Employed,35000,Education level_Postgraduate,6.0,0,Driving_Exp_5plus,Income_21k_40k,Age_20_25
1,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,0.5,1,Driving_Exp_0_2,Income_upto20k,Age_20_25
2,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,3.0,1,Driving_Exp_3_4,Income_upto20k,Age_20_25
3,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,6.0,0,Driving_Exp_5plus,Income_upto20k,Age_20_25
4,24.0,Gender_Male,Occupation_Student,18000,Education level_Graduate,3.0,1,Driving_Exp_3_4,Income_upto20k,Age_20_25


In [10]:
df['Items'] = df[['Gender', 'Education level', 'Occupation',
                  'Driving_Exp_Bin', 'Income_Bin', 'Age_Bin']].values.tolist()
df[['Items', 'Have you ever crossed the street during a red light?']].head()


Unnamed: 0,Items,Have you ever crossed the street during a red light?
0,"[Gender_Male, Education level_Postgraduate, Oc...",0
1,"[Gender_Male, Education level_Graduate, Occupa...",1
2,"[Gender_Male, Education level_Graduate, Occupa...",1
3,"[Gender_Male, Education level_Graduate, Occupa...",0
4,"[Gender_Male, Education level_Graduate, Occupa...",1


In [14]:
# Ensure all items are strings
from mlxtend.preprocessing import TransactionEncoder

transactions_str = [[str(item) for item in transaction] for transaction in df['Items']]

# Now encode
te = TransactionEncoder()
te_ary = te.fit(transactions_str).transform(transactions_str)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)


In [15]:
transaction_df.head()


Unnamed: 0,Age_20_25,Age_36_45,Age_46plus,Driving_Exp_0_2,Driving_Exp_3_4,Driving_Exp_5plus,Education level_Below_Metric,Education level_Graduate,Education level_Higher_Secondary,Education level_Postgraduate,Gender_Female,Gender_Male,Income_21k_40k,Income_above_40k,Income_upto20k,Occupation_Businessman,Occupation_Employed,Occupation_Retired,Occupation_Student,nan
0,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,False,True,False,False,False
1,True,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False
2,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False
3,True,False,False,False,False,True,False,True,False,False,False,True,False,False,True,False,False,False,True,False
4,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules


In [23]:
# STEP 1: Run the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(transaction_df, min_support=0.1, use_colnames=True)

# STEP 2: Generate association rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# STEP 3: Add the target variable to identify influential rules
# Assuming you have the target variable (violated or not) in a separate column `target_series`
# and it aligns with transaction_df row-wise:
# Extract the target column as a separate Series
target_series = df["Have you ever crossed the street during a red light?"]
# Add target column to the one-hot encoded transaction_df
transaction_df["target"] = target_series.values

transaction_df["target"] = target_series.values  # make sure target_series is a pandas Series

# STEP 4: Split the transactions into violators and non-violators
violators_df = transaction_df[transaction_df["target"] == 1].drop(columns=["target"])
non_violators_df = transaction_df[transaction_df["target"] == 0].drop(columns=["target"])

# STEP 5: Run apriori separately for violators and non-violators
frequent_violators = apriori(violators_df, min_support=0.1, use_colnames=True)
frequent_non_violators = apriori(non_violators_df, min_support=0.1, use_colnames=True)

# STEP 6: Find combinations that occur frequently in violators but NOT in non-violators
set_non_violators = set(frozenset(itemset) for itemset in frequent_non_violators['itemsets'])
only_in_violators = frequent_violators[~frequent_violators['itemsets'].isin(set_non_violators)]

# STEP 7: Filter for combinations with more than one item (i.e., combination of features)
combinations_becoming_influential = only_in_violators[only_in_violators['itemsets'].apply(lambda x: len(x) > 1)]

# STEP 8: Show the final result
print("Combinations of non-influential features that become influential together:")
print(combinations_becoming_influential.sort_values(by='support', ascending=False))



Combinations of non-influential features that become influential together:
      support                                           itemsets
133  0.270408  (Education level_Higher_Secondary, Occupation_...
67   0.224490            (Occupation_Employed, Income_above_40k)
137  0.204082  (Occupation_Employed, Income_above_40k, Gender...
53   0.178571       (Education level_Graduate, Income_above_40k)
125  0.173469  (Education level_Graduate, Income_above_40k, G...
..        ...                                                ...
115  0.102041  (Education level_Higher_Secondary, Driving_Exp...
124  0.102041  (Education level_Below_Metric, Occupation_Empl...
110  0.102041  (Education level_Below_Metric, Driving_Exp_5pl...
150  0.102041  (Age_36_45, Occupation_Employed, Driving_Exp_3...
143  0.102041  (Age_20_25, Driving_Exp_5plus, Income_upto20k,...

[66 rows x 2 columns]




In [24]:
from IPython.display import display

display(frequent_non_violators)


Unnamed: 0,support,itemsets
0,0.711921,(Age_20_25)
1,0.226821,(Age_36_45)
2,0.276490,(Driving_Exp_0_2)
3,0.380795,(Driving_Exp_3_4)
4,0.342715,(Driving_Exp_5plus)
...,...,...
158,0.100993,"(Income_upto20k, Education level_Graduate, nan..."
159,0.109272,"(Driving_Exp_0_2, Income_upto20k, Age_20_25, E..."
160,0.105960,"(Income_upto20k, Age_20_25, Gender_Male, Drivi..."
161,0.139073,"(Income_upto20k, Age_20_25, Gender_Male, Educa..."
