# Apriori N Grams

This notebook uses the Apriori algorithm to calculate the support, confidence, lift, and relevance (my own metric, defined as confidence × lift) for every ordered pair of unique n-gram terms from the documents that meets the minimum support threshold, and stores the values in a DataFrame that is exported to CSV for future reference.

### Input: NgramIdentification.csv
### Output: Apriori_Ngrams.csv

In [None]:
import ast

import pandas as pd
from apyori import apriori

In [None]:
# Load the input table of n-grams into a DataFrame.
ngram_df = pd.read_csv("NgramIdentification.csv")

In [None]:
# Extract unique terms from the 'Ngrams' column.
# Assumes each cell holds a list serialized as a string (e.g. "['a', 'b']").
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code embedded in the input file; a malformed cell raises instead.
# Values that are not strings are passed through unchanged.
ngram_df['Ngrams'] = ngram_df['Ngrams'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# Distinct terms across all rows.
unique_terms = {term for ngrams in ngram_df['Ngrams'] for term in ngrams}

In [None]:
# Each row's list of n-grams is treated as one transaction.
transactions = ngram_df['Ngrams'].to_list()

# Run apyori's Apriori over the transactions, limited to itemsets of at most
# two terms. Only the support threshold is passed explicitly; min_confidence
# and min_lift are left at the library defaults (tune all three as needed).
min_support = 0.001
results = [
    *apriori(
        transactions,
        max_length=2,
        min_support=min_support,
    )
]

In [None]:
# Dump the raw Apriori records for manual inspection.
print(results)

In [None]:
# Flatten the Apriori results into one row per directed rule (Base -> Add).
# Support comes from the itemset-level record; confidence and lift come from
# each ordered statistic. Relevance is the author's own metric:
# confidence * lift.
rule_columns = ["Base", "Add", "Support", "Confidence", "Lift", "Relevance"]
rules = []

for result in results:
    for ordered_stat in result.ordered_statistics:
        # Skip statistics where either side of the rule is empty.
        if not ordered_stat.items_base or not ordered_stat.items_add:
            continue

        rules.append({
            # Both sides are stored as lists of terms, not bare strings.
            "Base": list(ordered_stat.items_base),
            "Add": list(ordered_stat.items_add),
            "Support": result.support,
            "Confidence": ordered_stat.confidence,
            "Lift": ordered_stat.lift,
            "Relevance": ordered_stat.confidence * ordered_stat.lift,
        })

# Passing the columns explicitly keeps the schema (and hence the CSV header)
# intact even when no rule qualified and `rules` is empty.
rules_df = pd.DataFrame(rules, columns=rule_columns)

In [None]:
# Persist the rule table; index=False leaves the DataFrame's row index out of the file.
rules_df.to_csv("Apriori_Ngrams.csv", index=False)