In [21]:
!pip3 install -q pandas numpy scikit-learn sentence-transformers
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


purchases_df = pd.read_csv("purchases.csv")
## lineItems_df = pd.read_csv("lineItems.csv") not being used yet




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


## Pre-processing and feature engineering (purchases.csv)

In [None]:


# Group categories by semantic similarity so that each purchase amount can be
# compared against the amounts of *related* categories, not only its own.

# Distinct, non-null category names, in order of first appearance.
categories = purchases_df["Category"].dropna().unique().tolist()

# Sentence-embedding model used to relate category names to each other.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding vector per category string, then the pairwise cosine
# similarity between all of them (row i / column j <-> categories[i] / [j]).
cat_embeddings = model.encode(categories, convert_to_numpy=True)
sim_matrix = cosine_similarity(cat_embeddings)

# Minimum cosine similarity for two categories to count as related.
similarity_threshold = 0.6

# category -> list of categories whose similarity to it reaches the threshold.
# A category normally appears in its own group (self-similarity is 1).
related_groups = {
    cat: [
        other
        for other, score in zip(categories, sim_row)
        if score >= similarity_threshold
    ]
    for cat, sim_row in zip(categories, sim_matrix)
}

# Amount statistics over each category's group of related categories.
# rel_stats maps category -> {"rel_mean", "rel_std"}, computed over every row
# whose Category belongs to that category's related group.
rel_stats = {}
for cat, rel_cats in related_groups.items():
    mask = purchases_df["Category"].isin(rel_cats)
    amounts = purchases_df.loc[mask, "Amount"]
    rel_stats[cat] = {
        "rel_mean": amounts.mean(),
        "rel_std": amounts.std(),
    }

# Map through plain dicts rather than `lambda c: rel_stats[c][...]`: a function
# passed to Series.map is also called on NaN categories and would raise
# KeyError, because NaN was dropped when the category list was built.
# Dict-based mapping yields NaN for any category without stats instead.
rel_mean_by_cat = {cat: stats["rel_mean"] for cat, stats in rel_stats.items()}
rel_std_by_cat = {cat: stats["rel_std"] for cat, stats in rel_stats.items()}
purchases_df["rel_mean"] = purchases_df["Category"].map(rel_mean_by_cat)
purchases_df["rel_std"] = purchases_df["Category"].map(rel_std_by_cat)

# A zero spread would make the z-score divide by zero; treat it as undefined.
purchases_df["rel_std"] = purchases_df["rel_std"].replace(0, np.nan)

# Z-score of amount relative to the "semantically related" category group.
# Undefined scores (missing category/amount, zero or undefined std) become 0.
purchases_df["z_amount_related"] = (
    (purchases_df["Amount"] - purchases_df["rel_mean"]) / purchases_df["rel_std"]
)
purchases_df["z_amount_related"] = purchases_df["z_amount_related"].fillna(0.0)


# 90th / 95th percentile of Amount across all purchases.
global_q90, global_q95 = (
    purchases_df["Amount"].quantile(q) for q in (0.90, 0.95)
)

# 0/1 indicator columns: is the amount strictly above each global cutoff?
for flag_col, cutoff in (
    ("is_large_global", global_q90),
    ("is_very_large_global", global_q95),
):
    exceeds_cutoff = purchases_df["Amount"] > cutoff
    purchases_df[flag_col] = exceeds_cutoff.astype(int)


# Numeric training target, only when the data is labelled.
# Labels other than "typical"/"extraneous" map to NaN.
if "label" in purchases_df.columns:
    label_codes = {"typical": 0, "extraneous": 1}
    purchases_df["y_label"] = purchases_df["label"].map(label_codes)


# Seasonality analysis: calendar parts of the purchase date plus per-month and
# per-category transaction counts.

purchases_df["Date"] = pd.to_datetime(purchases_df["Date"], format="ISO8601")
purchases_df["Month"] = purchases_df["Date"].dt.month
purchases_df["DayOfWeek"] = purchases_df["Date"].dt.dayofweek
purchases_df["YearMonth"] = purchases_df["Date"].dt.to_period("M").astype(str)

# dropna=False keeps rows with a missing Category in a group of their own.
# With the default (dropna=True) those rows get NaN counts, and the NaN then
# flows into month_cat_share / category_freq and on into the feature matrix.
# When no Category is missing the result is identical to the default.
purchases_df["month_cat_count"] = (
    purchases_df
    .groupby(["YearMonth", "Category"], dropna=False)["Transaction_ID"]
    .transform("count")
)
purchases_df["month_total"] = (
    purchases_df.groupby("YearMonth")["Transaction_ID"].transform("count")
)

# Share of a month's transactions that fall in the row's category.
purchases_df["month_cat_share"] = (
    purchases_df["month_cat_count"] / purchases_df["month_total"]
)
# Number of transactions in the row's category over the whole dataset.
purchases_df["category_freq"] = (
    purchases_df
    .groupby("Category", dropna=False)["Transaction_ID"]
    .transform("count")
)

# Columns fed to the model, in the order the scaler and classifier see them.
feature_cols = [
    "Amount",
    "z_amount_related",
    "is_large_global",
    "is_very_large_global",
    "month_cat_share",
    "category_freq",
    # TODO: seasonal features (e.g. Month, DayOfWeek) are not included yet.
]

# y_label is only created when the CSV has a `label` column, so fail here with
# an actionable message instead of an opaque KeyError from dropna below.
if "y_label" not in purchases_df.columns:
    raise KeyError(
        "'y_label' column is missing: purchases.csv needs a 'label' column "
        "with values 'typical'/'extraneous' to train the model"
    )

# Keep only rows with a usable label (unmapped / missing labels are NaN).
purchases_model = purchases_df.dropna(subset=["y_label"]).copy()

x = purchases_model[feature_cols].values
y = purchases_model["y_label"].values

# NOTE(review): rel_mean/rel_std, the global quantiles and the count features
# were computed on the full dataset before this split, so the held-out sets
# are not fully independent of the training data - confirm this is acceptable.
# NOTE(review): the splits are not stratified; consider stratify=y if the
# "extraneous" class is rare.

# 80% train; the remaining 20% is halved into test and validation (10% each).
x_train, x_rest, y_train, y_rest = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
    x_rest, y_rest, test_size=0.5, random_state=42
)

# Standardise using statistics from the training split only, then apply the
# same transform to validation and test.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)




## Find the best polynomial degree

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Sweep polynomial feature degrees, fit a logistic regression for each, and
# keep the degree with the best validation accuracy.
# The range 1..5 matches the five values per list in the previously recorded
# output of this cell - adjust as needed.
degrees = range(1, 6)

train_accs = []
val_accs = []
best_degree = None
best_val_acc = float("-inf")

for degree in degrees:
    candidate_poly = PolynomialFeatures(degree=degree, include_bias=False)
    candidate_train = candidate_poly.fit_transform(x_train)
    candidate_val = candidate_poly.transform(x_val)

    candidate_clf = LogisticRegression(max_iter=10000)
    candidate_clf.fit(candidate_train, y_train)

    train_acc = accuracy_score(y_train, candidate_clf.predict(candidate_train))
    val_acc = accuracy_score(y_val, candidate_clf.predict(candidate_val))
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    # Strict ">" keeps the lowest (simplest) degree when accuracies tie.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_degree = degree
        # Names used by the save cell: the selected transform and classifier.
        poly = candidate_poly
        clf = candidate_clf
        x_train_poly = candidate_train
        x_val_poly = candidate_val

# Train / validation accuracy per degree, in sweep order.
print(train_accs)
print(val_accs)


[0.9865196078431373, 0.991421568627451, 0.9926470588235294, 0.9963235294117647, 0.9963235294117647]
[0.9901960784313726, 0.9901960784313726, 0.9803921568627451, 0.9803921568627451, 0.9901960784313726]


## Save model artifacts (classifier, scaler, polynomial transform, feature metadata)


In [30]:
import joblib

# Bundle the fitted objects and the statistics computed during pre-processing
# into one dict and persist it as a single file.
# NOTE(review): feature_cols includes category_freq and month_cat_share, but
# nothing saved here lets a consumer recompute them - confirm how inference
# code derives those two features.
artifacts = dict(
    model=clf,                  # fitted LogisticRegression
    scaler=scaler,              # StandardScaler fitted on the training split
    poly=poly,                  # PolynomialFeatures transform used before the model
    feature_cols=feature_cols,  # column order expected by scaler/poly/model
    rel_stats=rel_stats,        # category -> {"rel_mean", "rel_std"}
    global_q90=global_q90,      # 90th percentile of Amount
    global_q95=global_q95,      # 95th percentile of Amount
)

joblib.dump(artifacts, "extraneous_model.pkl")

['extraneous_model.pkl']