# Merge with TNCO

In [14]:
import pandas as pd

# --- Load data (adjust paths if needed) ---
# Left side: the TNCO characteristics table. Right side: the hand-coded
# lookup table. The two are joined on the product description (DESCRIP).
tnco_path = "Characteristics/upccig_with_tnco.csv"
hardcoded_path = "unique_upccig_hardcoded.csv"

tnco = pd.read_csv(tnco_path)
hard = pd.read_csv(hardcoded_path)

# --- Standardize merge key ---
for df in (tnco, hard):
    # Fail early with a clear message if the join key is missing.
    if "DESCRIP" not in df.columns:
        raise KeyError("Expected a 'DESCRIP' column in both dataframes.")
    # Trim surrounding whitespace so keys differing only by padding match.
    # NOTE: astype(str) turns a missing description into the literal "nan".
    df["DESCRIP"] = df["DESCRIP"].astype(str).str.strip()

# --- 1) Rename 'Generic' in each df ---
# Give each side's 'Generic' column a distinct, descriptive name so the two
# stay distinguishable after the merge. rename() ignores labels that are
# absent, so no existence check is needed.
tnco = tnco.rename(columns={"Generic": "Generic_automated"})
hard = hard.rename(columns={"Generic": "Generic_hardcoded"})

# optional: drop obviously empty helper column if present
hard = hard.drop(columns=["Unnamed: 1"], errors="ignore")

# --- 2) Drop 'Menthol' from the tnco dataset ---
# After this, any 'Menthol' column in the result comes from the hardcoded table.
tnco = tnco.drop(columns=["Menthol"], errors="ignore")

# Remove rows whose description is "CIGARETTES B2G1MUG" (case-insensitive).
tnco = tnco[tnco["DESCRIP"].str.upper() != "CIGARETTES B2G1MUG"]

# --- 3) Merge over DESCRIP ---
# Left join: every tnco row is kept; hardcoded columns are NaN where no
# description matches.
merged = tnco.merge(hard, on="DESCRIP", how="left")

# --- Sanity check ---
print(tnco.shape, hard.shape, merged.shape)

# A left join can only add rows, and only when a key is duplicated on the
# right side. Stop before writing a file with silently duplicated products.
if len(merged) != len(tnco):
    raise ValueError(
        f"Left merge changed the row count from {len(tnco)} to {len(merged)}; "
        f"'DESCRIP' values are duplicated in {hardcoded_path}."
    )

merged.to_csv("Characteristics/upccig_with_tnco_HARD.csv", index=False)


(940, 21) (95, 11) (940, 31)


# Merge clean

In [17]:
import pandas as pd

# --- Load data (adjust paths if needed) ---
# Left side: the cleaned characteristics table (kept under the name `tnco`).
# Right side: the hand-coded lookup table. Joined on DESCRIP.
tnco_path = "Characteristics/upccig_clean.csv"
hardcoded_path = "unique_upccig_hardcoded.csv"

tnco = pd.read_csv(tnco_path)
hard = pd.read_csv(hardcoded_path)

# --- Standardize merge key ---
for df in (tnco, hard):
    # Fail early with a clear message if the join key is missing.
    if "DESCRIP" not in df.columns:
        raise KeyError("Expected a 'DESCRIP' column in both dataframes.")
    # Trim surrounding whitespace so keys differing only by padding match.
    # NOTE: astype(str) turns a missing description into the literal "nan".
    df["DESCRIP"] = df["DESCRIP"].astype(str).str.strip()

# --- 1) Rename 'Generic' in each df ---
# Give each side's 'Generic' column a distinct, descriptive name so the two
# stay distinguishable after the merge. rename() ignores labels that are
# absent, so no existence check is needed.
tnco = tnco.rename(columns={"Generic": "Generic_automated"})
hard = hard.rename(columns={"Generic": "Generic_hardcoded"})

# optional: drop obviously empty helper column if present
hard = hard.drop(columns=["Unnamed: 1"], errors="ignore")

# --- 2) Drop 'Menthol' from the left-hand dataset ---
# After this, any 'Menthol' column in the result comes from the hardcoded table.
tnco = tnco.drop(columns=["Menthol"], errors="ignore")

# Remove rows whose description is "CIGARETTES B2G1MUG" (case-insensitive).
tnco = tnco[tnco["DESCRIP"].str.upper() != "CIGARETTES B2G1MUG"]

# --- 3) Merge over DESCRIP ---
# Left join: every left-hand row is kept; hardcoded columns are NaN where no
# description matches.
merged = tnco.merge(hard, on="DESCRIP", how="left")

# --- Sanity check ---
print(tnco.shape, hard.shape, merged.shape)

# A left join can only add rows, and only when a key is duplicated on the
# right side. Stop before writing a file with silently duplicated products.
if len(merged) != len(tnco):
    raise ValueError(
        f"Left merge changed the row count from {len(tnco)} to {len(merged)}; "
        f"'DESCRIP' values are duplicated in {hardcoded_path}."
    )

merged.to_csv("Characteristics/upccig_clean_HARD.csv", index=False)


(940, 17) (95, 11) (940, 27)


In [19]:
# Show the current `merged` DataFrame (a notebook renders a cell's last expression).
merged

Unnamed: 0,COM_CODE,UPC,DESCRIP,SIZE,CASE,NITEM,brand,size,pack,Dlx,...,Cigarette,Generic_hardcoded,Value,Cigar,Snuff,Loose tobacco,Flavored,Menthol,Premium,Implied discount
0,700,190,KING CARTON CIGS (PL,1 CT,1,86040,,King,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,700,191,100'S CARTON CIGS (P,1 CT,1,86060,,100,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
2,705,193,25 CT SINGLE PACK CI,1 CT,8,80080,,Reg,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
3,705,194,GENERIC SINGLE,1 CT,10,80020,,Reg,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
4,700,195,KING CARTON PRICE VA,1 CT,1,86050,,King,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,700,8030071013,KING CARTON,10 PK,1,86000,,King,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
936,700,8030084613,SUB GENERIC KING CAR,10 CT,1,86030,,King,UNK,0,...,1.0,1.0,1.0,0.0,0.0,0.0,0,0,0.0,0.0
937,705,8030084699,CIGARETTES (SINGLE P,1 CT,1,86110,,Reg,UNK,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
938,705,8640912345,PARODI AMEZZA,5 EA,20,88700,,Reg,UNK,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1.0,0.0
