In [2]:
import os
import sys
import json
import pandas as pd
from datasets import Dataset, DatasetDict

sys.path.append(os.path.abspath(".."))
from utils import (
    load_data,
    process_labels,
    CustomTokenizer,
    CustomTrainer,
)

In [3]:
# Set up path
excel_fpath = r"D:\Priyanshu\Wysa\dataset.xlsx"
mappings_fpath = r"D:\Priyanshu\Wysa\code\mappings\mapping.json"

### Load Augmented Data

In [4]:
df_aug = pd.read_csv("aug_data.csv")
df_aug.head()

Unnamed: 0,id,category
0,5,iPad App
1,15,iPad
2,31,Android App
3,32,iPad
4,33,iPad


### Clean the Augmented Data

In [5]:
valid_category_list = [
    "iPhone",
    "iPad or iPhone App",
    "iPad",
    "Google",
    "Android",
    "Apple",
    "Android App",
    "Other Google product or service",
    "Other Apple product or service",
]

mapping = {
    "IPad": "iPad",
    "Other Apple Product or Service": "Other Apple product or service",
}

In [6]:
def is_valid_category(category):
    if pd.isna(category):
        return False
    return category.lower() in [item.lower() for item in valid_category_list]

In [7]:
df_aug_cleaned = df_aug[df_aug["category"].apply(is_valid_category)]
df_aug_cleaned.loc[:, "category"] = df_aug_cleaned["category"].replace(mapping)

print(f"No. of augmented samples: {len(df_aug_cleaned)}")
print(f"No. of classes: {len(df_aug_cleaned['category'].unique())}")

No. of augmented samples: 4550
No. of classes: 9


### Merge Augmented Data with Original Data

In [8]:
# Get original data
df_orig = load_data(excel_fpath, "Train")
df_orig.head()

Unnamed: 0,tweet,brand_product_name,emotion_category
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [9]:
# Merge brand product categories
if all(idx in df_orig.index.tolist() for idx in df_aug_cleaned["id"].tolist()):
    df_orig["brand_product_name"].update(df_aug_cleaned["category"])
    df_merged = df_orig.dropna(subset=["brand_product_name"])
    # df_merged.reset_index(drop=True, inplace=True)
else:
    print("Semething wrong...")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_orig["brand_product_name"].update(df_aug_cleaned["category"])


In [10]:
df_merged.isnull().sum()

tweet                 0
brand_product_name    0
emotion_category      0
dtype: int64

### Process Labels

In [11]:
with open(mappings_fpath, "r") as file:
    mappings = json.load(file)
file.close()

In [12]:
df_labeled = process_labels(
    df_merged,
    ["brand_product_name", "emotion_category"],
    [mappings["categories"], mappings["emotions"]],
)
df_labeled.sample(5)

Unnamed: 0,tweet,brand_product_name,emotion_category,brand_product_name_label,emotion_category_label
3172,Props to GSDM and the big G. (google) at #SXSW...,Google,Positive emotion,7,1
4655,At the google dev event surrounded by people m...,Other Google product or service,No emotion toward brand or product,3,2
4927,RT @mention #fsw #sxsw for those of you who wa...,iPad,No emotion toward brand or product,1,2
2594,"Foursquare ups the game, just in time for #SXS...",Other Apple product or service,Positive emotion,5,1
2847,"#SXSW movers &amp; shakers, @mention is publis...",Other Google product or service,No emotion toward brand or product,3,2


### Train Emotion Recognition Model

In [13]:
NUM_EMOTIONS = len(df_labeled["emotion_category_label"].unique())
NUM_EMOTIONS

3

In [14]:
dataset = Dataset.from_pandas(df_labeled[["tweet", "emotion_category_label"]])
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Dataset({
    features: ['tweet', 'emotion_category_label'],
    num_rows: 6001
})

In [15]:
# Get tokenizer
tokenizer = CustomTokenizer(model_id="bert-base-cased")

# Tokenize the data (tweets)
dataset = tokenizer(docs=dataset, column="tweet")
dataset = dataset.rename_column("emotion_category_label", "labels")
dataset = DatasetDict({"train": dataset})
dataset

Map:   0%|          | 0/6001 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tweet', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6001
    })
})

In [None]:
# Get trainer
trainer = CustomTrainer(
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    model_id="bert-base-cased",
    num_classes=NUM_EMOTIONS,
    output_dir=r"D:\Priyanshu\Wysa\checkpoints\emotion_recog",
    num_train_epochs=10,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


In [None]:
# Train the model
trainer()

### Train Brand/Product Recognition Model

In [25]:
NUM_CATEGORIES = len(df_labeled["brand_product_name_label"].unique())
NUM_CATEGORIES

9

In [23]:
dataset = Dataset.from_pandas(df_labeled[["tweet", "brand_product_name_label"]])
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Dataset({
    features: ['tweet', 'brand_product_name_label'],
    num_rows: 6001
})

In [24]:
# Get tokenizer
tokenizer = CustomTokenizer(model_id="bert-base-cased")

# Tokenize the data (tweets)
dataset = tokenizer(docs=dataset, column="tweet")
dataset = dataset.rename_column("brand_product_name_label", "labels")
dataset = DatasetDict({"train": dataset})
dataset

Map:   0%|          | 0/6001 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tweet', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6001
    })
})

In [None]:
# Get trainer
trainer = CustomTrainer(
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    model_id="bert-base-cased",
    num_classes=NUM_CATEGORIES,
    output_dir=r"D:\Priyanshu\Wysa\checkpts\brand_product_recog",
    num_train_epochs=10,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


In [None]:
trainer()