In [2]:
import time
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix


In [3]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli
# checkpoint; shared by every find_pet() call in this notebook.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [14]:
pet_labels = ['Dog', 'Cat', 'Fish', 'Rabbit|Rat|Hamster|guinea pigs|chinchillas|gerbils|mice', 'Chameleons|Anoles|Geckos|Bearded Dragons|Snail', 'Bird']
def find_pet(tweet, verbose=True):
    """Score a tweet against every pet label with the zero-shot classifier.

    The two long '|'-separated candidate labels are reported under the
    shorter names 'Small Animals' and 'Reptiles'; all other labels keep
    their own name.

    Parameters
    ----------
    tweet : str
        Text handed to the module-level ``classifier``.
    verbose : bool, default True
        When True, print the resulting dictionary (the historical behaviour).
        Pass False when applying the function over a whole DataFrame to
        avoid one printed line per row.

    Returns
    -------
    dict
        Category name -> score rounded to 2 decimals, in the order the
        classifier returned the labels.
    """
    class_to_category = {
        'Rabbit|Rat|Hamster|guinea pigs|chinchillas|gerbils|mice': 'Small Animals',
        'Chameleons|Anoles|Geckos|Bearded Dragons|Snail': 'Reptiles'
    }
    # multi_label=True is forwarded to the pipeline as-is.
    pet = classifier(tweet, pet_labels, multi_label=True)
    # Pair each returned label with its score, renaming the grouped labels.
    pet_scores_dict = {
        class_to_category.get(label, label): round(score, 2)
        for label, score in zip(pet['labels'], pet['scores'])
    }
    if verbose:
        print(pet_scores_dict)
    return pet_scores_dict
    
def get_selected_pets(scores_dict, threshold=0.4):
    """Return the categories whose score exceeds ``threshold``.

    Parameters
    ----------
    scores_dict : dict or None or NaN
        Mapping of category name -> score. ``None`` and scalar missing
        values (as detected by ``pd.isnull``) are treated as "no pets".
    threshold : float, default 0.4
        Strict lower bound: a category is kept only when ``score > threshold``.
        Exposed as a keyword so callers can sweep it, e.g.
        ``series.apply(get_selected_pets, threshold=0.6)``.

    Returns
    -------
    str
        Selected category names sorted alphabetically and joined with
        ``', '``; the empty string when nothing qualifies.
    """
    if scores_dict is None or pd.isnull(scores_dict):
        return ''
    selected = sorted(
        label for label, score in scores_dict.items() if score > threshold
    )
    return ', '.join(selected)
    
def get_hashtags(tweet):
    """Collect the whitespace-separated tokens of ``tweet`` that begin with '#'.

    Returns the matching tokens in their original order joined with ', ',
    or an empty string when there are none.
    """
    hashtags = []
    for token in tweet.split():
        # split() never yields empty tokens, so the first character always exists
        if token[0] == '#':
            hashtags.append(token)
    return ', '.join(hashtags)

def multilabeled_one_hot_encoding(pets):
    """Turn a comma-separated pet string into a 0/1 flag per known category.

    The returned dict always has the keys 'cat', 'dog', 'fish',
    'small animals', 'reptiles', 'bird' in that order. Names are matched
    after trimming whitespace and lower-casing; unknown names are ignored.
    ``None`` or a missing value yields all zeros.
    """
    categories = ('cat', 'dog', 'fish', 'small animals', 'reptiles', 'bird')
    flags = dict.fromkeys(categories, 0)
    if pets is not None and not pd.isnull(pets):
        for raw_name in pets.split(','):
            name = raw_name.strip().lower()
            if name in flags:
                flags[name] = 1
    return flags

In [5]:
# pipe = pipeline(model="facebook/bart-large-mnli")
# pipe("Buy petsmart futures quick! Tsunami of cats incoming!",
#     candidate_labels=['Dog', 'Cat', 'Fish', 'Rabbit', 'Reptile', 'Parrot'],
# )

In [6]:
# Smoke-test find_pet on a single example tweet before running it on the full dataset.
find_pet("Thanks, friend. That's a great idea!! I'm looking for rabbit poo as we speak!!")

{'Small Animals': 0.04, 'Bird': 0.0, 'Cat': 0.0, 'Dog': 0.0, 'Reptiles': 0.0, 'Fish': 0.0}


{'Small Animals': 0.04,
 'Bird': 0.0,
 'Cat': 0.0,
 'Dog': 0.0,
 'Reptiles': 0.0,
 'Fish': 0.0}

In [7]:
# CSV holding the tweets; it is read below and later written back to via df.to_csv(file_name).
file_name = "Petsmart_Data.csv"

In [8]:
# Load the tweet dataset into a DataFrame.
df = pd.read_csv(file_name)

In [10]:
# Keep only rows that actually have tweet text; find_pet/get_hashtags need a string.
# dropna(subset=...) filters by value rather than by index label, and rebinding
# (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=["Tweet"])

In [11]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Date,Tweet,Location,Retweet Count,Favorite Count,Total Tweets Count,Sentiment Score,Sentiment,Positive Sentiment Score,Neutral Sentiment Score,Negative Sentiment Score,review_without_stopwords,review_lemmatized,final_tweets,final_tweets_2,final_tweets_3,Topic
0,0,1646622322739015692,2023-04-13,"Quick, hide it before it becomes the next rage...",Connecticut,0,0,1,[ 0.17140907 0.7923502 -0.8316461 ],Neutral,-0.831646,0.79235,0.171409,"Quick, hide becomes next rage Petsmart!",quick hide becomes next rage petsmart!,quick hide becomes next rage petsmart!,quick hide becomes next rage petsmart!,quick hide becomes next rage petsmart!,0
1,1,1646618278700171264,2023-04-13,"I'm at in Pittsburgh, PA",,0,0,1,[-1.7575119 2.3573275 -0.5046378],Neutral,-0.504638,2.357327,-1.757512,"I'm Pittsburgh, PA",i'm pittsburgh pa,i'm pittsburgh pa,i'm pittsburgh,i'm pittsburgh,-1
2,2,1646614603466702863,2023-04-13,harry‚Äôs house shirt sighting in petsmart hello,,0,0,1,[-2.114253 1.7834618 0.2936031],Neutral,0.293603,1.783462,-2.114253,harry‚Äôs house shirt sighting petsmart hello,harry‚Äôs house shirt sighting petsmart hello,harry‚Äôs house shirt sighting hello,harry‚Äôs house shirt sighting hello,harry‚Äôs house shirt sighting hello,-1
3,3,1646613693956857856,2023-04-13,look at this birb i saw yesterday at petsmart....,,0,0,1,[ 2.0628967 -0.26852918 -1.7350017 ],Negative,-1.735002,-0.268529,2.062897,look birb saw yesterday petsmart. he‚Äôs silly,look birb saw yesterday petsmart he‚Äôs silly,look birb saw yesterday he‚Äôs silly,look birb saw yesterday he‚Äôs silly,look birb saw yesterday he‚Äôs silly,6
4,4,1646613038139682817,2023-04-13,üìöüêæ #Mixbook &amp; #PetSmart partner to create ...,California,0,0,1,[-3.38604 0.24371769 3.6288638 ],Positive,3.628864,0.243718,-3.38604,üìöüêæ #Mixbook &amp; #PetSmart partner create exc...,üìöüêæ #mixbook &amp #petsmart partner create excl...,üìöüêæ #mixbook &amp partner create exclusive pet ...,#mixbook &amp partner create exclusive pet col...,#mixbook &amp partner create exclusive pet col...,0


In [None]:
# Start the wall-clock timer; the matching end_time is taken in the next cell.
start_time = time.time()
# Run the zero-shot classifier once per tweet and keep the score dictionaries.
df["Pet Classification Dictionary"] = df["Tweet"].apply(find_pet)

In [15]:
# Reduce each score dictionary to a comma-separated list of selected pets.
df["Pet"] = df["Pet Classification Dictionary"].apply(get_selected_pets)
# Extract the '#'-prefixed tokens from every tweet.
df["Hashtags"] = df["Tweet"].apply(get_hashtags)
# start_time was set in the previous cell, so the elapsed time covers the
# classification run as well as the two column derivations above.
end_time = time.time()
print("Time taken: ",end_time-start_time)

Time taken:  1121.557848930359


In [16]:
# Empty frames to receive the one-hot encoded pet columns.
# NOTE: in the visible cells actual_pets is only referenced by commented-out code.
predicted_pets = pd.DataFrame()
actual_pets = pd.DataFrame()

In [18]:
# Expand the 'Pet' string of every row into six 0/1 columns. The target column
# names are assigned positionally, so their order must match the key order of the
# dict returned by multilabeled_one_hot_encoding (cat, dog, fish, small animals,
# reptiles, bird).
predicted_pets[['Predicted Cat','Predicted Dog','Predicted Fish','Predicted Small Animals','Predicted Reptiles','Predicted Bird']] = df['Pet'].apply(multilabeled_one_hot_encoding).apply(pd.Series)
#actual_pets[['Actual Cat','Actual Dog','Actual Fish','Actual Small Animals','Actual Reptiles','Actual Bird']] = df['Manual Labeling'].apply(multilabeled_one_hot_encoding).apply(pd.Series)

In [19]:
# Append the predicted one-hot columns to df column-wise.
# NOTE: not idempotent - re-running this cell adds the 'Predicted *' columns again.
df = pd.concat([df, predicted_pets], axis=1)

In [20]:
# Display the frame with the classification, 'Pet', 'Hashtags' and 'Predicted *' columns.
df

Unnamed: 0.1,Unnamed: 0,Id,Date,Tweet,Location,Retweet Count,Favorite Count,Total Tweets Count,Sentiment Score,Sentiment,...,Topic,Pet Classification Dictionary,Pet,Hashtags,Predicted Cat,Predicted Dog,Predicted Fish,Predicted Small Animals,Predicted Reptiles,Predicted Bird
0,0,1646622322739015692,2023-04-13,"Quick, hide it before it becomes the next rage...",Connecticut,0,0,1,[ 0.17140907 0.7923502 -0.8316461 ],Neutral,...,0,"{'Dog': 0.74, 'Cat': 0.63, 'Small Animals': 0....","Cat, Dog",,1,1,0,0,0,0
1,1,1646618278700171264,2023-04-13,"I'm at in Pittsburgh, PA",,0,0,1,[-1.7575119 2.3573275 -0.5046378],Neutral,...,-1,"{'Fish': 0.03, 'Bird': 0.02, 'Small Animals': ...",,,0,0,0,0,0,0
2,2,1646614603466702863,2023-04-13,harry‚Äôs house shirt sighting in petsmart hello,,0,0,1,[-2.114253 1.7834618 0.2936031],Neutral,...,-1,"{'Dog': 0.18, 'Cat': 0.17, 'Small Animals': 0....",,,0,0,0,0,0,0
3,3,1646613693956857856,2023-04-13,look at this birb i saw yesterday at petsmart....,,0,0,1,[ 2.0628967 -0.26852918 -1.7350017 ],Negative,...,6,"{'Bird': 0.97, 'Dog': 0.01, 'Small Animals': 0...",Bird,,0,0,0,0,0,1
4,4,1646613038139682817,2023-04-13,üìöüêæ #Mixbook &amp; #PetSmart partner to create ...,California,0,0,1,[-3.38604 0.24371769 3.6288638 ],Positive,...,0,"{'Dog': 0.43, 'Cat': 0.42, 'Small Animals': 0....","Cat, Dog","#Mixbook, #PetSmart, #Synergy, #PetLove",1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,1646299289541279744,2023-04-12,"Welp, thx to Kit I now have a mole loose somew...",USA,1,37,2,[-0.65611076 0.5590186 0.23361185],Neutral,...,0,"{'Cat': 0.1, 'Dog': 0.08, 'Small Animals': 0.0...",,,0,0,0,0,0,0
96,96,1646299270906085376,2023-04-12,I skipped the appt I had to give my dog a bath...,,1,9,2,[ 0.7008229 0.2856084 -0.88569814],Negative,...,-1,"{'Dog': 0.98, 'Bird': 0.14, 'Fish': 0.01, 'Rep...",Dog,,0,1,0,0,0,0
97,97,1646299151154413568,2023-04-12,Get an automatic feeder. PetSmart has a model ...,,0,0,1,[-1.9700232 0.6400106 1.4149035],Positive,...,1,"{'Cat': 0.71, 'Bird': 0.15, 'Fish': 0.04, 'Sma...",Cat,,1,0,0,0,0,0
98,98,1646296604041773058,2023-04-12,"We can't stop singing, ""I'd do anything."" üé∂ ‚ò∫Ô∏è...",,0,1,1,[-2.7528543 -0.38209927 3.825477 ],Positive,...,-1,"{'Bird': 0.14, 'Cat': 0.12, 'Fish': 0.04, 'Dog...",,,0,0,0,0,0,0


In [21]:
# Persist the enriched frame. index=False keeps the positional index out of the
# file; writing it and reading the file back with pd.read_csv(file_name) adds a
# new 'Unnamed: 0' column each round trip.
# NOTE: file_name is also the input path, so this overwrites the source CSV.
df.to_csv(file_name, index=False)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Pet categories evaluated here; each needs a 'Predicted <pet>' and an
# 'Actual <pet>' binary column in df.
pet_categories = ['Cat', 'Dog']

# Score every category's binary predictions against its ground-truth column.
for pet in pet_categories:
    # binary prediction vector, then the matching ground-truth vector
    y_pred = df[f'Predicted {pet}'].values
    y_true = df[f'Actual {pet}'].values

    # scalar metrics (all computed from the hard 0/1 predictions)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    # confusion matrix, as returned by sklearn's confusion_matrix
    conf_mat = confusion_matrix(y_true, y_pred)

    # emit the report as one newline-separated block
    print(
        f"Performance metrics and confusion matrix for {pet}:",
        f"Accuracy: {accuracy:.3f}",
        f"Precision: {precision:.3f}",
        f"Recall: {recall:.3f}",
        f"AUC: {auc:.3f}",
        f"Confusion matrix:\n{conf_mat}\n",
        sep="\n",
    )

Performance metrics and confusion matrix for Cat:
Accuracy: 0.793
Precision: 0.370
Recall: 0.833
AUC: 0.810
Confusion matrix:
[[63 17]
 [ 2 10]]

Performance metrics and confusion matrix for Dog:
Accuracy: 0.804
Precision: 0.634
Recall: 0.897
AUC: 0.829
Confusion matrix:
[[48 15]
 [ 3 26]]



In [23]:
from sklearn.metrics import confusion_matrix

# Pet categories for which a confusion matrix is printed.
pet_categories = ['Cat', 'Dog']

# One confusion matrix per category, built from the binary df columns.
for pet in pet_categories:
    # predictions first, then ground truth, both as plain arrays
    y_pred = df[f'Predicted {pet}'].values
    y_true = df[f'Actual {pet}'].values

    conf_mat = confusion_matrix(y_true, y_pred)

    # header line, the matrix, then a blank separator line
    print(f"Confusion matrix for {pet}:", conf_mat, "", sep="\n")

Confusion matrix for Cat:
[[63 17]
 [ 2 10]]

Confusion matrix for Dog:
[[48 15]
 [ 3 26]]



In [43]:
# Probability thresholds to sweep: 0.0, 0.1, ..., 1.0.
# Built from integers so every value is an exact one-decimal float
# (np.arange(0, 1.1, 0.1) produces artefacts such as 0.30000000000000004).
threshold_list = [step / 10 for step in range(11)]
print(threshold_list)

# One accuracy value and one AUC value is appended per threshold.
cat_accuracy = []
cat_auc = []
dog_accuracy = []
dog_auc = []
fish_accuracy = []
fish_auc = []

# Categories evaluated in the sweep. To add 'Small Animals', 'Reptiles' or
# 'Bird', create their two lists above, register them in metric_lists and in
# the result frame below, and make sure df has the matching 'Actual <pet>' column.
pet_categories = ['Cat', 'Dog', 'Fish']

# category -> (accuracy list, AUC list)
metric_lists = {
    'Cat': (cat_accuracy, cat_auc),
    'Dog': (dog_accuracy, dog_auc),
    'Fish': (fish_accuracy, fish_auc),
}

# Column order must match the key order returned by multilabeled_one_hot_encoding.
predicted_columns = ['Predicted Cat', 'Predicted Dog', 'Predicted Fish',
                     'Predicted Small Animals', 'Predicted Reptiles', 'Predicted Bird']

for threshold in threshold_list:
    # Re-select pets at this threshold. get_selected_pets must accept a
    # `threshold` keyword for this call to work.
    # NOTE: df["Pet"] and the 'Predicted *' columns are overwritten on every
    # iteration, so after the loop they reflect the last threshold (1.0).
    df["Pet"] = df["Pet Classification Dictionary"].apply(get_selected_pets, threshold=threshold)

    # one-hot encode the multi-label 'Pet' column
    df[predicted_columns] = df['Pet'].apply(multilabeled_one_hot_encoding).apply(pd.Series)

    for pet in pet_categories:
        y_pred = df[f'Predicted {pet}'].values
        y_true = df[f'Actual {pet}'].values

        # Compute each metric once; builtin round() works whether sklearn
        # returns a NumPy scalar or a plain Python float.
        pet_accuracy = round(accuracy_score(y_true, y_pred), 3)
        pet_auc = round(roc_auc_score(y_true, y_pred), 3)

        accuracy_list, auc_list = metric_lists[pet]
        accuracy_list.append(pet_accuracy)
        auc_list.append(pet_auc)
        print(f"{pet} accuracy: ", pet_accuracy)

# One row per threshold, one accuracy/AUC column pair per category.
result = pd.DataFrame({"Threshold": threshold_list,
                       "Cat Accuracy": cat_accuracy,
                       "Cat AUC": cat_auc,
                       "Dog Accuracy": dog_accuracy,
                       "Dog AUC": dog_auc,
                       "Fish Accuracy": fish_accuracy,
                       "Fish AUC": fish_auc,
                      })


[0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1.0]
Cat accuracy:  0.402
Dog accuracy:  0.478
Fish accuracy:  0.511
Cat accuracy:  0.663
Dog accuracy:  0.707
Fish accuracy:  0.935
Cat accuracy:  0.772
Dog accuracy:  0.75
Fish accuracy:  0.967
Cat accuracy:  0.848
Dog accuracy:  0.826
Fish accuracy:  0.978
Cat accuracy:  0.88
Dog accuracy:  0.859
Fish accuracy:  0.978
Cat accuracy:  0.913
Dog accuracy:  0.859
Fish accuracy:  0.978
Cat accuracy:  0.902
Dog accuracy:  0.859
Fish accuracy:  0.978
Cat accuracy:  0.902
Dog accuracy:  0.826
Fish accuracy:  0.978
Cat accuracy:  0.88
Dog accuracy:  0.826
Fish accuracy:  0.978
Cat accuracy:  0.87
Dog accuracy:  0.804
Fish accuracy:  0.967
Cat accuracy:  0.87
Dog accuracy:  0.685
Fish accuracy:  0.967


In [44]:
# Display the per-threshold accuracy / AUC table built in the previous cell.
result

Unnamed: 0,Threshold,Cat Accuracy,Cat AUC,Dog Accuracy,Dog AUC,Fish Accuracy,Fish AUC
0,0.0,0.402,0.656,0.478,0.61,0.511,0.747
1,0.1,0.663,0.771,0.707,0.776,0.935,0.644
2,0.2,0.772,0.833,0.75,0.79,0.967,0.661
3,0.3,0.848,0.842,0.826,0.845,0.978,0.667
4,0.4,0.88,0.86,0.859,0.869,0.978,0.667
5,0.5,0.913,0.808,0.859,0.869,0.978,0.667
6,0.6,0.902,0.731,0.859,0.841,0.978,0.667
7,0.7,0.902,0.696,0.826,0.771,0.978,0.667
8,0.8,0.88,0.612,0.826,0.743,0.978,0.667
9,0.9,0.87,0.5,0.804,0.699,0.967,0.5
