# Image Tag-based

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img align="left" src="Images/unimodal-tags.png" width="600">

## Imports and set-up

In [1]:
from Utils import load_data, preprocessing, model_performances
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import json
from sklearn.metrics import classification_report

In [2]:
# Seed the global random number generators used by this notebook.
seed = 222
tf.random.set_seed(seed)  # TensorFlow seed: works for all devices (CPU and GPU)
np.random.seed(seed)  # numpy seed

In [3]:
# ________________________________________Utils ___________________________________________________
# Output locations for the 10-fold run on the training data.
path_models = './Unimodal/models'
file_out = './Unimodal/performances/Tags_results_10Fold.txt'
predictions_csv_path = './Unimodal/predictions/Tags_pred_10Fold.csv'

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check.
for out_dir in ('./Unimodal/predictions', './Unimodal/performances'):
    os.makedirs(out_dir, exist_ok=True)

# Start from an empty report: mode 'w' creates the file or erases its content,
# and the context manager closes the handle even if an error occurs.
with open(file_out, 'w'):
    pass

# Column holding the binary label, model input column(s) and decision threshold
# applied to the predicted scores.
label_column = "misogynous"
input_columns = ['clarifai_USE']
threshold = 0.5

In [4]:
# Training settings read by the cells below.
epochs = 100
batch_size = 64
embed_size = 512  # length of the array produced by the Universal Sentence Encoder algorithm

In [5]:
# Read the category tags used to describe the images.
# The parsing keeps the text before the first "], " on a line, drops quotes and the
# surrounding brackets, and splits on ", " (presumably the file stores a Python-style
# list such as "['animal', 'broom', ...]" — confirm against Data/Categories.txt).
identity_tag_path = './Data/Categories.txt'
tmp = []  # name read by a later cell; stays empty when the file has no content
with open(identity_tag_path, 'r') as fd:
    for line in fd:
        stripped = line.strip()
        if not stripped:
            # a blank (e.g. trailing) line must not replace the parsed tags with ['']
            continue
        # trimming first lets strip('][') remove the closing bracket even when the
        # line ends with a newline; as before, the last non-blank line wins
        tmp = stripped.lower().split('], ')[0].replace("'", "").strip('][').split(', ')

In [6]:
# ________________________________________load training data ___________________________________________________
# Load the training memes through the project helper and keep them as meme_df.
# NOTE(review): the same call is repeated in a later cell (In [17]); one of the two
# loads looks redundant — confirm and drop one.
meme_df = load_data.load_clarifai_training_data()

Load the list of categories and compute their embeddings.

In [7]:
# One row per category tag, stored in a single 'tag' column.
category_embedding = pd.DataFrame({'tag': tmp})
category_embedding

Unnamed: 0,tag
0,animal
1,broom
2,car
3,cartoon
4,cat
5,dog
6,child
7,crockery
8,dishwasher
9,kitchen


In [8]:
# Add a 'tags_USE' column computed by the project helper from the 'tag' column.
# NOTE(review): the displayed values are float vectors, presumably Universal Sentence
# Encoder embeddings — confirm in preprocessing.use_preprocessing.
category_embedding['tags_USE'] = preprocessing.use_preprocessing(category_embedding, 'tag')
category_embedding

Unnamed: 0,tag,tags_USE
0,animal,"[0.06068221479654312, -0.02071821503341198, 0...."
1,broom,"[-0.016821589320898056, -0.01808616705238819, ..."
2,car,"[0.05287287384271622, -0.06116773560643196, 0...."
3,cartoon,"[0.05937027931213379, 0.012805067002773285, 0...."
4,cat,"[0.047420237213373184, -0.07901598513126373, 0..."
5,dog,"[0.01757180131971836, -0.051691826432943344, 0..."
6,child,"[0.050464827567338943, -0.025343136861920357, ..."
7,crockery,"[-0.03392157331109047, -0.05550631135702133, -..."
8,dishwasher,"[-0.03311099857091904, -0.07485399395227432, 0..."
9,kitchen,"[-0.02044772543013096, -0.037095051258802414, ..."


Load the data. For each meme, the embedding is the mean of the embeddings of its tags.

In [17]:
# ________________________________________load training data ___________________________________________________
# Load the training memes through the project helper and keep them as meme_df.
# NOTE(review): this repeats the load done in an earlier cell (In [6]); confirm whether
# both are needed.
meme_df = load_data.load_clarifai_training_data()


In [18]:
# clarifai_USE is the mean of the tags embeddings
# NOTE(review): the zero placeholder is overwritten by the following line; it looks
# redundant unless meme_tag_embedding expects the column to exist — confirm before removing.
meme_df['clarifai_USE']=0
# The helper receives the frame and the name of the column holding the tags ('clarifai').
meme_df['clarifai_USE']=preprocessing.meme_tag_embedding(meme_df, 'clarifai')


In [19]:
# Display the training frame, now including the clarifai_USE column.
meme_df

Unnamed: 0,file_name,misogynous,Text Transcription,clarifai,clarifai_USE
0,5827.jpg,0,"*Gets on Pornhub* Woman on Porn Ad: ""Are you s...",cartoon child man,"[0.05812954778472582, -0.019499183321992557, 0..."
1,2454.jpg,0,When your high school girlfriend finally turns...,cartoon man woman,"[0.05792989581823349, -0.02607914184530576, 0...."
2,6492.jpg,1,Me every time I refuse to objectify women I ca...,woman,"[0.04986587166786194, -0.04508301243185997, -0..."
3,2054.jpg,1,Verizon Q Search News r/kotakuinaction2 There ...,cartoon man woman,"[0.05792989581823349, -0.02607914184530576, 0...."
4,5388.jpg,0,me watching a horror movie NO DUDE WASN'T ME s...,animal cartoon,"[0.060026247054338455, -0.003956574015319347, ..."
...,...,...,...,...,...
9995,6179.jpg,1,LMAObama.com GET UP OFFA THAT THING,woman,"[0.04986587166786194, -0.04508301243185997, -0..."
9996,11354.jpg,1,QUO 74 BUCKEYES THE OHIO STATE Where the hotti...,woman,"[0.04986587166786194, -0.04508301243185997, -0..."
9997,3073.jpg,1,FEEL LIKE Doin' Some Plowin? memecenter.com Me...,woman nudity,"[0.04969446733593941, -0.03920779190957546, -0..."
9998,13096.jpg,1,BLOND WONDERING WHY SHE ONLY HAS THREE SISTERS...,woman,"[0.04986587166786194, -0.04508301243185997, -0..."


## 10 Fold on training data

In [12]:
# ________________________________________train model on training data 10Fold________________________________________
# Accumulators extended by the fold loop: fold counter, file names, predicted
# scores and true labels.
iteration = 0
ids = np.array([])
predict_values = np.array([])
real_values = np.array([])

# 10 folds taken in row order (no shuffling).
kf = KFold(n_splits=10, shuffle=False)


In [None]:
# 10-fold cross-validation on the training memes: train one model per fold, score the
# held-out fold, accumulate the predictions and append per-fold metrics to file_out.
for train_index, test_index in kf.split(meme_df):  # split into train and held-out test fold
    preprocessing.set_seed(iteration)
    x_train, y_train, x_val, y_val, x_test, y_test = preprocessing.elaborate_data_10fold(
        meme_df,
        train_index,
        test_index,
        iteration,
        input_columns,
        label_column)
    model, history = model_performances.get_trained_model(
        x_train,
        y_train,
        x_val,
        y_val,
        input_shape=embed_size,
        activation_function='LeakyReLU',
        neurons=embed_size // 2,  # integer layer width (embed_size / 2 would be a float)
        dropout=0.2,
        epochs=epochs)  # configured in the hyper-parameter cell instead of a second literal

    iteration = iteration + 1

    # make prediction on the held-out fold; flattened to one score per row
    pred = np.ravel(model.predict(x_test, batch_size=batch_size))

    # explicit copy so that adding the score column does not write into a slice of meme_df
    result_df = meme_df.iloc[test_index][['file_name', label_column]].copy()
    result_df['score_col'] = pred

    predict_values = np.append(predict_values, pred)
    real_values = np.append(real_values, y_test)
    ids = np.append(ids, result_df['file_name'].tolist())

    # write on file; the context manager closes the report even if a metric call raises
    predicted_labels = (result_df['score_col'] > threshold).astype(int).values
    with open(file_out, "a+") as report:
        report.write('\n\nITERAZIONE ' + str(iteration) + '\n')
        report.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'score_col', label_column, threshold)))
        report.write('\n')
        report.write(classification_report(result_df[label_column].values, predicted_labels, target_names=['not_mis','mis']))

In [15]:
# Print the layer summary of `model`, which the fold loop reassigns on every iteration.
# NOTE(review): after a top-to-bottom run this is the model of the last fold — the
# execution counts in this notebook are out of order, so confirm on a fresh run.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 131,585
Trainable params: 131,585
Non-trainable params: 0
_________________________________________________________________


In [14]:
# results dataframe, save predictions
# One row per prediction accumulated by the fold loop: file name, true label, score.
result_df = pd.DataFrame({'id': ids, 'real': real_values.astype(int), 'pred': predict_values})
result_df.to_csv(predictions_csv_path, index=False, sep='\t')

# Overall metrics _ write on file
# The context manager closes the report even if one of the metric calls raises.
predicted_labels = (result_df['pred'] > threshold).astype(int).values
with open(file_out, "a+") as report:
    report.write('\n\n10 Fold Results ' + str(iteration) + '\n')
    report.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'pred', 'real', threshold)))
    report.write('\n')
    report.write(classification_report(result_df['real'].values, predicted_labels, target_names=['not_mis','mis']))
    report.write('\n AUC:')
    report.write(str(model_performances.compute_auc(result_df['real'].values, result_df['pred'].values)))

In [None]:
# Training curves from `history` (the object returned by the most recent training call):
# one figure for accuracy, then one for loss, each comparing train and dev values per epoch.
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, dev_key, figure_title in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[dev_key])
    plt.title(figure_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'dev'], loc='upper left')
    plt.show()

## 10 fold on Test data

In [8]:
# ________________________________________Utils ___________________________________________________
# Output locations for the evaluation on the test data; these rebind the names used
# by the training-data run above.
path_models = './Unimodal/modelsTest'
file_out = './Unimodal/performancesTest/Tags_results_10Fold.txt'
predictions_csv_path = './Unimodal/predictionsTest/Tags_pred_10Fold.csv'

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check.
for out_dir in ('./Unimodal/predictionsTest', './Unimodal/performancesTest'):
    os.makedirs(out_dir, exist_ok=True)

# Start from an empty report: mode 'w' creates the file or erases its content,
# and the context manager closes the handle even if an error occurs.
with open(file_out, 'w'):
    pass

In [9]:
# Load Test and preprocessing
# The test memes are loaded through the project helper and kept as test_df.
test_df = load_data.load_clarifai_test_data()

In [11]:
# clarifai_USE is the mean of the tags embeddings
# NOTE(review): the zero placeholder is overwritten by the following line; it looks
# redundant unless meme_tag_embedding expects the column to exist — confirm before removing.
test_df['clarifai_USE']=0
# The helper receives the frame and the name of the column holding the tags ('clarifai').
test_df['clarifai_USE']=preprocessing.meme_tag_embedding(test_df, 'clarifai')
test_df

Unnamed: 0,file_name,Text Transcription,misogynous,shaming,stereotype,objectification,violence,clarifai,clarifai_USE
0,15190.jpg,ASIAN GIRLS GMPIREHELENAHARPER.DEVIANTART.COM ...,1,0,1,1,0,woman,"[0.04986587166786194, -0.04508301243185997, -0..."
1,15986.jpg,That Guy Really Likes This Show. VERY DEMOTIVA...,1,0,1,1,0,child man woman,"[0.05496141190330187, -0.038795209800203644, 0..."
2,15005.jpg,A MAN WITH DREAMS... NEEDS A WOMAN WITH VISION,0,0,0,0,0,man woman,"[0.05720970407128334, -0.045521246269345284, 0..."
3,15923.jpg,CLAIMS TO LOVE CLEANING DOESN'T EVEN CLEAN DOT...,0,0,0,0,0,cartoon,"[0.05937027931213379, 0.012805067002773285, 0...."
4,15971.jpg,Women Women Women 0 Respect,1,0,1,0,1,man woman,"[0.05720970407128334, -0.045521246269345284, 0..."
...,...,...,...,...,...,...,...,...,...
995,15964.jpg,"YEAH, I HAVE A GIRLFRIEND. BUT SHE'S BEING A B...",1,0,1,1,0,man,"[0.06455353647470474, -0.0459594801068306, 0.0..."
996,15990.jpg,PRAY FOR MY GIRLFRIEND SHE'S USED THE DOG SNAP...,1,1,1,1,1,woman,"[0.04986587166786194, -0.04508301243185997, -0..."
997,15461.jpg,HOW TO HAVE FUN WITH A MINOR Fung Step 3-> Ste...,1,0,0,1,1,child woman,"[0.05016534961760044, -0.03521307464689016, 0...."
998,16018.jpg,this could be us but you ain't here Calvin Kle...,0,0,0,0,0,man woman,"[0.05720970407128334, -0.045521246269345284, 0..."


In [14]:
# Build the model inputs and labels for the test set from the configured input
# column(s) and label column; x_test / y_test are reused for every model in the loop below.
x_test, y_test = preprocessing.elaborate_input(test_df, input_columns, label_column)


In [15]:
# ________________________________________train model on training data 10Fold________________________________________
# Reset the accumulators extended by the loop below: fold counter, file names,
# predicted scores and true labels.
iteration = 0
ids = np.array([])
predict_values = np.array([])
real_values = np.array([])

# 10 folds taken in row order (no shuffling).
kf = KFold(n_splits=10, shuffle=False)

In [None]:
# Train one model per fold of the training memes (the held-out fold serves as the
# validation set), score the whole test set with each model, accumulate the
# predictions and append per-model metrics to file_out.
for train_index, val_index in kf.split(meme_df):  # split into train and validation
    preprocessing.set_seed(iteration)
    x_train, y_train = preprocessing.elaborate_input(meme_df.iloc[train_index, :], input_columns, label_column)
    x_val, y_val = preprocessing.elaborate_input(meme_df.iloc[val_index, :], input_columns, label_column)

    model, history = model_performances.get_trained_model(
        x_train,
        y_train,
        x_val,
        y_val,
        input_shape=embed_size,
        activation_function='LeakyReLU',
        neurons=embed_size // 2,  # integer layer width (embed_size / 2 would be a float)
        dropout=0.2,
        epochs=epochs)  # configured in the hyper-parameter cell instead of a second literal
    iteration = iteration + 1

    # make prediction on the test data; flattened to one score per row
    pred = np.ravel(model.predict(x_test, batch_size=batch_size))

    predict_values = np.append(predict_values, pred)
    real_values = np.append(real_values, y_test)
    ids = np.append(ids, test_df['file_name'].tolist())

    result_df = test_df[['file_name', 'misogynous']].copy()
    result_df['score_col'] = pred

    # write on file; the context manager closes the report even if a metric call raises
    predicted_labels = (result_df['score_col'] > threshold).astype(int).values
    with open(file_out, "a+") as report:
        report.write('\n\nITERAZIONE ' + str(iteration) + '\n')
        report.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'score_col', 'misogynous', threshold)))
        report.write('\n')
        report.write(classification_report(result_df['misogynous'].values, predicted_labels, target_names=['not_mis','mis']))


In [None]:
# Training curves from `history` (the object returned by the most recent training call):
# one figure for accuracy, then one for loss, each comparing train and dev values per epoch.
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, dev_key, figure_title in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[dev_key])
    plt.title(figure_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'dev'], loc='upper left')
    plt.show()

In [22]:
# results dataframe, save predictions
# One row per (model, test meme) prediction accumulated by the loop: file name, true label, score.
result_df = pd.DataFrame({'id': ids, 'real': real_values.astype(int), 'pred': predict_values})
result_df.to_csv(predictions_csv_path, index=False, sep='\t')

# Overall metrics _ write on file
# The confusion rates are computed once and reused for both the report and the cell
# output; the context manager closes the report exactly once, even on error.
overall_rates = model_performances.compute_confusion_rates(result_df, 'pred', 'real', threshold)
predicted_labels = (result_df['pred'] > threshold).astype(int).values
with open(file_out, "a+") as report:
    report.write('\n\n10 Fold Results ' + str(iteration) + '\n')
    report.write(json.dumps(overall_rates))
    report.write('\n')
    report.write(classification_report(result_df['real'].values, predicted_labels, target_names=['not_mis','mis']))
    report.write('\n AUC:')
    report.write(str(model_performances.compute_auc(result_df['real'].values, result_df['pred'].values)))
overall_rates

{'tpr': 0.708,
 'tnr': 0.4936,
 'fpr': 0.5064,
 'fnr': 0.29200000000000004,
 'precision': 0.5830039525691699,
 'recall': 0.708,
 'accuracy': 0.6008,
 'f1': 0.6394508670520231,
 'auc': 0.6521670799999999}