# Unimodal Text

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img align="left" src="Images/unimodal-Text.png" width="600" alt="Unimodal text model">

## Imports and set-up

In [1]:
from Utils import load_data, preprocessing, model_performances
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import json
from sklearn.metrics import classification_report

In [3]:
# Seed NumPy's and TensorFlow's global random generators with one shared value.
# The fold loops further down additionally call preprocessing.set_seed(iteration) once per fold.
seed = 222
np.random.seed(seed) # numpy seed
tf.random.set_seed(seed) # works for all devices (CPU and GPU)

In [5]:
# ________________________________________Utils ___________________________________________________
# Output locations for the 10-fold run on the training data.
# exist_ok=True makes directory creation idempotent (no separate exists() check,
# no check-then-create race), so the cell can be re-run safely.
os.makedirs('./Unimodal/predictions', exist_ok=True)
os.makedirs('./Unimodal/performances', exist_ok=True)

path_models = './Unimodal/models'
file_out = './Unimodal/performances/Text_results_10Fold.txt'
predictions_csv_path = './Unimodal/predictions/Text_pred_10Fold.csv'

# Start from an empty report: mode 'w' creates the file or truncates an existing one,
# and the context manager guarantees the handle is closed.
with open(file_out, 'w'):
    pass

label_column = "misogynous"   # column holding the binary target
input_columns = ['text_USE']  # feature column(s) handed to the preprocessing helpers
threshold = 0.5               # scores strictly above this are reported as class 1

In [6]:
# Model / training hyper-parameters shared by both fold loops.
embed_size = 512  # 512-length array with Universal Sentence Encoder algorithm; also passed as the model's input_shape
batch_size = 64  # batch size handed to model.predict in the fold loops
epochs = 100  # training-epoch budget (the `epochs` argument of get_trained_model)

## Load Data

In [7]:
# ________________________________________load training data ___________________________________________________
# Load the training memes through the project helper.
meme_df = load_data.load_training_data()

# Store what use_preprocessing returns for the 'Text Transcription' column in a new 'text_USE' column
# (presumably Universal Sentence Encoder embeddings of length embed_size — confirm in Utils.preprocessing).
meme_df['text_USE'] = preprocessing.use_preprocessing(meme_df, 'Text Transcription')

In [8]:
# Preview the first rows, including the text_USE column added in the previous cell.
meme_df.head()

Unnamed: 0,file_name,misogynous,Text Transcription,text_USE
0,5827.jpg,0,"*Gets on Pornhub* Woman on Porn Ad: ""Are you s...","[0.04223586246371269, -0.028678087517619133, 0..."
1,2454.jpg,0,When your high school girlfriend finally turns...,"[-0.016387682408094406, -0.004176544025540352,..."
2,6492.jpg,1,Me every time I refuse to objectify women I ca...,"[0.011072046123445034, -0.023999786004424095, ..."
3,2054.jpg,1,Verizon Q Search News r/kotakuinaction2 There ...,"[-0.02618846856057644, -0.034125760197639465, ..."
4,5388.jpg,0,me watching a horror movie NO DUDE WASN'T ME s...,"[-0.0059230634942650795, 0.006091502029448748,..."


## 10 Fold on training data

In [9]:
# 10-fold split of the training data; folds are taken in row order (no shuffling).
kf = KFold(n_splits=10, shuffle=False)

# Fold counter plus the empty accumulators that the loop cell below grows fold by fold.
iteration = 0
ids, real_values, predict_values = np.empty(0), np.empty(0), np.empty(0)


In [None]:
# 10-fold cross-validation on the training data: every fold is held out once, a fresh
# model is trained on the remaining rows, and the held-out predictions are accumulated
# in predict_values / real_values / ids for the overall report written by a later cell.
for train_index, test_index in kf.split(meme_df):  # one held-out fold per pass
    # Seed with the 0-based fold index before the split and the model are built.
    preprocessing.set_seed(iteration)
    x_train, y_train, x_val, y_val, x_test, y_test = preprocessing.elaborate_data_10fold(
        meme_df,
        train_index,
        test_index,
        iteration,
        input_columns,
        label_column,
    )

    model, history = model_performances.get_trained_model(
        x_train,
        y_train,
        x_val,
        y_val,
        input_shape=embed_size,
        activation_function='LeakyReLU',
        neurons=embed_size // 2,  # integer layer width; `/` would hand over the float 256.0
        dropout=0.2,
        epochs=epochs,  # use the configured value instead of repeating the literal
    )

    iteration = iteration + 1  # from here on: 1-based number of the fold just trained

    # un-comment to save each model
    #model_name = os.path.join(path_models, 'unimodal_text_' + str(iteration))
    #model.save(model_name)

    # predict the held-out fold
    pred = model.predict(x_test, batch_size=batch_size)

    predict_values = np.append(predict_values, pred)
    real_values = np.append(real_values, y_test)
    ids = np.append(ids, meme_df.iloc[test_index]['file_name'].tolist())

    # Select the columns by name and take an explicit copy, so that adding the score
    # column neither depends on column positions nor writes into a slice of meme_df.
    result_df = meme_df.iloc[test_index][['file_name', 'misogynous']].copy()
    result_df['score_col'] = pred

    # append this fold's metrics to the report; `with` closes the file even if a metric call raises
    predicted_labels = (result_df['score_col'] > threshold).astype(int).values
    with open(file_out, "a+") as file:
        file.write('\n\nITERAZIONE ' + str(iteration) + '\n')
        file.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'score_col', 'misogynous', threshold)))
        file.write('\n')
        file.write(classification_report(result_df['misogynous'].values, predicted_labels, target_names=['not_mis','mis']))

In [15]:
# Layer summary of `model` as left behind by the fold loop, i.e. the network trained in its last pass.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 131,585
Trainable params: 131,585
Non-trainable params: 0
_________________________________________________________________


In [14]:
# results dataframe, save predictions
# One row per held-out sample, collected across all folds by the loop above.
result_df = pd.DataFrame({'id': ids, 'real': real_values.astype(int), 'pred': predict_values})
result_df.to_csv(predictions_csv_path, index=False, sep='\t')

# Overall metrics _ write on file
# Scores strictly above `threshold` count as the positive class.
predicted_labels = (result_df['pred'] > threshold).astype(int).values

# The context manager closes the report even if one of the metric helpers raises.
with open(file_out, "a+") as file:
    file.write('\n\n10 Fold Results ' + str(iteration) + '\n')
    file.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'pred', 'real', threshold)))
    file.write('\n')
    file.write(classification_report(result_df['real'].values, predicted_labels, target_names=['not_mis','mis']))
    file.write('\n AUC:')
    file.write(str(model_performances.compute_auc(result_df['real'].values, result_df['pred'].values)))

In [None]:
# Learning curves (train vs. dev) taken from `history`, which the fold loop rebinds on
# every pass, so these are the curves of the model trained last.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'dev'], loc='upper left')
    plt.show()

## 10 fold on Test data

In [17]:
# ________________________________________Utils ___________________________________________________
# Output locations for the evaluation on the test set. These rebind path_models,
# file_out and predictions_csv_path, which earlier pointed at the cross-validation outputs.
# exist_ok=True makes directory creation idempotent, so the cell can be re-run safely.
os.makedirs('./Unimodal/predictionsTest', exist_ok=True)
os.makedirs('./Unimodal/performancesTest', exist_ok=True)

path_models = './Unimodal/modelsTest'
file_out = './Unimodal/performancesTest/Text_results_10Fold.txt'
predictions_csv_path = './Unimodal/predictionsTest/Text_pred_10Fold.csv'

# Start from an empty report: mode 'w' creates the file or truncates an existing one,
# and the context manager guarantees the handle is closed.
with open(file_out, 'w'):
    pass

In [18]:
# Load Test and preprocessing
# Same two steps as for the training data: load the frame, then add the 'text_USE' column.
test_df = load_data.load_test_data()
test_df['text_USE'] = preprocessing.use_preprocessing(test_df, 'Text Transcription')

# NOTE: this rebinds x_test / y_test, which the cross-validation loop earlier used for its
# held-out folds; from here on they are built from the whole of test_df.
x_test, y_test = preprocessing.elaborate_input(test_df, input_columns, label_column)


In [19]:
# Same 10-fold split of the training data (row order, no shuffling); in this phase each
# fold only serves as validation data while predictions are made on the test set.
kf = KFold(n_splits=10, shuffle=False)

# Reset the fold counter and the accumulators before the test-phase loop fills them.
iteration = 0
ids, real_values, predict_values = np.empty(0), np.empty(0), np.empty(0)

In [None]:
# Train one model per fold of the training data (the fold itself is the validation set)
# and score the fixed test set with each of them. Predictions of all ten models are
# stacked in predict_values / real_values / ids for the overall report written later.
for train_index, val_index in kf.split(meme_df):  # split into train and validation
    # Seed with the 0-based fold index before the model is built.
    preprocessing.set_seed(iteration)
    x_train, y_train = preprocessing.elaborate_input(meme_df.iloc[train_index, :], input_columns, label_column)
    x_val, y_val = preprocessing.elaborate_input(meme_df.iloc[val_index, :], input_columns, label_column)

    model, history = model_performances.get_trained_model(
        x_train,
        y_train,
        x_val,
        y_val,
        input_shape=embed_size,
        activation_function='LeakyReLU',
        neurons=embed_size // 2,  # integer layer width; `/` would hand over the float 256.0
        dropout=0.2,
        epochs=epochs,  # use the configured value instead of repeating the literal
    )

    iteration = iteration + 1  # from here on: 1-based number of the model just trained

    # score the test set (x_test / y_test come from test_df) with this fold's model
    pred = model.predict(x_test, batch_size=batch_size)

    predict_values = np.append(predict_values, pred)
    real_values = np.append(real_values, y_test)
    ids = np.append(ids, test_df['file_name'].tolist())

    result_df = test_df[['file_name', 'misogynous']].copy()
    result_df['score_col'] = pred

    # append this model's metrics to the report; `with` closes the file even if a metric call raises
    predicted_labels = (result_df['score_col'] > threshold).astype(int).values
    with open(file_out, "a+") as file:
        file.write('\n\nITERAZIONE ' + str(iteration) + '\n')
        file.write(json.dumps(model_performances.compute_confusion_rates(result_df, 'score_col', 'misogynous', threshold)))
        file.write('\n\n')
        file.write(classification_report(result_df['misogynous'].values, predicted_labels, target_names=['not_mis','mis']))


In [None]:
# Learning curves (train vs. dev) taken from `history`, which the test-phase loop rebinds
# on every pass, so these are the curves of the model trained last.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'dev'], loc='upper left')
    plt.show()

In [23]:
# results dataframe, save predictions
# One row per (model, test sample) pair: the test set appears once for every fold model.
result_df = pd.DataFrame({'id': ids, 'real': real_values.astype(int), 'pred': predict_values})
result_df.to_csv(predictions_csv_path, index=False, sep='\t')

# Overall metrics _ write on file
# Compute the confusion rates once and reuse them for both the report and the cell output.
confusion_rates = model_performances.compute_confusion_rates(result_df, 'pred', 'real', threshold)
# Scores strictly above `threshold` count as the positive class.
predicted_labels = (result_df['pred'] > threshold).astype(int).values

# The context manager closes the report even if one of the metric helpers raises.
with open(file_out, "a+") as file:
    file.write('\n\n10 Fold Results ' + str(iteration) + '\n')
    file.write(json.dumps(confusion_rates))
    file.write('\n\n')
    file.write(classification_report(result_df['real'].values, predicted_labels, target_names=['not_mis','mis']))
    file.write('\n AUC:')
    file.write(str(model_performances.compute_auc(result_df['real'].values, result_df['pred'].values)))

# Last expression of the cell: display the overall confusion rates.
confusion_rates

{'pos_label': 1,
 'tpr': 0.848,
 'tnr': 0.4294,
 'fpr': 0.5706,
 'fnr': 0.15200000000000002,
 'precision': 0.5977724517129565,
 'recall': 0.848,
 'accuracy': 0.6387,
 'f1': 0.7012321177540726,
 'auc': 0.725758}