In [6]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import IPython
import urllib.request
from urllib.error import HTTPError
from PIL import UnidentifiedImageError
import requests
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3" # get rid of all tensorflow warnings
import tensorflow as tf
from keras.utils import to_categorical
from tqdm import tqdm
from PIL import Image
from os import listdir
from gensim.models import Word2Vec
import datetime as dt
import time
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import re
import pickle
import string
#### contraction ####
### nltk ###
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.utils import pad_sequences
#### might be wrong



CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

word2vec = Word2Vec.load("../models/w2v_150k")
vec_size = 40
max_length = 10


##Davids timestamper


def basic(original_df,keep_timestamp=False):
    """
    Transforms 'time_stamp' column from df into individual components 'year',
    'month','day','weekday','hour','minute'
    """
    df = original_df.copy()

    if 'time_stamp' not in df.columns:
        raise ValueError("df has no column named 'time_stamp'")
    df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit='s')

    df['year'] = df.time_stamp.dt.year
    df['month'] = df.time_stamp.dt.month
    df['day'] = df.time_stamp.dt.day
    df['weekday'] = df.time_stamp.dt.weekday
    df['hour'] = df.time_stamp.dt.hour
    df['minute'] = df.time_stamp.dt.minute

    if keep_timestamp is False:
        df = df.drop(columns='time_stamp')
    return df

def cyclize(original_df):
    """
    Transforms columns named 'month','day','hour','minute' into sin and cos
    cyclic values for use with machine learning models
    """
    df = original_df.copy()

    need_list = ['month','day','hour','minute']
    max_dict = {
        'month':12,
        'day': 31,
        'hour': 23,
        'minute': 59
    }

    for column in need_list:
        if column in df.columns:
            def sin_trans(number):
                return math.sin(number * (2. * math.pi / max_dict[column]))
            def cos_trans(number):
                return math.cos(number * (2. * math.pi / max_dict[column]))
            df['sin_' + column] = df[column].apply(sin_trans)
            df['cos_' + column] = df[column].apply(cos_trans)
            df = df.drop(columns=column, axis=1)

    return df

def encode_weekday(original_df, keep_weekday_column=False):
    """
    OneHotEncodes column from df column named 'weekday'
    """
    df = original_df.copy()

    enc = OneHotEncoder(handle_unknown='ignore')
    df_wkdy = pd.DataFrame(enc.fit_transform(df[['weekday']]).toarray())
    df = pd.concat([df.reset_index(), df_wkdy], axis=1)
    df = df.set_index('index')
    if keep_weekday_column==False:
        df = df.drop('weekday', axis=1)
    return df

def transform_timestamp(original_df):
    """
    Takes 'time_stamp' column from df and returns df preprocessed and
    ready for machine learning
    """
    df = original_df.copy()
    df = basic(df)
    df = cyclize(df)
    df = encode_weekday(df)
    if 'year' in df.columns:
        scaler = MinMaxScaler()
        df['year'] = scaler.fit_transform(df[['year']].copy())
    return df

###Binglins NLP


def count_len(text):
    # add a column to the dataframe, showing the length of each 'title'
    text = text.split(' ')
    length = len(text)
    return length

def preprocessing(text, contraction_mapping=CONTRACTION_MAP):

    # 1. Expand Contractions
    """Expand the contractions in English. e.g. I'm ==> I am"""
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)

    # 2. Basic Cleaning
    sentence = expanded_text.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    ## punctuation dictionary ##
    my_punc = string.punctuation
    my_punc += '—'
    my_punc += '“”’'
    ############################
    for punctuation in my_punc:
        sentence = sentence.replace(punctuation, '')
    sentence = sentence.strip()

    # 2. Remove Stopwords
    STOPWORDS = set(stopwords.words('english'))
    remove_s = " ".join([word for word in str(sentence).split() if word not in STOPWORDS])

    # 3. Word Tokenize
    word_tokens = word_tokenize(remove_s)

    # 4. Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_n = [lemmatizer.lemmatize(word,pos='n') for word in word_tokens]
    lemmatized_v = [lemmatizer.lemmatize(word,pos='v') for word in lemmatized_n]
    return lemmatized_v

def embedding(text,word2vec):
    # 5. Embedding
    word2vec,
    wv = word2vec.wv
    to_array = []
    for word in text:
        if word in wv.key_to_index:
            to_array.append(wv[word])
    return np.array(to_array)




#final preprocessor
def preprocess(data):
    df = data
    dataframe=[]

    for i in range(6):
    # get the path/directory
        folder_dir = f"../raw_data/images/category_{i}"
        for images in os.listdir(folder_dir):
            yeet = []
            path = os.path.join(folder_dir, images)
            image = Image.open(path)
            id_, size, upvotes = images.replace(".png", "").split("_")
            yeet.append(id_)
            yeet.append(size)
            arr = np.array(image)
            try:
                A,B,C = arr.shape
                if C == 4:
                    arr = arr[:,:,:3]
                    image = Image.fromarray(arr)
                    image.save(path)
                yeet.append(path)
                yeet.append(i)
                dataframe.append(yeet)
            except ValueError:
                os.remove(path)
    data_arrys =pd.DataFrame(dataframe)
    data_arrys.rename(columns={0 :'id', 1:"size", 2:"image_path", 3:"y_cat"}, inplace=True)
    #merge
    df = pd.merge(data_arrys, df)
    df = transform_timestamp(df)
    ### Add column: length of Title
    df['title_len']=df['title'].apply(count_len)
    ### Preprocessing ###
    df['preprocessing'] = df['title'].apply(lambda sentence: preprocessing(sentence))
    ## Embedding ###

    #word2vec = Word2Vec(sentences=df["preprocessing"], vector_size=vec_size, min_count=10, window=4)####CHANGE DIS
    df['embedding'] = df['preprocessing'].apply(lambda x: embedding(x,word2vec))
    ### Padding ###

    t = pad_sequences(df['embedding'], dtype='float32', padding='post', maxlen=max_length)
    tes = []
    for i in range(t.shape[0]):
        tes.append(t[i])
    df['padding'] = tes

    X_im = df["image_path"]
    df["size"] = df["size"].apply(lambda x : int(x))
    X_im_size = df["size"]
    X_timestep = df[["year", "sin_month", "cos_month", "sin_day", "cos_day", "sin_hour", "cos_hour", "sin_minute","cos_minute", 0, 1, 2, 3, 4, 5, 6]].values
    X_t_size = df["title_len"]


    X_NLP = df["padding"]
    X_NLP =[np.expand_dims(x, axis=0) for x in X_NLP]
    X_NLP = np.array(X_NLP)
    X_NLP = np.concatenate(X_NLP, axis = 0)


    df["y_cat"] = df["y_cat"].astype("string")
    y = df["y_cat"]
    return { "input_Im": X_im, "input_size_im": X_im_size, "input_size_title": X_t_size,"input_timestep":X_timestep,"input_NLP": X_NLP}, y, df


In [14]:
import pandas as pd
import numpy as np

# Multiple Inputs usin https://machinelearningmastery.com/keras-functional-api-deep-learning/
from tensorflow import keras
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Masking
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import concatenate
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Normalization


import keras
from keras.applications.resnet import ResNet50, preprocess_input
from keras.preprocessing.image import ImageDataGenerator


#from final_preprocessor import preprocess

BATCH_SIZE = 32

def initialize_model():
    #Image convolution branch
    input_Im = Input(shape=(128,128,3), name="input_Im")
    conv1 = Conv2D(64, kernel_size=(3, 3),activation='relu')(input_Im)
    pool1 = MaxPooling2D(pool_size=(2,2))(conv1)
    conv2 = Conv2D(32, kernel_size=(3, 3),activation='relu')(pool1)
    pool2 = MaxPooling2D(pool_size=(2,2))(conv2)
    conv3 = Conv2D(32, kernel_size=(3, 3),activation='relu')(pool2)
    pool3 = MaxPooling2D(pool_size=(2,2))(conv3)
    conv2 = Conv2D(16, kernel_size=(3, 3),activation='relu')(pool3)
    pool4 = MaxPooling2D(pool_size=(2,2))(conv2)
    flat1 = Flatten()(pool4)

    #image_size branch
    input_size_im = Input(shape=(1,), name="input_size_im")
    hidden1 = Dense(1, activation='relu')(input_size_im)
    flat2 = Flatten()(hidden1)



    initializer = keras.initializers.VarianceScaling(scale=0.1, mode="fan_in", distribution="uniform")
    #NLP branch
    input_NLP = Input(shape = (10,40), name="input_NLP")###dont know dim padded and embedded inputs
    mask = Masking()(input_NLP)
    lstm = LSTM(32, activation = "tanh",kernel_initializer=initializer, kernel_regularizer="l1")(mask)
    dense1 = Dense(20, activation = "relu")(lstm)
    flat3 = Flatten()(dense1)

    #title_size branch
    input_size_title = Input(shape=(1,), name="input_size_title")
    layer1 = Dense(1, activation='relu')(input_size_title)
    flat4 = Flatten()(layer1)


    #normalizer = Normalization()
    #normalizer.adapt("X_train")
    #davids timestep
    input_timestep = Input(shape=(16,),name="input_timestep")#dont know dims
    norm = Normalization()(input_timestep)
    step1 = Dense(32, activation='relu')(norm)
    #drop1 = Dropout(0.3)(step1)
    step2 = Dense(16, activation='relu')(step1)
    #drop2 = Dropout(0.2)(step2)
    step3 = Dense(8, activation='relu')(step2)
    #drop3 = Dropout(0.2)(step3)
    step4 = Dense(4, activation='relu')(step3)
    #drop4 = Dropout(0.2)(step4)
    #drop4 = Dropout(0.1)(step5)
    flat5 = Flatten()(step4)




    #concat them
    merge = concatenate([flat1, flat2, flat3, flat4, flat5])
    final1 = Dense(128, activation='relu')(merge)
    final2 = Dense(64, activation='relu')(final1)
    final2 = Dense(16, activation='relu')(final1)
    output = Dense(6, activation='softmax')(final2)


    #final model
    model = Model(inputs=[input_Im, input_size_im, input_NLP, input_size_title, input_timestep], outputs=output)

    #print(model.summary())
    model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return model

datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

def createGenerator(dff, batch_size=BATCH_SIZE):

    # Shuffles the dataframe, and so the batches as well
    dff = dff.sample(frac=1)

    # Shuffle=False is EXTREMELY important to keep order of image and coord
    flow = datagen.flow_from_dataframe(
                                        dataframe=dff,
                                        directory=None,
                                        x_col="image_path",
                                        y_col="y_cat",
                                        batch_size=batch_size,
                                        shuffle=False,
                                        class_mode="categorical",
                                        target_size=(128,128),
                                        seed=42
                                      )
    idx = 0
    n = len(dff) - batch_size
    batch = 0
    while True :
        # Get next batch of images
        X1 = flow.next()
        # idx to reach
        end = idx + X1[0].shape[0]
        # get next batch of lines from df
        X_im_size = dff["size"][idx:end].to_numpy()
        X_timestep = dff[["year", "sin_month", "cos_month", "sin_day", "cos_day", "sin_hour", "cos_hour", "sin_minute","cos_minute", 0, 1, 2, 3, 4, 5, 6]][idx:end].to_numpy()
        X_t_size = dff["title_len"][idx:end].to_numpy()
        X_NLP = dff["padding"][idx:end]
        X_NLP =[np.expand_dims(x, axis=0) for x in X_NLP]
        X_NLP = np.array(X_NLP)
        X_NLP = np.concatenate(X_NLP, axis = 0)
        dff_verif = dff[idx:end]
        # Updates the idx for the next batch
        idx = end
#         print("batch nb : ", batch, ",   batch_size : ", X1[0].shape[0])
        batch+=1
        # Checks if we are at the end of the dataframe
        if idx==len(dff):
#             print("END OF THE DATAFRAME\n")
            idx = 0
        y = X1[1]
        X_im = X1[0]
        # Yields the image, metadata & target batches
        yield { "input_Im": X_im, "input_size_im": X_im_size, "input_size_title": X_t_size,"input_timestep":X_timestep,"input_NLP": X_NLP},y, dff







def train_model( model_name, new = True, old_model = "Model_predictor"):
    if new:
        model = initialize_model()
    else:
        model = keras.models.load_model(f'../models/{old_model}.h5')
        pass

    df = pd.read_csv('data/balanced_35k.csv', index_col=0)
    X_dict, y, df = preprocess(df)
    GENERATOR = createGenerator(df)
    model.fit(
    GENERATOR,
    epochs=100,
    batch_size = 32,
    steps_per_epoch=893,
    workers = 1,
    use_multiprocessing=False,

    #validation_data = GENERATOR_train
    )
    model.save(f'../models/{model_name}')
    return model

train_model("model_test")


Found 30289 validated image filenames belonging to 6 classes.


  X_im_size = dff["id"][idx:end].to_numpy()
  X_t_size = dff["title_len"][idx:end].to_numpy()
  X_NLP = dff["padding"][idx:end]


TypeError: Could not build a TypeSpec for            id      size                                         image_path  \
index                                                                        
9487   yrfl3b   4071000  ../raw_data/images/category_1/yrfl3b_4071000_8...   
11700  ijt17q  12192768  ../raw_data/images/category_2/ijt17q_12192768_...   
15404  kb94iz   1687500  ../raw_data/images/category_3/kb94iz_1687500_9...   
11965  uud0zf  12192768  ../raw_data/images/category_2/uud0zf_12192768_...   
15501  y0hd5m   1432730  ../raw_data/images/category_3/y0hd5m_1432730_6...   
...       ...       ...                                                ...   
5404   bdsvfe  15925248  ../raw_data/images/category_1/bdsvfe_15925248_...   
15426  egypv1   3145728  ../raw_data/images/category_3/egypv1_3145728_7...   
8229   6bjf4t   7990272  ../raw_data/images/category_1/6bjf4t_7990272_1...   
27420  9a1x6n  10912062  ../raw_data/images/category_5/9a1x6n_10912062_...   
914    ah27oz  12918992  ../raw_data/images/category_0/ah27oz_12918992_...   

      y_cat                                              title  \
index                                                            
9487      1                   my dog likes to stare at the sun   
11700     2         He may be a old boy, but he’s my best boy.   
15404     3  Chloe looking like she’s about to sell you lif...   
11965     2                               Abbie the party pup.   
15501     3      My favorite old boy Sabre the Dogo Argentino.   
...     ...                                                ...   
5404      1                                 Djuka the girl 😊😊😊   
15426     3      My Elder pirate watching for stupid squirrels   
8229      1                                        Silly Mamas   
27420     5  Love this cheeky little Kelpie pup. One out of...   
914       0                                    Winter car ride   

                                 Image_url  upvote_ratio  upvotes  upvote_cat  \
index                                                                           
9487   https://i.redd.it/44855v0v66z91.jpg          0.91        8           1   
11700  https://i.redd.it/8q3h81l49ak51.jpg          0.95       24           2   
15404  https://i.redd.it/pfof1l3cxl461.jpg          0.98       92           3   
11965  https://i.redd.it/svnovuc0jq091.jpg          0.91       20           2   
15501  https://i.redd.it/l05uzjzgwzs91.jpg          0.95       60           3   
...                                    ...           ...      ...         ...   
5404        http://i.imgur.com/uRYmt3T.jpg          0.88       11           1   
15426  https://i.redd.it/sn9w1n3vug741.jpg          0.98       72           3   
8229    https://i.redd.it/5gxj1rcvrwxy.jpg          0.92       11           1   
27420      https://i.imgur.com/9FGTbJp.jpg          0.98     2441           5   
914          https://imgur.com/1xaOpHE.jpg          1.00        1           0   

           year  ...    1    2    3    4    5    6  title_len  \
index            ...                                            
9487   1.000000  ...  0.0  0.0  1.0  0.0  0.0  0.0          8   
11700  0.833333  ...  0.0  0.0  0.0  0.0  0.0  0.0         11   
15404  0.833333  ...  0.0  0.0  0.0  1.0  0.0  0.0         10   
11965  1.000000  ...  0.0  0.0  0.0  0.0  1.0  0.0          4   
15501  1.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0          8   
...         ...  ...  ...  ...  ...  ...  ...  ...        ...   
5404   0.750000  ...  1.0  0.0  0.0  0.0  0.0  0.0          4   
15426  0.750000  ...  0.0  0.0  0.0  0.0  0.0  1.0          7   
8229   0.583333  ...  1.0  0.0  0.0  0.0  0.0  0.0          2   
27420  0.666667  ...  0.0  0.0  0.0  1.0  0.0  0.0         22   
914    0.750000  ...  0.0  0.0  1.0  0.0  0.0  0.0          3   

                                           preprocessing  \
index                                                      
9487                             [dog, like, stare, sun]   
11700                     [may, old, boy, he, best, boy]   
15404   [chloe, look, like, shes, sell, life, insurance]   
11965                                [abbie, party, pup]   
15501       [favorite, old, boy, sabre, dogo, argentino]   
...                                                  ...   
5404                                  [djuka, girl, 😊😊😊]   
15426           [elder, pirate, watch, stupid, squirrel]   
8229                                       [silly, mama]   
27420  [love, cheeky, little, kelpie, pup, one, litte...   
914                                  [winter, car, ride]   

                                               embedding  \
index                                                      
9487   [[0.35266894, -0.44480756, 1.2790529, 0.226411...   
11700  [[0.82692224, -0.22039042, -0.8130363, 0.61339...   
15404  [[-0.055271313, -0.13597453, -0.58476824, 0.03...   
11965  [[0.45161363, -0.4011149, 0.20811391, -0.53929...   
15501  [[0.2762756, 2.1512673, -0.026870538, -0.86067...   
...                                                  ...   
5404   [[1.2286396, -1.3288174, -0.24003081, -0.24175...   
15426  [[-0.11328321, -0.026498873, -0.14387736, 0.14...   
8229   [[-0.6187335, 0.494518, -0.12809905, 0.0371877...   
27420  [[0.7073981, -0.1938361, -0.17172599, -0.85855...   
914    [[0.45897824, -0.6914342, -0.65909344, -0.4955...   

                                                 padding  
index                                                     
9487   [[0.35266894, -0.44480756, 1.2790529, 0.226411...  
11700  [[0.82692224, -0.22039042, -0.8130363, 0.61339...  
15404  [[-0.055271313, -0.13597453, -0.58476824, 0.03...  
11965  [[0.45161363, -0.4011149, 0.20811391, -0.53929...  
15501  [[0.2762756, 2.1512673, -0.026870538, -0.86067...  
...                                                  ...  
5404   [[1.2286396, -1.3288174, -0.24003081, -0.24175...  
15426  [[-0.11328321, -0.026498873, -0.14387736, 0.14...  
8229   [[-0.6187335, 0.494518, -0.12809905, 0.0371877...  
27420  [[-0.33520874, -0.34282863, -0.87791234, 0.106...  
914    [[0.45897824, -0.6914342, -0.65909344, -0.4955...  

[30289 rows x 29 columns] of unsupported type <class 'pandas.core.frame.DataFrame'>.

In [5]:
df = pd.read_csv('data/balanced_35k.csv', index_col=0)
X_dict, y, df = preprocess(df)
df.to_csv('data/processed df.csv')

<class 'pandas.core.series.Series'>
Int64Index: 30289 entries, 0 to 30288
Series name: size
Non-Null Count  Dtype
--------------  -----
30289 non-null  int64
dtypes: int64(1)
memory usage: 473.3 KB
None


In [7]:
df

Unnamed: 0_level_0,id,size,image_path,y_cat,title,Image_url,upvote_ratio,upvotes,upvote_cat,year,...,1,2,3,4,5,6,title_len,preprocessing,embedding,padding
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,si6nl6,12192768,../raw_data/images/category_0/si6nl6_12192768_...,0,My derpy girl giving her best smile!,https://i.redd.it/hbbsqh4dlaf81.jpg,1.00,1,0,1.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,7,"[derpy, girl, give, best, smile]","[[-0.7024669, 0.10465372, -0.10564256, 0.24819...","[[-0.7024669, 0.10465372, -0.10564256, 0.24819..."
1,g92pmt,4802000,../raw_data/images/category_0/g92pmt_4802000_1...,0,"Hi, can everyone hear me?",https://imgur.com/rUlJ8tp.jpg,1.00,1,0,0.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,5,"[hi, everyone, hear]","[[-2.284348, 0.698452, -0.954938, 0.1315268, 0...","[[-2.284348, 0.698452, -0.954938, 0.1315268, 0..."
2,pebvge,8911728,../raw_data/images/category_0/pebvge_8911728_1...,0,My baby 💙,https://i.redd.it/ahw384qsmfk71.jpg,1.00,1,0,0.916667,...,0.0,0.0,0.0,0.0,0.0,0.0,3,"[baby, 💙]","[[-0.15399729, -0.40509552, -0.64985436, -0.13...","[[-0.15399729, -0.40509552, -0.64985436, -0.13..."
3,pdvq6k,12000000,../raw_data/images/category_0/pdvq6k_12000000_...,0,This is the best pic i have with my dog and i ...,https://i.redd.it/yvsw7ijjzak71.jpg,1.00,1,0,0.916667,...,0.0,0.0,0.0,0.0,0.0,1.0,22,"[best, pic, dog, also, post, elsewhere, spread...","[[1.3253559, 0.38933644, 0.7680881, -1.9209212...","[[1.3253559, 0.38933644, 0.7680881, -1.9209212..."
4,bsc52h,2742336,../raw_data/images/category_0/bsc52h_2742336_1...,0,Can you tell she likes “rides” ?,https://i.redd.it/jplh6vc5x2031.jpg,1.00,1,0,0.750000,...,0.0,0.0,0.0,1.0,0.0,0.0,7,"[tell, like, ride]","[[-0.39929333, 0.03881802, -0.06807098, -0.325...","[[-0.39929333, 0.03881802, -0.06807098, -0.325..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30284,qcw42m,12192768,../raw_data/images/category_5/qcw42m_12192768_...,5,"It’s my birthday, I’m 3!",https://i.redd.it/g5sg320l2uu71.jpg,0.99,1629,5,0.916667,...,0.0,0.0,1.0,0.0,0.0,0.0,5,"[birthday, im]","[[2.2755084, 0.8114666, -1.2593371, 0.95703113...","[[2.2755084, 0.8114666, -1.2593371, 0.95703113..."
30285,ggb9cw,12979200,../raw_data/images/category_5/ggb9cw_12979200_...,5,This is my doggy-dog Aya,https://i.redd.it/ij7s5gyb2px41.jpg,1.00,621,5,0.833333,...,0.0,0.0,0.0,0.0,1.0,0.0,5,"[doggydog, aya]","[[-0.32109588, -0.2716897, -0.7240568, 0.18267...","[[-0.32109588, -0.2716897, -0.7240568, 0.18267..."
30286,m4je4e,6502860,../raw_data/images/category_5/m4je4e_6502860_8...,5,One of my favourite pictures of our little girl,https://i.redd.it/ezbjnkwg0wm61.jpg,1.00,834,5,0.916667,...,0.0,0.0,0.0,0.0,0.0,1.0,9,"[one, favourite, picture, little, girl]","[[0.19927058, 2.0016553, -0.5724674, 0.3240424...","[[0.19927058, 2.0016553, -0.5724674, 0.3240424..."
30287,738bkx,2073600,../raw_data/images/category_5/738bkx_2073600_7...,5,"Reddit, say hi to Gus",https://i.redd.it/db0wbl946uoz.jpg,0.99,723,5,0.583333,...,0.0,0.0,0.0,1.0,0.0,0.0,5,"[reddit, say, hi, gu]","[[-0.8507807, 1.2658457, -0.4720916, 0.5673523...","[[-0.8507807, 1.2658457, -0.4720916, 0.5673523..."


In [15]:
GENERATOR = createGenerator(df)

In [16]:
i = 0
for g in GENERATOR:
    print(g[0][ "input_size_im"], g[0]["input_size_title"])
    print(g[2])
    break

Found 30289 validated image filenames belonging to 6 classes.
['uw0k0x' 'c58ibi' 'g6zfl7' 'em1fsi' 'csjwb9' 'jco6p4' '8a4kzw' 'qj2aug'
 'pfwhx8' 'cg3nfn' 'affoba' 'n1wt3l' 'cq5k14' 'vl15lx' 'ytrvv0' 'ad78y0'
 'swlm4y' '7hvhi9' 'xfjfnt' '7jcucc' 'lia9us' 'lhmc0p' 'ghxpx8' '6lcrab'
 'clrwli' 'nfp477' 'fns8u8' 'dc9ye1' 'kw1ksh' '6cfgea' 'xht5xh' '82lqqh'] [ 1  5 55  5  3  9  3  8 10  5  5  9  6  3  1  6 13  5  4  6  5  7  5  6
 15 12 12 14  6  4  2 22]
           id      size                                         image_path  \
index                                                                        
2842   uw0k0x    758990  ../raw_data/images/category_0/uw0k0x_758990_1.png   
20506  c58ibi  12192768  ../raw_data/images/category_4/c58ibi_12192768_...   
29416  g6zfl7  12192768  ../raw_data/images/category_5/g6zfl7_12192768_...   
25361  em1fsi  12192768  ../raw_data/images/category_4/em1fsi_12192768_...   
9051   csjwb9   6670729  ../raw_data/images/category_1/csjwb9_6670729_1...   


  X_im_size = dff["id"][idx:end].to_numpy()
  X_t_size = dff["title_len"][idx:end].to_numpy()
  X_NLP = dff["padding"][idx:end]
