## Vectorization

The python library fasttext is used in combination with a pre trained model available from their website to translate the title strings in the dataset to word vector representations.

To make the process of translation quicker, a vocablulatry set of all words in the data set is created, these are then translated to vector representations. Vocablulary and translation form a lookup table which is then used to add a word vector column to the data set.

In [2]:
import pandas as pd
import os
import urllib.request
import time
from urllib.error import HTTPError
import fasttext
import fasttext.util
import emoji
import pickle
import ast
import numpy as np

### Vocabulary

In [5]:
## Compiling the vocabulary

path = './CleanedFiles/'

dataFiles = os.listdir(path)

completeFrame = pd.read_csv(path + dataFiles[0])

for dataFile in dataFiles[1:]:
    
    
    file = pd.read_csv(path + dataFile, lineterminator='\n')
    
    completeFrame = pd.concat([completeFrame,file])

In [6]:
completeFrame

Unnamed: 0,ID,TITLE,THUMBNAIL1,THUMBNAIL2,THUMBNAIL3
0,nrMJkAO2ShA,i followed viral asian makeup transformations,https://i.ytimg.com/vi/nrMJkAO2ShA/default.jpg,https://i.ytimg.com/vi/nrMJkAO2ShA/mqdefault.jpg,https://i.ytimg.com/vi/nrMJkAO2ShA/hqdefault.jpg
1,xYsM7JM3Nx8,9 surprising uses for vaseline eng sub dingo k...,https://i.ytimg.com/vi/xYsM7JM3Nx8/default.jpg,https://i.ytimg.com/vi/xYsM7JM3Nx8/mqdefault.jpg,https://i.ytimg.com/vi/xYsM7JM3Nx8/hqdefault.jpg
2,61mvJNs0BLw,how to remove dark circles under eyes in the m...,https://i.ytimg.com/vi/61mvJNs0BLw/default.jpg,https://i.ytimg.com/vi/61mvJNs0BLw/mqdefault.jpg,https://i.ytimg.com/vi/61mvJNs0BLw/hqdefault.jpg
3,hfuMLfVK9fU,a perfect facial cleansing secret for daily sk...,https://i.ytimg.com/vi/hfuMLfVK9fU/default.jpg,https://i.ytimg.com/vi/hfuMLfVK9fU/mqdefault.jpg,https://i.ytimg.com/vi/hfuMLfVK9fU/hqdefault.jpg
4,yArOyxasEBI,9 beauty habits you should be doing every night,https://i.ytimg.com/vi/yArOyxasEBI/default.jpg,https://i.ytimg.com/vi/yArOyxasEBI/mqdefault.jpg,https://i.ytimg.com/vi/yArOyxasEBI/hqdefault.jpg
...,...,...,...,...,...
11225,3LAbVMetuO0,earcleaning asmr,https://i.ytimg.com/vi/3LAbVMetuO0/default.jpg,https://i.ytimg.com/vi/3LAbVMetuO0/mqdefault.jpg,https://i.ytimg.com/vi/3LAbVMetuO0/hqdefault.jpg
11226,lvVyCa2U16c,m,https://i.ytimg.com/vi/lvVyCa2U16c/default.jpg,https://i.ytimg.com/vi/lvVyCa2U16c/mqdefault.jpg,https://i.ytimg.com/vi/lvVyCa2U16c/hqdefault.jpg
11227,zu42IgM2r2U,binaural whispering japanese yandere,https://i.ytimg.com/vi/zu42IgM2r2U/default.jpg,https://i.ytimg.com/vi/zu42IgM2r2U/mqdefault.jpg,https://i.ytimg.com/vi/zu42IgM2r2U/hqdefault.jpg
11228,9-6pGgxiGIk,asmr binaural whispering japanese yandere,https://i.ytimg.com/vi/9-6pGgxiGIk/default.jpg,https://i.ytimg.com/vi/9-6pGgxiGIk/mqdefault.jpg,https://i.ytimg.com/vi/9-6pGgxiGIk/hqdefault.jpg


In [7]:
vocabulary = set(completeFrame.TITLE.str.cat(sep=' ').split())

In [8]:
len(vocabulary)

165044

### FastText

In [None]:
# Downloading pre trained word vectorization model

fasttext.util.download_model('en', if_exists='ignore')

In [None]:
# shrinking the model to different output sizes
ft = fasttext.load_model('cc.en.300.bin')
ft.get_dimension()
fasttext.util.reduce_model(ft, 10)

ft.get_dimension()
ft.save_model('cc.en.10.bin')

### Look-up Dictionaries

In [2]:
# List of sizes of available models

sizeList = ['10','25','50','100','200','300']


for size in sizeList:
    
    print('Compiling lookUpDict for size {}'.format(size))

    # load model with ouput dimension 'size'
    ft = fasttext.load_model('cc.en.' + size + '.bin')

    lookUpDict = {}
    
    # adding entry for every word
    
    for word in vocabulary:
        lookUpDict[word] = ft[word]
    
    
    # saving object as pickle
    with open('lookUpDict' + size + '.pickle', 'wb') as handle:
        pickle.dump(lookUpDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Implementation into Data Frame

In [45]:
# Helper function to convert a string into a list of word vectors

def string_to_vector(input_string, lookUp):
    
    output = []
    
    for word in input_string.split(' '):
        output.append(list(lookUpDict[word]))
        
    return output
        


In [46]:
test_text ='hello it is me mario'

lookUpDict = pickle.load(open( "lookUpDict25.pickle", "rb" ))

string_to_vector(test_text, lookUpDict)

[[-0.03779324,
  -0.07486235,
  0.005360998,
  -0.18865173,
  0.23182352,
  -0.0902558,
  -0.018535689,
  -0.03599851,
  -0.21849203,
  -0.037454333,
  -0.13344625,
  -0.014666298,
  -0.12897098,
  -0.05318767,
  -0.09672536,
  0.06529641,
  0.036780432,
  0.03782449,
  -0.01793421,
  -0.06356008,
  -0.03461542,
  0.045458958,
  -0.13217598,
  -0.06520218,
  -7.233268e-05],
 [-0.2919451,
  -0.78345275,
  0.0044529587,
  -0.14028956,
  0.6643456,
  0.30051634,
  -0.06340591,
  -0.046540357,
  -0.23588878,
  0.0723621,
  0.31028482,
  0.11243766,
  0.3281979,
  -0.01543403,
  -0.22710066,
  0.18688762,
  0.2948261,
  -0.21394022,
  -0.0076283664,
  0.3535992,
  0.039062984,
  0.27028435,
  -0.062106416,
  0.15874149,
  -0.11981203],
 [-0.24792731,
  -0.62126637,
  0.081501365,
  -0.03128113,
  0.4677232,
  0.59168303,
  -0.17371505,
  -0.049545746,
  0.10005777,
  -0.15775709,
  -0.030035932,
  -0.06509413,
  0.5573439,
  -0.16895488,
  -0.021357533,
  -0.034889825,
  0.13091314,
  -0.03

In [52]:
path = './CleanedFiles/'
outPath = './VectorizedFiles/'

dataFiles = os.listdir(path)

lookUpDict = pickle.load(open( "lookUpDict25.pickle", "rb" ))

for dataFile in dataFiles:
    
    print('Working on {}'.format(dataFile))
    
    
    dataSet = pd.read_csv(path + dataFile, lineterminator='\n')
    
    dataSet['VECTOR'] = dataSet['TITLE'].apply(string_to_vector, lookUp = lookUpDict)
    
    dataSet.to_csv(outPath + dataFile, index=False, sep=",")
    
    
    
dataSet

Working on subFrame14.csv
Working on subFrame24.csv
Working on subFrame37.csv
Working on subFrame33.csv
Working on subFrame15.csv
Working on subFrame5.csv
Working on subFrame40.csv
Working on subFrame4.csv
Working on subFrame27.csv
Working on subFrame26.csv
Working on subFrame1.csv
Working on subFrame3.csv
Working on subFrame10.csv
Working on subFrame38.csv
Working on subFrame35.csv
Working on subFrame19.csv
Working on subFrame29.csv
Working on subFrame39.csv
Working on subFrame25.csv
Working on subFrame7.csv
Working on subFrame12.csv
Working on subFrame23.csv
Working on subFrame31.csv
Working on subFrame30.csv
Working on subFrame21.csv
Working on subFrame2.csv
Working on subFrame13.csv
Working on subFrame18.csv
Working on subFrame32.csv
Working on subFrame9.csv
Working on subFrame17.csv
Working on subFrame6.csv
Working on subFrame8.csv
Working on subFrame34.csv
Working on subFrame20.csv
Working on subFrame28.csv
Working on subFrame16.csv
Working on subFrame22.csv
Working on subFrame36

Unnamed: 0,ID,TITLE,THUMBNAIL1,THUMBNAIL2,THUMBNAIL3,VECTOR
0,Ndm-HOTw3so,univz vanished official music video,https://i.ytimg.com/vi/Ndm-HOTw3so/default.jpg,https://i.ytimg.com/vi/Ndm-HOTw3so/mqdefault.jpg,https://i.ytimg.com/vi/Ndm-HOTw3so/hqdefault.jpg,"[[0.031767488, -0.07131766, 0.008343168, 0.147..."
1,vebkiUSh4Yw,monica code red audio ft missy elliott laiyah,https://i.ytimg.com/vi/vebkiUSh4Yw/default.jpg,https://i.ytimg.com/vi/vebkiUSh4Yw/mqdefault.jpg,https://i.ytimg.com/vi/vebkiUSh4Yw/hqdefault.jpg,"[[-0.1563951, -0.10123348, -0.07428697, -0.100..."
2,d_5lehm4svk,deniz koyu sonic official music video,https://i.ytimg.com/vi/d_5lehm4svk/default.jpg,https://i.ytimg.com/vi/d_5lehm4svk/mqdefault.jpg,https://i.ytimg.com/vi/d_5lehm4svk/hqdefault.jpg,"[[-0.46667317, -0.27574146, -0.106953844, -0.0..."
3,6g4HXXHi2R4,one thing remains,https://i.ytimg.com/vi/6g4HXXHi2R4/default.jpg,https://i.ytimg.com/vi/6g4HXXHi2R4/mqdefault.jpg,https://i.ytimg.com/vi/6g4HXXHi2R4/hqdefault.jpg,"[[0.25287554, -0.12034936, -0.0600471, 0.04255..."
4,neJ0ngA-4Fk,big freedia crazy official video,https://i.ytimg.com/vi/neJ0ngA-4Fk/default.jpg,https://i.ytimg.com/vi/neJ0ngA-4Fk/mqdefault.jpg,https://i.ytimg.com/vi/neJ0ngA-4Fk/hqdefault.jpg,"[[0.2794941, -0.20855501, -0.071314484, -0.180..."
...,...,...,...,...,...,...
11225,3LAbVMetuO0,earcleaning asmr,https://i.ytimg.com/vi/3LAbVMetuO0/default.jpg,https://i.ytimg.com/vi/3LAbVMetuO0/mqdefault.jpg,https://i.ytimg.com/vi/3LAbVMetuO0/hqdefault.jpg,"[[0.06729414, 0.015090108, 0.06142246, 0.02410..."
11226,lvVyCa2U16c,m,https://i.ytimg.com/vi/lvVyCa2U16c/default.jpg,https://i.ytimg.com/vi/lvVyCa2U16c/mqdefault.jpg,https://i.ytimg.com/vi/lvVyCa2U16c/hqdefault.jpg,"[[-0.72247845, -0.743055, 0.017386906, -0.0280..."
11227,zu42IgM2r2U,binaural whispering japanese yandere,https://i.ytimg.com/vi/zu42IgM2r2U/default.jpg,https://i.ytimg.com/vi/zu42IgM2r2U/mqdefault.jpg,https://i.ytimg.com/vi/zu42IgM2r2U/hqdefault.jpg,"[[-0.13301763, -0.18186854, 0.034448333, -0.01..."
11228,9-6pGgxiGIk,asmr binaural whispering japanese yandere,https://i.ytimg.com/vi/9-6pGgxiGIk/default.jpg,https://i.ytimg.com/vi/9-6pGgxiGIk/mqdefault.jpg,https://i.ytimg.com/vi/9-6pGgxiGIk/hqdefault.jpg,"[[-0.6019181, -0.7110011, -0.06538458, -0.3970..."


In [53]:
dataSet['VECTOR'].iloc[0]

[[0.031767488,
  -0.07131766,
  0.008343168,
  0.14791736,
  0.08218664,
  0.06846708,
  -0.040099673,
  0.033847775,
  0.06376763,
  -0.029318934,
  -0.04242466,
  0.07221213,
  0.017584872,
  -0.0136618875,
  0.049476817,
  -0.015082754,
  0.09301549,
  0.03859654,
  0.04587706,
  0.02056676,
  0.06462568,
  0.051286142,
  -0.05278785,
  -0.041682966,
  -0.01696261],
 [0.077033505,
  -0.021707298,
  -0.114067875,
  0.00497555,
  0.044755418,
  0.030209098,
  0.088234514,
  -0.017883847,
  -0.0028355697,
  -0.027497943,
  0.022049364,
  0.03792788,
  -0.0055761314,
  0.02933413,
  -0.054901294,
  0.0010227845,
  -0.03427942,
  -0.034390233,
  0.06745093,
  -0.028564163,
  -0.043961477,
  0.08445689,
  0.030081386,
  0.040527653,
  0.047733255],
 [0.04116016,
  0.011667733,
  -0.018048443,
  0.05319949,
  -0.08577143,
  0.013721909,
  -0.09994365,
  -0.030533582,
  -0.14613664,
  -0.0021498986,
  0.009348285,
  0.011390201,
  0.02363362,
  -0.045482714,
  0.0081023965,
  -0.08520444,
 

In [4]:
# To read dataframe from csv

def from_np_array(array_string):
    #array_string = ','.join(array_string.replace('[ ', '[').split())
    return np.array(ast.literal_eval(array_string))



frame = pd.read_csv('./VectorizedFiles/subFrame14.csv', converters={'VECTOR': from_np_array})

In [13]:
arr = frame['VECTOR'][0].flatten()

np.pad(arr, (0, 25*64 - arr.size), 
       mode='constant', constant_values=0)




array([-0.9325552 , -1.3138075 ,  0.12836441, ...,  0.        ,
        0.        ,  0.        ])