# Clean-up

In this notebook the scraped data is cleaned up. This step is done after collecting the pictures.

First, the instances that did not include a valid thumbnail link are removed (they are listed in the missing.txt file in each subset folder).

Second, the title text is cleaned and standardized to make it ready for vectorization.

In [14]:
import pandas as pd
import os
import urllib.request
import time
from urllib.error import HTTPError
import fasttext
import fasttext.util
import emoji
import pickle

In [2]:
## Remove instances with a missing thumbnail
#
# For every scraped CSV in dataPath, the IDs listed in the matching
# <imagePath>/<subset>/missing.txt are dropped and the remaining rows are
# written to cleanPath under the same file name.

dataPath = './DataFiles/'
imagePath = './Images/'
cleanPath = './CleanedFiles/'

# to_csv does not create directories, so make sure the output folder exists.
os.makedirs(cleanPath, exist_ok=True)

# os.listdir returns every entry of the folder (e.g. a .ipynb_checkpoints
# directory), so only the CSV subsets are kept; sorting makes the processing
# order reproducible.
fileNames = sorted(name for name in os.listdir(dataPath) if name.endswith('.csv'))

for fileName in fileNames:

    dataFile = pd.read_csv(dataPath + fileName, lineterminator='\n')

    # The image folder of a subset is named after the CSV file without its
    # extension; splitext only touches the extension, unlike str.replace which
    # would rewrite every '.csv' occurring in the name.
    subsetName = os.path.splitext(fileName)[0]
    missingFile = os.path.join(imagePath, subsetName, 'missing.txt')

    try:
        missingList = pd.read_csv(missingFile, lineterminator='\n', header=None)[0]
    except pd.errors.EmptyDataError:
        # An empty missing.txt lists no IDs, so there is nothing to drop.
        missingList = pd.Series([], dtype=object)

    noMissing = dataFile[~dataFile['ID'].isin(missingList)]
    noMissing.to_csv(cleanPath + fileName, index=False, sep=",")


In [3]:
# Remove Private and Missing Videos
#
# Each file in cleanPath is filtered and written back under the same name:
# rows whose TITLE is exactly one of the blacklisted placeholder titles are dropped.

titleBlacklist = ['Private video', 'Deleted video']
fileNames = os.listdir(cleanPath)

for fileName in fileNames:
    # the file is read from and written back to the same location
    targetFile = cleanPath + fileName
    dataFile = pd.read_csv(targetFile, lineterminator='\n')

    # boolean mask of the rows carrying a blacklisted title
    isBlacklisted = dataFile['TITLE'].isin(titleBlacklist)

    noMissing = dataFile.loc[~isBlacklisted]
    noMissing.to_csv(targetFile, index=False, sep=",")

In [4]:
# Clean up strings to alphanumeric characters only
#
# Every file in cleanPath is rewritten in place with a normalized TITLE column:
# emojis spelled out as text, lowercase, only a-z / 0-9 / single spaces and no
# surrounding spaces. Rows that end up without a title are dropped.

fileNames = os.listdir(cleanPath)

for fileName in fileNames:
    # read one file
    dataFile = pd.read_csv(cleanPath + fileName, lineterminator='\n')

    # read_csv parses empty fields and literal titles such as "NA" or "null" as
    # NaN, and emoji.demojize only accepts str. Rows without a usable title are
    # therefore dropped up front instead of crashing the whole clean-up.
    dataFile = dataFile.dropna(subset=['TITLE'])

    # astype(str) guards against a TITLE column that pandas inferred as numeric
    titles = dataFile['TITLE'].astype(str)

    # replace emojis with their text equivalent, delimited by spaces instead of colons
    titles = titles.apply(emoji.demojize, delimiters=(" ", " "))

    # all strings to lowercase
    titles = titles.str.lower()

    # replace everything except a-z, 0-9 and ' ' with a space
    # (A-Z can no longer occur after lowercasing; the pattern is kept unchanged)
    titles = titles.str.replace('[^a-zA-Z0-9 ]', ' ', regex=True)

    # reduce consecutive spaces to a single space
    titles = titles.str.replace(' +', ' ', regex=True)

    # remove the leading and the trailing space
    titles = titles.str.strip(' ')

    # assign builds a new frame, so no chained-assignment warning is triggered
    dataFile = dataFile.assign(TITLE=titles)

    # remove all rows with no title
    dataFile = dataFile[dataFile['TITLE'] != '']

    # save the data frame
    dataFile.to_csv(cleanPath + fileName, index=False, sep=",")

In [5]:
# Count the spaces in every cleaned title: the largest count found in a data
# set, plus one, is the number of words in that data set's longest title.

fileNames = os.listdir(cleanPath)
# not used in this cell — NOTE(review): possibly read by a later cell; confirm before removing
count = 1

for fileName in fileNames:
    dataFile = pd.read_csv(cleanPath + fileName, lineterminator='\n')

    # number of ' ' characters per title, then the maximum over the file
    spacesPerTitle = dataFile['TITLE'].str.count(' ')
    print(spacesPerTitle.max())

# the largest value printed is 63 spaces, so the longest title has 64 words

27
28
31
30
28
33
31
35
44
45
41
36
33
28
36
36
22
45
29
37
35
31
38
63
37
36
52
34
43
29
27
41
38
38
38
37
31
28
31
22
