Converting duplicate CSV to JSON

In [1]:
import pandas as pd
import json

In [2]:
df = pd.read_csv("data/class_meme_.csv",sep=',')

In [3]:
df = df.drop(columns=['S.No'])

In [4]:
# Replace missing cells (NaN) with 0; row_to_json treats 0 as "no value" and drops it.
df = df.fillna(0)

In [5]:
duplicate_ground_truth = {}

In [6]:
def row_to_json(row):
    """Record one CSV row in the module-level ``duplicate_ground_truth`` dict.

    The first value of the row, lowercased, becomes the key. Every remaining
    value that is not 0 is lowercased and collected, in order, as that key's
    list. If the key is already present, nothing is stored and a warning plus
    the offending key are printed instead.

    Args:
        row: A pandas Series (anything exposing ``tolist()``) whose entries are
            strings, with 0 standing in for missing cells.

    Returns:
        None. The result is the side effect on ``duplicate_ground_truth``.
    """
    values = row.tolist()
    template = values[0].lower()
    # 0 is the placeholder for an empty cell, so it is filtered out before lowercasing.
    aliases = [name.lower() for name in values[1:] if name != 0]

    if template in duplicate_ground_truth:
        # A repeated key means the sheet lists the same template twice.
        print("something is wrong")
        print(template)
        return

    duplicate_ground_truth[template] = aliases

In [7]:
# The trailing ';' hides the Series of None returned by apply. Unlike %%capture,
# it does not discard the duplicate-key warnings that row_to_json prints.
df.apply(row_to_json,axis=1);

In [8]:
# Write the mapping filled in by row_to_json to disk as JSON indented by 4 spaces.
with open("data/duplicate_ground_truth.json",'w') as out:
    json.dump(duplicate_ground_truth,out,indent=4) 

In [9]:
len(duplicate_ground_truth.keys())

370

Name-to-number mapping JSON

In [10]:
df =  pd.read_csv("data/title_description.csv",sep=',')

In [11]:
df = df.drop(columns=['title','description'])

In [12]:
title_to_number_map = {}

In [13]:
def row_to_map(row):
    """Register one row in the module-level ``title_to_number_map`` dict.

    The key is the row's ``'link'`` value with every ``/`` removed and the
    result lowercased; the value is the row's ``'image_name'``. If the key is
    already present, the existing entry is kept and a warning is printed.

    Args:
        row: A mapping-like object (e.g. a pandas Series) providing ``'link'``
            (a string) and ``'image_name'``.

    Returns:
        None. The result is the side effect on ``title_to_number_map``.
    """
    slug = row['link'].replace('/', '').lower()
    image_number = row['image_name']

    if slug in title_to_number_map:
        # Two rows normalised to the same slug; the first one wins.
        print("something is wrong")
        return

    title_to_number_map[slug] = image_number

In [14]:
# The trailing ';' hides the Series of None returned by apply. Unlike %%capture,
# it does not discard the duplicate-key warnings that row_to_map prints.
df.apply(row_to_map,axis=1);

In [15]:
title_to_number_map

{'y-u-no': 0,
 'willy-wonka': 1,
 'the-most-interesting-man-in-the-world': 2,
 'futurama-fry': 3,
 'success-kid': 4,
 'one-does-not-simply': 5,
 'bad-luck-brian': 6,
 'first-world-problems': 7,
 'philosoraptor': 8,
 'grumpy-cat': 9,
 'winter-is-coming': 10,
 'forever-alone': 11,
 'good-guy-greg': 12,
 'scumbag-steve': 13,
 'what-if-i-told-you': 14,
 'conspiracy-keanu': 15,
 'kermit-the-frog-drinking-tea': 16,
 'yo-dawg': 17,
 'all-the-things': 18,
 'insanity-wolf': 19,
 'joseph-ducreux': 20,
 'pedobear': 21,
 'trollface': 22,
 'skeptical-3rd-world-kid': 23,
 'annoying-facebook-girl': 24,
 'disaster-girl': 25,
 'socially-awkward-penguin': 26,
 'prepare-yourself': 27,
 'slowpoke': 28,
 'dr-evil-meme': 29,
 'advice-yoda-gives': 30,
 'joker-mind-loss': 31,
 'stoner-stanley': 32,
 'pleaseguy': 33,
 'foul-bachelor-frog': 34,
 'batman-slap-robin': 35,
 'high-expectations-asian-father': 36,
 'koala-cant-believe-it': 37,
 'mr-bean': 38,
 'chuck-norris': 39,
 'overly-attached-girlfriend': 40,
 '

In [16]:
# Write the mapping filled in by row_to_map to disk as JSON indented by 4 spaces.
with open("data/title_to_number_map.json",'w') as out:
    json.dump(title_to_number_map,out,indent=4) 

Classification data

In [17]:
len(duplicate_ground_truth.keys())

370

In [18]:
def classification_data_list():
    """Build (caption, template) pairs for every key of ``duplicate_ground_truth``.

    For each key, the number is looked up in ``title_to_number_map`` and the
    file ``data/captions/<number>.txt`` is read line by line. Each line becomes
    one ``(caption, key)`` tuple; lines keep their trailing newline, exactly as
    ``readlines()`` returns them.

    Keys that cannot be processed are collected instead of aborting the run:
    a key missing from ``title_to_number_map`` (``KeyError``), a captions file
    that cannot be opened or read (``OSError``), or one that cannot be decoded
    (``UnicodeDecodeError``). Any other exception, including
    ``KeyboardInterrupt``, propagates to the caller instead of being hidden.

    Returns:
        tuple[list, list]: ``(data, error)`` where ``data`` is the list of
        ``(caption, key)`` tuples and ``error`` is the list of skipped keys,
        both in the iteration order of ``duplicate_ground_truth``.
    """
    data = []
    error = []
    for key in duplicate_ground_truth:
        try:
            number = title_to_number_map[key]
            with open(f'data/captions/{number}.txt','r') as captions_file:
                captions = captions_file.readlines()
        except (KeyError, OSError, UnicodeDecodeError):
            # Best effort: remember the key and continue with the next template.
            error.append(key)
            continue
        data.extend((caption, key) for caption in captions)
    return data,error

In [19]:
data,error = classification_data_list()

In [20]:
classification_data = pd.DataFrame(data,columns=["caption","meme_template"])

In [21]:
classification_data

Unnamed: 0,caption,meme_template
0,I Want To Use The internet But My Mom is on th...,90s-problems
1,march 1994 justin bieber is born\n,90s-problems
2,"SPENT 20 BUCKS TAKING MY FRIENDS TO NEW ""TITAN...",90s-problems
3,memes haven't been invented yet \n,90s-problems
4,i want to play gameboy but mom made me turn o...,90s-problems
5,want to burn new metallica cd napster is down\n,90s-problems
6,"rented good video game 3 days later, now must ...",90s-problems
7,my C+C Music Factory cd won't play because it'...,90s-problems
8,blowing on cartridge didn't work\n,90s-problems
9,public bus hit a speed bump my cd player skipp...,90s-problems


In [22]:
# Saved with to_csv's default index=True, so the row index is written as a
# leading unnamed column alongside "caption" and "meme_template".
classification_data.to_csv("data/classification_data.csv")