### csv 파일을 읽어와서 json 파일로 변경합니다.
### 또한, 데이터셋을 학습 데이터와 평가 데이터로 분리합니다

In [30]:
# this notebook will read the csv file and convert it to json format
# it also divides the dataset into training and evaluation sets.

import csv
import json
import os
from collections import defaultdict
import ast
from tqdm import tqdm

## user_csv -> json file
csvfile = open('users.csv', 'r')
jsonfile = open('temp_users.json', 'w')

# 필드 이름을 tuple로 지정
fieldnames = ('age','bodyType','diet','drinks', 'drugs', 'education', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job', 'lastOnline', 'location', 'offspring', 'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes', 'speaks', 'status')
reader = csv.DictReader( csvfile, fieldnames)
for row in tqdm(reader):
    json.dump(row, jsonfile)
    jsonfile.write('\n')
    
jsonfile.close()
print('Done reading CSV file')

27749it [00:02, 12185.84it/s]

Done reading CSV file





In [31]:
# 앞서 json 파일로 변경한 파일을 training file과 validation파일로 변경합니다.
outfile_training = open('users_training.json', 'w')
outfile_evaluation = open('users_evaluation.json', 'w')

users_json = []
users_dict_training = defaultdict(dict)
users_dict_evaluation = defaultdict(dict)

essays = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
user_id = -1
val_count = 0

#read json file
count = 0
for line in open('./temp_users.json', 'r'):
    u = ast.literal_eval(line) # this returns dictionary
    count += 1
    
    #remove fields we decide to ignore
    if 'location' in u: del u['location']     # location은 거리순으로 표현하지만 추천 알고리즘에는 포함 X
    if 'income' in u: del u['income']         # 학생을 대상으로 하기에 제외
    if 'offspring' in u: del u['offspring']   # 자식 수는 일반적이지 않은 평가값 
    if 'drugs' in u: del u['drugs']           # 약물 여부는 한국인을 대상으로 하기에 불필요
    if 'job' in u: del u['job']               # 직업은 모두 학생이므로 제외
    if 'lastOnline' in u: del u['lastOnline'] # 마지막 online도 반영하지 않음 
    
        
    
    #clean up essay questions
    for e in essays:
        u[e] = u[e].replace('<br />', '')
        u[e] = u[e].replace('\n', ' ')
    
    #add user to dictionary
    # 15000 for training, 
    if (user_id < 15000):
        users_dict_training[user_id]= u
    else:
        val_count += 1
        users_dict_evaluation[user_id]= u
    user_id += 1;

print("total number of dataset :", count)
print("training : {}, validation : {}".format(15000, val_count))


if '-1' in users_dict_training: del users_dict_training['-1']
if '27747' in users_dict_evaluation: del users_dict_evaluation['27747']

json.dump(users_dict_training, outfile_training)
json.dump(users_dict_evaluation, outfile_evaluation)

outfile_training.close()
outfile_evaluation.close()

os.remove('temp_users.json')
print('Removed dataset/temp_users.json')
print('Done Parsing. Data written to dataset/parsed_users.dict')


total number of dataset : 27749
training : 15000, validation : 12748
Removed dataset/temp_users.json
Done Parsing. Data written to dataset/parsed_users.dict


### parsed_user dictionary에 원하는 정보가 담겨 있음.

In [34]:
# to read the file to 
import ast
from pprint import pprint

infile = open('./users_training.json', 'r')
dictionary = ast.literal_eval(infile.readline())

new_dict = defaultdict(dict)
for key, value in dictionary.items():
    if 'straight' in value['orientation']:
        new_dict[key] = value
infile.close()

In [35]:
len(new_dict)

12849

In [36]:
new_dict

defaultdict(dict,
            {'0': {'age': '22',
              'bodyType': 'a little extra',
              'diet': 'strictly anything',
              'drinks': 'socially',
              'education': 'working on college/university',
              'essay0': "about me:  i would love to think that i was some some kind of intellectual: either the dumbest smart guy, or the smartest dumb guy. can't say i can tell the difference. i love to talk about ideas and concepts. i forge odd metaphors instead of reciting cliches. like the simularities between a friend of mine's house and an underwater salt mine. my favorite word is salt by the way (weird choice i know). to me most things in life are better as metaphors. i seek to make myself a little better everyday, in some productively lazy way. got tired of tying my shoes. considered hiring a five year old, but would probably have to tie both of our shoes... decided to only wear leather shoes dress shoes.  about you:  you love to have really serious

In [37]:
# to read the file to 
import ast
from pprint import pprint

infile = open('./users_evaluation.json', 'r')

dictionary = ast.literal_eval(infile.readline())
new_dict_eval = defaultdict(dict)

for key, value in dictionary.items():
    if 'straight' in value['orientation']:
        new_dict_eval[key] = value

infile.close()

In [38]:
len(new_dict_eval)

10702

In [39]:
new_dict_eval

defaultdict(dict,
            {'15000': {'age': '23',
              'bodyType': 'skinny',
              'diet': 'mostly anything',
              'drinks': 'socially',
              'education': 'working on college/university',
              'essay0': "despite what i've been led to believe, meeting people is <em>not</em> easy. since i'm pretty interested in meeting people, i thought that this might be a smart idea.",
              'essay1': 'going to school, working, and kicking ass at life.',
              'essay2': 'a multitude of vastly unrelated, useless things and a handful of useful, practical things.',
              'essay3': "i don't know. my face?",
              'essay4': 'haruki murakami, david foster wallace (i know, i know), herman melville, j. k. rowling (you know the one), gabriel garcia marquez, jorge luis borges, dave eggers  the wire, six feet under, breaking bad, arrested development, community, trigun, death note  2pac, common, jay-z, talib kweli, mos def, black star

In [40]:
outfile_training = open('users_training_straight.json', 'w')
outfile_evaluation = open('users_evaluation_straight.json', 'w')

json.dump(new_dict, outfile_training)
json.dump(new_dict_eval, outfile_evaluation)

outfile_training.close()
outfile_evaluation.close()