In [1]:
import pandas as pd
import ast

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the input table; the later cells all derive their frames from `df`.
df = pd.read_csv('fixed.csv')

In [8]:
def getSimpleColumns(df, columns):
    """Return the column names of ``df`` that belong to no group in ``columns``.

    Parameters
    ----------
    df : object exposing a ``columns`` attribute (e.g. a DataFrame)
    columns : mapping of group name -> collection of column names

    Returns
    -------
    list
        Names from ``df.columns``, in their original order, that are not
        contained in any of the groups.
    """
    remaining = list(df.columns)
    for key in columns:
        excluded = columns[key]
        # Drop everything claimed by this group before looking at the next one.
        remaining = [name for name in remaining if name not in excluded]
    return remaining

In [10]:
# Column groups, keyed by the kind of handling each group receives further down.
columns = {
    'floats': ['O', 'C', 'E', 'A', 'N'],
    'data': ['user_id', 'survey_id', 'person', 'personality_score', 'diagnosis', 'main_disease', 'Id'],
    'lists': ['s2.q6', 's3.q16', 'S9.q9', 's3.q10_1', 's3.q6_1'],
    'ailment_maps': ['ailment_maps.face_profile', 'ailment_maps.body_back', 'ailment_maps.body_front', 'ailment_maps.face_front'],
    'Diagnosis': ['Diagnosis'],
    'dates': ['s2.q4_1_1', 's2.q4_2_1'],
}
# Every column of `df` that is not named in a group above is a "simple" column.
columns['simple_columns'] = getSimpleColumns(df, columns)

In [30]:
def convertDFWithStringOfListToDFWithList(df):
    """Replace each column of ``df`` with the parsed form of its cells.

    Every column is passed, as a plain list of its cells, through
    ``getListFromStringOfList`` and the result is assigned back to the same
    column. The frame is modified in place and also returned.
    """
    for name in df.columns:
        raw_cells = list(df[name])
        df[name] = getListFromStringOfList(raw_cells)
    return df

In [31]:
def getListFromStringOfList(array):
    """Parse each string in ``array`` as a Python literal.

    Parameters
    ----------
    array : iterable of str
        Strings holding Python literals, e.g. ``"[1, 2]"``.

    Returns
    -------
    list
        One parsed value per input string, in input order.
    """
    return [ast.literal_eval(text) for text in array]

In [32]:
# List-valued columns: missing cells become the literal '[]' so every cell parses.
df_list = convertDFWithStringOfListToDFWithList(
    df[columns['lists']].fillna('[]')
)

# Simple columns: missing cells are replaced by the sentinel -1.
df_simple = df[columns['simple_columns']].fillna(-1)

In [29]:
# Overwrite row 65 of 's3.q6' with '0' before converting the column to numbers
# (presumably the stored value in that row is not parseable — TODO confirm).
# A single .loc[row, column] assignment is used because the chained form
# `df_simple['s3.q6'].loc[65] = ...` may write to a temporary Series and leave
# `df_simple` unchanged.
df_simple.loc[65, 's3.q6'] = '0'
df_simple['s3.q6'] = pd.to_numeric(df_simple['s3.q6'])

In [13]:
def oneHotDfWithListInColumns(df):
    """Replace every column of ``df`` with multi-label indicator columns.

    Each column is removed and fitted with a ``MultiLabelBinarizer``; the
    resulting indicator matrix is joined back, with one column per class found
    by the binarizer (named after the class itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are iterables of labels. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``df``, holding only the indicator columns.
    """
    mlb = MultiLabelBinarizer()
    # DataFrame.pop removes the column in place, so operate on a copy to avoid
    # stripping a column from the caller's frame as a side effect.
    encoded = df.copy()

    for column in df.columns:
        labels = encoded.pop(column)
        indicators = pd.DataFrame(
            mlb.fit_transform(labels),
            columns=mlb.classes_,
            index=encoded.index)
        encoded = encoded.join(indicators)

    return encoded

In [14]:
# Rebinds `df_list` to its one-hot encoded form; this cell consumes its own
# output name, so it is meant to be run once per kernel session.
df_list = oneHotDfWithListInColumns(df_list)

In [15]:
# Ailment-map columns, with missing cells replaced by the literal '[]'.
df_maps = df[columns['ailment_maps']].fillna('[]')

In [16]:
def listOfValueWithoutScale(stringOfDict):
    """Build one dotted label per dict found in ``stringOfDict``.

    ``stringOfDict`` is parsed as a Python literal holding a list of dicts.
    For each dict, the values of every key except ``'scale'`` are joined with
    ``'.'`` in the dict's own key order.

    Returns
    -------
    list of str
        One label per dict, in list order.
    """
    labels = []
    for feeling in ast.literal_eval(stringOfDict):
        parts = [value for key, value in feeling.items() if key != 'scale']
        labels.append('.'.join(parts))
    return labels

In [17]:
def getListFromStringOfDict(array):
    """Apply ``listOfValueWithoutScale`` to each string in ``array``.

    Returns a list with one list of labels per input string, in input order.
    """
    return [listOfValueWithoutScale(cell) for cell in array]

In [18]:
def dictOfValueWithScale(stringOfDict, dictToExpand):
    """Record the scale of every dict in ``stringOfDict`` under its dotted label.

    ``stringOfDict`` is parsed as a Python literal holding a list of dicts.
    For each dict that has a ``'scale'`` key, the label is the ``'.'``-joined
    values of all other keys (the same label ``listOfValueWithoutScale``
    builds) and the stored value is ``str(scale)``.

    The label is built from all non-scale keys regardless of where ``'scale'``
    sits in the dict, so it never depends on key order.

    Parameters
    ----------
    stringOfDict : str
        Literal such as ``"[{'part': 'head', 'scale': 'big'}]"``.
    dictToExpand : dict
        Updated in place with ``label -> str(scale)`` entries.

    Returns
    -------
    dict
        The same ``dictToExpand`` object.
    """
    for feeling in ast.literal_eval(stringOfDict):
        label = '.'.join(
            value for key, value in feeling.items() if key != 'scale'
        )
        # Dicts without a 'scale' entry contribute nothing.
        if 'scale' in feeling:
            dictToExpand[label] = str(feeling['scale'])
    return dictToExpand

In [19]:
def getDictFromStringOfDictWithScale(array):
    """Merge the label -> scale entries of every string in ``array``.

    Each string is fed to ``dictOfValueWithScale`` together with the running
    dict, so a label seen again later overwrites the earlier entry.
    """
    merged = {}
    for cell in array:
        merged = dictOfValueWithScale(cell, merged)
    return merged

In [20]:
def transformToListWithoutScale(df):
    """Replace each column of ``df`` with lists of scale-free labels.

    Every column is passed, as a plain list of its cells, through
    ``getListFromStringOfDict`` and the result is assigned back to the same
    column. The frame is modified in place and also returned.
    """
    for name in df.columns:
        raw_cells = list(df[name])
        df[name] = getListFromStringOfDict(raw_cells)
    return df

In [21]:
# A copy is passed because transformToListWithoutScale assigns into the frame
# it receives; `df_maps` itself keeps the raw strings for the scale pass below.
df_maps_without_scale = transformToListWithoutScale(df_maps.copy())

In [22]:
import numpy as np

# Every distinct label that occurs in any ailment-map column, listed once and
# in first-seen order. De-duplicating across all columns (not per column)
# keeps the later DataFrame free of duplicate column names, and avoiding `set`
# keeps the column order identical from run to run.
unique_values = list(dict.fromkeys(
    value
    for column in df_maps_without_scale.columns
    for listOfValue in df_maps_without_scale[column]
    for value in listOfValue
))

In [23]:
# Zero-filled frame: one row per row of `df`, one column per ailment label.
df_ailment_maps = pd.DataFrame(
    np.zeros(shape=(len(df), len(unique_values))),
    columns=unique_values,
    index=df.index,
)

In [24]:
# Numeric code for each scale string. Keys are the str() form of the scale
# values (dictOfValueWithScale stores str(value)), hence the 'None' entry.
translate_values = {
    'small': 1,
    'medium': 2,
    'big': 3,
    # NOTE(review): a missing scale maps to the same code as 'small' — confirm intended.
    'None': 1
}

In [25]:
# Fill the ailment matrix: for every row, merge the label -> scale entries of
# all ailment-map cells and store the numeric code of each scale.
for idx, cells in df_maps.iterrows():
    dictOfFillings = getDictFromStringOfDictWithScale(cells)
    for key, value in dictOfFillings.items():
        # A single .loc[row, column] assignment writes into the frame itself;
        # the chained form `.loc[idx][key] = ...` assigns into an intermediate
        # Series and is not guaranteed to reach `df_ailment_maps`.
        df_ailment_maps.loc[idx, key] = translate_values[value]

In [26]:
# Put the feature blocks side by side; rows are aligned on the index.
frames = [
    df_simple,
    df_list,
    df_ailment_maps,
    df[columns['floats']],
]
df_finally = pd.concat(frames, axis='columns')

In [27]:
# Cast every feature column to 64-bit floats.
df_finally = df_finally.astype('float64')

In [28]:
# Append the Diagnosis column after the float cast, so it keeps its own dtype.
frames2 = [
    df_finally,
    df[columns['Diagnosis']],
]
df_finally = pd.concat(frames2, axis='columns')

In [33]:
# Write the assembled table to disk, including the index as the first column.
df_finally.to_csv('df_finally.csv', index=True)  