# Seminar session Do. 28.11.2024
##### Andreas Wagner



# Structure:

1. Parse data from a .json file into a .csv file
2. Data structure
3. Plotting basics


In [4]:
import datetime
import json
import os

import pandas as pd
import plotly as py
import plotly.express as px

## 1. Parsing data from .json into .csv file
### Basic pandas parsing

Run the following cell and examine what went wrong.

**Hint:** `NaN` stands for "Not a Number" and indicates missing data.

In [5]:
# Load data from wue02.json. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default, which is not
# UTF-8 on every system and could garble non-ASCII text.
with open('wue02.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Naive conversion: the outermost keys (person IDs) become the columns, the
# second-level keys (observation IDs) become the index, and every cell holds a
# whole nested dict -- or NaN where a person has no observation with that ID.
df = pd.DataFrame(data)

# Plain-text print on purpose: it makes the problem described above visible.
print(df)

                                                             P07  P09  P11  \
1718266733014  {'location': {'locationID': '1718266928666', '...  NaN  NaN   
1718267618450  {'location': {'locationID': '1718267853810', '...  NaN  NaN   
1718268467154  {'tags': [{'tag': 'Ereignisreich', 'tagType': ...  NaN  NaN   
1718269293446  {'location': {'locationID': '1718269488413', '...  NaN  NaN   
1718270040934  {'tags': [{'tag': 'Ereignisreich', 'tagType': ...  NaN  NaN   
...                                                          ...  ...  ...   
1718273026070                                                NaN  NaN  NaN   
1718275564692                                                NaN  NaN  NaN   
1718277122119                                                NaN  NaN  NaN   
1718277918503                                                NaN  NaN  NaN   
1718278344068                                                NaN  NaN  NaN   

               P19  p01  p02  p03  p04  p05  p06  p13  p14  p15

### Better way to work with this json data?

Take a moment to think before looking at the next code cell:

- Which data types do you know that could represent this nested data?

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

In [7]:
# Hand-built miniature of the survey JSON: a person ID on the outside and,
# below it, a plain value, a nested dictionary and a list.
json_dict = {
    'P01': {
        'key01': 'value01',
        'key02': {'subKey0201': 'subValue0201'},
        'key03': [1, 2, 3],
    },
}

# Nested values are reached by chaining one key lookup per level.
print(json_dict['P01']['key02']['subKey0201'])

subValue0201


### JSON and Dictionaries

- We begin again with loading the json file into a dictionary

In [24]:
# Read the survey export into plain Python objects (nested dicts and lists).
# The encoding is given explicitly: JSON is UTF-8 by specification, whereas
# open() would otherwise fall back to a platform-dependent default.
with open("wue02.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Top level: person ID -> observation ID -> observation details.
data

{'P07': {'1718266733014': {'location': {'locationID': '1718266928666',
    'lat': '49.800366',
    'lon': '9.9323516'},
   'tags': [{'tag': 'Eventful',
     'tagType': 'number',
     'tagLabel2': 'Uneventful',
     'tagMin': ' 1',
     'tagMax': ' 5',
     'tagStep': ' 1',
     'tagValue': ' 1.0'},
    {'tag': 'Exciting',
     'tagType': 'number',
     'tagLabel2': 'Monotonous',
     'tagMin': ' 1',
     'tagMax': ' 5',
     'tagStep': ' 1',
     'tagValue': ' 5.0'},
    {'tag': 'Pleasant',
     'tagType': 'number',
     'tagLabel2': 'Unpleasant',
     'tagMin': ' 1',
     'tagMax': ' 5',
     'tagStep': ' 1',
     'tagValue': ' 5.0'},
    {'tag': 'Calm',
     'tagType': 'number',
     'tagLabel2': 'Chaotic',
     'tagMin': ' 1',
     'tagMax': ' 5',
     'tagStep': ' 1',
     'tagValue': ' 5.0'},
    {'tag': 'Anxious',
     'tagType': 'number',
     'tagLabel2': 'Safe',
     'tagMin': ' 1',
     'tagMax': ' 5',
     'tagStep': ' 1',
     'tagValue': ' 1.0'},
    {'tag': 'Stressed',
  

### Parse into Dataframe

- For easy use, we have to parse the data into a dataframe.
- Each row describes one media file, and the dataframe has the following structure:

[id, observationID, personID, lat, lon, media, plus one column per tag named tagLabel1_tagLabel2 that holds the tagValue]

- some additional columns for advanced use will be added


In [77]:
# Read the questionnaire configuration, which carries every label pair in
# German ('de') and English ('en'). UTF-8 is requested explicitly so that the
# German labels decode the same way on every platform.
with open("config_questionaire.json", "r", encoding="utf-8") as config_json:
    config = json.load(config_json)

# (German label1, German label2) -> questionnaire item. Built once instead of
# scanning the whole questionnaire again for every tag of every media file.
# Later duplicates overwrite earlier ones, i.e. the last matching item wins.
items_by_german_labels = {
    (item['label1']['de'], item['label2']['de']): item
    for item in config['questionaire']
}

base_columns = ['id', 'observationID', 'personID', 'lat', 'lon', 'media']
tag_columns = []  # tag columns in order of first appearance
records = []      # one dict per media file

for person, observations in data.items():
    for observation, content in observations.items():
        media_items = content['media']
        if not media_items:
            # No media -> no rows: such an observation contributes nothing.
            continue

        # The ratings belong to the observation, so they are collected once
        # and then copied onto each of its media rows.
        ratings = {}
        for tag in content['tags']:
            # The column is named "<tag>_<tagLabel2>"; German pairs known to
            # the questionnaire are replaced by their English counterparts.
            tag_column = f'{tag["tag"]}_{tag["tagLabel2"]}'
            item = items_by_german_labels.get((tag['tag'], tag['tagLabel2']))
            if item is not None:
                tag_column = f'{item["label1"]["en"]}_{item["label2"]["en"]}'
            if tag_column not in tag_columns:
                tag_columns.append(tag_column)
            # tagValue arrives as text such as ' 5.0'.
            ratings[tag_column] = int(float(tag['tagValue']))

        location = content['location']
        for media in media_items:
            records.append({
                'id': len(records),  # running row number, starting at 0
                'observationID': observation,
                'personID': person,
                'lat': location['lat'],  # kept exactly as stored in the JSON
                'lon': location['lon'],
                'media': media['mediaID'],
                **ratings,
            })

# Building the frame in one go avoids growing it cell by cell via df.loc, which
# gets slower with every added row. A tag that is missing for an observation
# shows up as NaN in the corresponding column.
df = pd.DataFrame(records, columns=base_columns + tag_columns)
df

Unnamed: 0,id,observationID,personID,lat,lon,media,Eventful_Uneventful,Exciting_Monotonous,Pleasant_Unpleasant,Calm_Chaotic,...,Depressed_Cheerful,Detached_Attached,Exhausted_Excited,uncrowded _crowded,accessible_unaccessible,central_remote,green _built-up,clean_polluted,quiet_busy,relaxing_disturbing
0,0,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266733009.aac,1,5,5,5,...,2,1,1,2,5,3,2,5,4,5
1,1,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266776456.aac,1,5,5,5,...,2,1,1,2,5,3,2,5,4,5
2,2,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266884668.aac,1,5,5,5,...,2,1,1,2,5,3,2,5,4,5
3,3,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266905336.jpg,1,5,5,5,...,2,1,1,2,5,3,2,5,4,5
4,4,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266913377.jpg,1,5,5,5,...,2,1,1,2,5,3,2,5,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,456,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278462140.aac,2,2,2,4,...,4,4,4,3,3,3,5,4,4,4
457,457,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278551889.aac,2,2,2,4,...,4,4,4,3,3,3,5,4,4,4
458,458,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278570887.jpg,2,2,2,4,...,4,4,4,3,3,3,5,4,4,4
459,459,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278582950.jpg,2,2,2,4,...,4,4,4,3,3,3,5,4,4,4


### Adding important information we can retrieve from the ID and the media path

- timestamp from observationID
- type from file ending of mediaPath

In [78]:
# Classify each media file by its extension: '.jpg' files are images,
# everything else is treated as audio.
df['mediaType'] = [
    "Image" if media_path.endswith('.jpg') else "Audio"
    for media_path in df['media']
]

In [79]:
# observationID is a Unix timestamp in milliseconds. It is converted via UTC
# into a fixed time zone, so the result no longer depends on the time zone of
# the machine that runs the notebook (which datetime.fromtimestamp() uses).
# NOTE(review): the recorded outputs correspond to UTC+2 (Central European
# Summer Time), hence 'Europe/Berlin' -- adjust if the data was collected
# in a different time zone.
df['timestamp'] = (
    pd.to_datetime(df['observationID'].astype('int64'), unit='ms', utc=True)
    .dt.tz_convert('Europe/Berlin')
    .dt.tz_localize(None)  # keep naive wall-clock datetimes
)

In [80]:
# Checkpoint: persist the parsed table. The row index is left out because the
# 'id' column already numbers the rows.
df.to_csv(path_or_buf='data.csv', index=False)

## Adding transcriptions to the audio files (If provided)

In [81]:
TRANSCRIPTION_DIR = "transcriptionresults_p"

# Read every transcript once: file stem (text before the first dot) -> content.
# The directory is optional ("if provided"), so its absence simply leaves the
# mapping empty instead of aborting the notebook with a FileNotFoundError.
transcriptions = {}
if os.path.isdir(TRANSCRIPTION_DIR):
    for element in os.listdir(TRANSCRIPTION_DIR):
        if element.endswith('.txt'):
            fileName = element.split('.')[0]
            # NOTE(review): the transcripts are assumed to be UTF-8 encoded;
            # being explicit keeps umlauts intact regardless of the platform
            # default encoding.
            with open(os.path.join(TRANSCRIPTION_DIR, element), 'r', encoding='utf-8') as file:
                transcriptions[fileName] = file.read()

# A media file and its transcript are matched by their stem: the last path
# segment of 'media' up to its first dot must equal the transcript's stem.
# One dictionary lookup per row replaces a full scan of the frame for every
# transcript file. Rows without a transcript keep None.
df['transcription'] = [
    transcriptions.get(media_path.split('/')[-1].split('.')[0])
    for media_path in df['media']
]
df


Unnamed: 0,id,observationID,personID,lat,lon,media,Eventful_Uneventful,Exciting_Monotonous,Pleasant_Unpleasant,Calm_Chaotic,...,uncrowded _crowded,accessible_unaccessible,central_remote,green _built-up,clean_polluted,quiet_busy,relaxing_disturbing,mediaType,timestamp,transcription
0,0,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266733009.aac,1,5,5,5,...,2,5,3,2,5,4,5,Audio,2024-06-13 10:18:53.014,"Ja gut, okay, nochmal neu. Also ich fühle mic..."
1,1,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266776456.aac,1,5,5,5,...,2,5,3,2,5,4,5,Audio,2024-06-13 10:18:53.014,"Ja gut, okay, nochmal neu. Also ich fühle mic..."
2,2,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266884668.aac,1,5,5,5,...,2,5,3,2,5,4,5,Audio,2024-06-13 10:18:53.014,"Ja gut, okay, nochmal neu. Also ich fühle mic..."
3,3,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266905336.jpg,1,5,5,5,...,2,5,3,2,5,4,5,Image,2024-06-13 10:18:53.014,
4,4,1718266733014,P07,49.800366,9.9323516,P07/1718266733014/media/1718266913377.jpg,1,5,5,5,...,2,5,3,2,5,4,5,Image,2024-06-13 10:18:53.014,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,456,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278462140.aac,2,2,2,4,...,3,3,3,5,4,4,4,Audio,2024-06-13 13:32:24.068,"Okay, I am currently walking around the Tross..."
457,457,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278551889.aac,2,2,2,4,...,3,3,3,5,4,4,4,Audio,2024-06-13 13:32:24.068,
458,458,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278570887.jpg,2,2,2,4,...,3,3,3,5,4,4,4,Image,2024-06-13 13:32:24.068,
459,459,1718278344068,p18,49.796784,9.9355711,p18/1718278344068/media/1718278582950.jpg,2,2,2,4,...,3,3,3,5,4,4,4,Image,2024-06-13 13:32:24.068,


In [87]:
# Write data.csv again with the dataframe in its current state, without the
# row index.
df.to_csv(path_or_buf='data.csv', index=False)

## Basic Plots of the dataframe

### Single points: one tag value on a scale from 1 to 5 (basic Likert plot)

In [88]:
# The dataframe holds one row per media file, so an observation with several
# media files would otherwise be drawn several times on top of itself.
# Keeping one row per person/observation yields exactly one marker each.
observations = df.drop_duplicates(subset=['personID', 'observationID'])

# Title and readable axis/legend labels let the figure stand on its own.
fig = px.scatter(
    observations,
    x='Eventful_Uneventful',
    y='personID',
    color='observationID',
    title='Eventful / Uneventful rating per participant',
    labels={
        'Eventful_Uneventful': 'Eventful / Uneventful rating (1-5)',
        'personID': 'Participant',
        'observationID': 'Observation',
    },
)

fig.show()