# View all available datasets

In [1]:
import os

print("List of all available datasets:")
for entry in os.scandir('./data'):
    if entry.is_file():
        print("-- " + entry.name)

List of all available datasets:
-- demographic.csv
-- cleaned_hm.csv
-- mergedData.csv


---
# Import necessary libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_colwidth = 1000

---
## Demographic dataset explorations

In [4]:
data_demo = pd.read_csv("data/demographic.csv")

In [5]:
data_demo.shape

(10844, 6)

In [6]:
data_demo.columns

Index(['wid', 'age', 'country', 'gender', 'marital', 'parenthood'], dtype='object')

In [7]:
data_demo.sample(5)

Unnamed: 0,wid,age,country,gender,marital,parenthood
5044,5569,44.0,USA,f,separated,n
389,390,31.0,USA,m,single,n
6350,7409,41.0,USA,f,single,n
9572,12008,35.0,USA,f,married,y
3869,4087,30.0,USA,m,single,n


In [8]:
print("Number of unique countries:", data_demo['country'].unique().shape[0])
print("which are:", data_demo['country'].unique())

Number of unique countries: 101
which are: ['USA' 'IND' 'VNM' 'THA' 'GBR' 'SRB' 'BGR' 'BRA' 'BGD' 'CAN' 'PRT' 'PHL'
 'VEN' 'MKD' 'DOM' 'DNK' 'IDN' 'AUS' 'URY' 'JAM' 'EGY' 'DEU' 'KEN' 'MDA'
 'IRL' 'NGA' 'ALB' 'MEX' 'TTO' 'KWT' 'RUS' 'LTU' 'PRI' 'SWE' 'PER' 'PAK'
 'TUR' 'FRA' 'ARE' 'COL' 'GRC' 'ROU' 'ARM' 'MAC' 'SGP' 'UGA' 'POL' 'NIC'
 'JPN' 'ESP' 'TWN' 'FIN' 'ITA' 'GHA' 'AUT' 'NPL' 'NZL' 'NLD' 'KOR' 'LKA'
 'KNA' 'ZAF' 'BEL' 'ISL' 'DZA' 'AFG' 'GMB' nan 'ASM' 'MYS' 'CZE' 'MLT'
 'SVN' 'UMI' 'ARG' 'SAU' 'EST' 'ECU' 'ZMB' 'CHL' 'MAR' 'ISR' 'LVA' 'KAZ'
 'NOR' 'CRI' 'BRB' 'HRV' 'ETH' 'TCA' 'BHS' 'VIR' 'GTM' 'IRQ' 'HKG' 'MUS'
 'TUN' 'SUR' 'SLV' 'CYP' 'UKR']


In [9]:
print("Number of unique marital status:", data_demo['marital'].unique().shape[0])
print("which are:", data_demo['marital'].unique())

Number of unique marital status: 6
which are: ['married' 'single' 'divorced' 'separated' 'widowed' nan]


---
## Cleaned dataset explorations

In [10]:
data_happy = pd.read_csv("data/cleaned_hm.csv")

In [11]:
data_happy.shape

(100535, 9)

In [12]:
data_happy.columns

Index(['hmid', 'wid', 'reflection_period', 'original_hm', 'cleaned_hm',
       'modified', 'num_sentence', 'ground_truth_category',
       'predicted_category'],
      dtype='object')

In [13]:
data_happy.sample(5)

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
74501,102585,4352,3m,"At work today, my boss told me I was doing a great job.","At work today, my boss told me I was doing a great job.",True,1,,achievement
32459,60310,11323,3m,My niece was born on March 2nd.,My niece was born on March 2nd.,True,1,,affection
83093,111217,996,24h,I got a positive response to an email.,I got a positive response to an email.,True,1,,achievement
3371,31066,39,24h,Last month i went to meet my distant relation and i had not met him for many years and i was very happy to see him and he greeted me very well and i shared with him how i cared for him and i had lunch in his house and returned evening to my home.,Last month i went to meet my distant relation and i had not met him for many years and i was very happy to see him and he greeted me very well and i shared with him how i cared for him and i had lunch in his house and returned evening to my home.,True,1,,affection
16878,44649,209,24h,I was booking a flight and just before I finalized the reservation I received an email from the airline about a special deal that ended up saving me over $200.,I was booking a flight and just before I finalized the reservation I received an email from the airline about a special deal that ended up saving me over $200.,True,1,achievement,achievement


In [14]:
print("Number of unique reflection periods:", data_happy['reflection_period'].unique().shape[0])
print("which are:", data_happy['reflection_period'].unique())

Number of unique reflection periods: 2
which are: ['24h' '3m']


In [15]:
print("Number of unique types of number of sentences:", data_happy['num_sentence'].unique().shape[0])
print("which are:", data_happy['num_sentence'].unique())

Number of unique types of number of sentences: 47
which are: [ 1  2  3  9  6  4 53  5 12 16  7 13 11 28  8 10 14 17 25 27 37 34 21 23
 19 26 24 22 31 18 58 32 15 29 35 56 51 20 46 42 30 60 69 40 48 44 45]


In [16]:
print("Number of unique ground truth categories:", data_happy['ground_truth_category'].unique().shape[0])
print("which are:", data_happy['ground_truth_category'].unique())

Number of unique ground truth categories: 8
which are: [nan 'bonding' 'leisure' 'affection' 'enjoy_the_moment' 'achievement'
 'nature' 'exercise']


In [17]:
print("Number of unique predicted categories:", data_happy['predicted_category'].unique().shape[0])
print("which are:", data_happy['predicted_category'].unique())

Number of unique predicted categories: 7
which are: ['affection' 'exercise' 'bonding' 'leisure' 'achievement'
 'enjoy_the_moment' 'nature']
