In [74]:
import pandas as P
import numpy as N
import scipy.stats as S
import matplotlib.pyplot as MP
from itertools import combinations

In [75]:
# Loading the data: 'habits.data' is read as a semicolon-separated text file
df = P.read_csv('habits.data', sep=';')
# Bare expression so the notebook renders the frame as this cell's output
df

Unnamed: 0,kohde,jasen,pvknro,sp,ASALUE,IKAL1,A1,A2,A3,A4,A5
0,50002,1,1,1,1.0,49,0,560,0,80,1.0
1,50002,1,2,1,1.0,49,380,450,10,0,1.0
2,50003,1,1,2,2.0,41,0,470,30,100,1.0
3,50003,1,2,2,2.0,41,0,550,0,0,1.0
4,50004,2,1,1,1.0,62,640,410,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
740,51980,1,2,2,2.0,50,460,450,31,0,2.0
741,51981,2,1,1,1.0,35,0,470,0,140,?
742,51981,2,2,1,1.0,35,0,730,?,0,?
743,51983,1,1,2,3.0,66,560,375,20,0,1.0


In [76]:
# Cleaning the data
# '?' appears in the raw file where a value is missing; turn it into a real NA
# so that fillna() can see it.
df.replace('?', P.NA, inplace=True)

# Impute the missing entries column by column: 0 for A1-A4 and 2.0 for A5.
for col, fill_value in {'A1': 0, 'A2': 0, 'A3': 0, 'A4': 0, 'A5': 2.0}.items():
    df[col] = df[col].fillna(fill_value)

# Handling "HH:MM" formatted data
def to_min(x):
    """Convert an "HH:MM" string to a number of minutes.

    Parameters
    ----------
    x : object
        A single cell value. Only ``str`` values that contain ':' are
        converted; anything else is passed through untouched.

    Returns
    -------
    object
        ``int(hours) * 60 + int(minutes)`` for a parsable "HH:MM" string,
        ``0`` for a string containing ':' that cannot be parsed, and ``x``
        itself for every other input.
    """
    if not (isinstance(x, str) and ':' in x):
        return x
    try:
        h, m = x.split(':')
        return int(h) * 60 + int(m)
    except ValueError:
        # Malformed text: wrong number of ':'-separated parts or non-integer
        # pieces. Best-effort fallback to 0; any other exception type is a
        # real error and is allowed to propagate instead of being hidden.
        return 0
        
# Columns that may contain "HH:MM" strings
minute_cols = ['A1', 'A2', 'A3', 'A4']

# First turn any "HH:MM" strings into minutes ...
for col in minute_cols:
    df[col] = df[col].apply(to_min)

# ... then coerce every activity column (A5 included) to a numeric dtype
for col in [*minute_cols, 'A5']:
    df[col] = P.to_numeric(df[col])

# Give the activity columns descriptive names
activity_names = {
    'A1': 'Working',
    'A2': 'Sleeping',
    'A3': 'Reading',
    'A4': 'Dining at restaurant',
    'A5': 'Visiting library',
}
df.rename(columns=activity_names, inplace=True)

# Preview of the cleaned frame
df.head(59)

Unnamed: 0,kohde,jasen,pvknro,sp,ASALUE,IKAL1,Working,Sleeping,Reading,Dining at restaurant,Visiting library
0,50002,1,1,1,1.0,49,0,560,0,80,1.0
1,50002,1,2,1,1.0,49,380,450,10,0,1.0
2,50003,1,1,2,2.0,41,0,470,30,100,1.0
3,50003,1,2,2,2.0,41,0,550,0,0,1.0
4,50004,2,1,1,1.0,62,640,410,0,0,1.0
5,50004,2,2,1,1.0,62,0,550,72,108,1.0
6,50005,1,1,2,1.0,46,0,540,40,0,2.0
7,50005,1,2,2,1.0,46,0,550,52,108,2.0
8,50006,1,1,2,2.0,33,0,540,0,90,1.0
9,50006,1,2,2,2.0,33,0,530,62,0,1.0


In [77]:
# Sanity check after cleaning: print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745 entries, 0 to 744
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   kohde                 745 non-null    int64  
 1   jasen                 745 non-null    int64  
 2   pvknro                745 non-null    int64  
 3   sp                    745 non-null    int64  
 4   ASALUE                745 non-null    float64
 5   IKAL1                 745 non-null    int64  
 6   Working               745 non-null    int64  
 7   Sleeping              745 non-null    int64  
 8   Reading               745 non-null    int64  
 9   Dining at restaurant  745 non-null    int64  
 10  Visiting library      745 non-null    float64
dtypes: float64(2), int64(9)
memory usage: 64.1 KB


In [78]:
# Checking normality of activities: print the Shapiro-Wilk p-value of each one
for activity in ('Working', 'Sleeping', 'Reading', 'Dining at restaurant'):
    print(S.shapiro(df[activity]).pvalue, end="\n\n")

1.9062798325097924e-37

1.3646572544652781e-17

9.740293030594202e-35

1.2684985464447803e-31



In [79]:
# Every Shapiro-Wilk p-value is below 0.05, so none of the activities is
# normally distributed; rank-based (non-parametric) tests are used instead.
# NOTE(review): the data preview shows the same kohde/jasen pair on two pvknro
# rows, so rows may not be independent observations - confirm this is
# acceptable for the tests below.
continuous = ['Working', 'Sleeping', 'Reading', 'Dining at restaurant']

# Pairwise Spearman rank correlation between the continuous activity columns
print("=== Spearman correlations (minutes vs minutes) ===\n")
for a, b in combinations(continuous, 2):
    r, p = S.spearmanr(df[a], df[b], nan_policy='omit')
    print(f"{a} ↔ {b}: Spearman r = {r:.3f}, p = {p:.4f}")

# Two-sided Mann-Whitney U test of each activity between the rows coded 1 and
# the rows coded 2 in 'Visiting library'; the masks do not change per activity.
coded_one = df['Visiting library'] == 1
coded_two = df['Visiting library'] == 2
for a in continuous:
    yes = df.loc[coded_one, a].dropna()
    no = df.loc[coded_two, a].dropna()
    p = S.mannwhitneyu(yes, no, alternative='two-sided').pvalue
    print(f"{a} ↔ Visiting library: p = {p:.4f}")

=== Spearman correlations (minutes vs minutes) ===

Working ↔ Sleeping: Spearman r = -0.385, p = 0.0000
Working ↔ Reading: Spearman r = -0.182, p = 0.0000
Working ↔ Dining at restaurant: Spearman r = -0.020, p = 0.5903
Sleeping ↔ Reading: Spearman r = 0.008, p = 0.8300
Sleeping ↔ Dining at restaurant: Spearman r = 0.023, p = 0.5375
Reading ↔ Dining at restaurant: Spearman r = -0.049, p = 0.1817
Working ↔ Visiting library: p = 0.4243
Sleeping ↔ Visiting library: p = 0.6322
Reading ↔ Visiting library: p = 0.4736
Dining at restaurant ↔ Visiting library: p = 0.4721
