In [147]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import datetime as dt

from astral import LocationInfo
from astral.sun import sun


%matplotlib inline


In [148]:
# Define a working directory.
# The location can be overridden through the MIRROR_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original path is kept as the default.
os.chdir(os.environ.get('MIRROR_DATA_DIR', '/Users/relic/Documents/mirror-data'))

In [149]:
# Import the dataset.
# skiprows=2 discards the first two lines of the file; the next line is consumed by
# read_csv as the header row and its labels are replaced with new_header below.
df = pd.read_csv('gray_tracking_deidentified.csv', skiprows=2)
new_header = ['Date', 'DayOfWeek', 'PositiveRetro', 'NegativeRetro', 'DailyGoals', 'Journal', 
              'Alcohol', 'WhyDrink', 'BreakfastHealth', 'WalkRunDistance', 'LunchHealth', 'RoomClean', 
              'HoursSocial', 'HoursArt', 'DinnerHealth', 'SleepTime', 'WakeTime', 'HoursSleep', 
              'WhereSleep', 'PartnerContact',  'MinutesMusic', 'MorningQuality', 'FocusLevel', 
              'AnxietyLevel', 'HealthLevel', 'Motivationlevel', 'HappyLevel', 'Evening', 'Drugs', 
              'Weight', 'HoursTVGames', 'FinishBook', 'Illness', 'WorkStartTime', 'WorkEndTime', 
              'SecondCoffeeTime', 'CovidPoints', 'CovidPointsActivity']
df.columns = new_header

# Drop a few columns with sensitive data. Data previously removed manually so these columns should be empty.
df = df.drop(['PositiveRetro', 'NegativeRetro', 'DailyGoals', 'Journal', 'WhyDrink', 'Evening', 'CovidPointsActivity'], axis=1)
pd.set_option('display.max_columns', None)

# Fill the NaNs in some of the numeric columns with 0.
# Assigning the result back (instead of calling fillna(inplace=True) on df[column]) avoids
# chained in-place modification, which pandas warns about and which has no effect under
# copy-on-write.
zero_columns = ['CovidPoints', 'Illness']
df[zero_columns] = df[zero_columns].fillna(0)

# Blank / whitespace-only Weight cells become NaN; a '?' in CovidPoints is treated as 0.
df['Weight'] = df['Weight'].replace(r'^\s*$', np.nan, regex=True)
df['CovidPoints'] = df['CovidPoints'].replace('?', 0)

# NaNs in the remaining numeric columns are imputed later, after the train/test split.

# Keep only the first N_TRACKED_ROWS rows.
# NOTE(review): presumably rows past this point are incomplete -- confirm why 861.
# .copy() makes df an independent frame rather than a slice of the frame read from disk,
# so the column assignments in later cells do not trigger SettingWithCopyWarning.
N_TRACKED_ROWS = 861
df = df[:N_TRACKED_ROWS].copy()

# df_clean is a second name for the SAME DataFrame object (no copy is made), so columns
# added to df in place by later cells are visible through df_clean as well.
df_clean = df

In [150]:
# Parse the Date strings into pandas datetime values, writing the result back into the
# same column of df.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [151]:
# Incorporate data regarding the daily hours of daylight at my latitude for this date. 

# Berkeley is at: (37.8° N, 122.27° W) and is in the "America/Los_Angeles" timezone

# For a list of all the possible time zones run the following

# import pytz
# pytz.all_timezones

# We're going to use sunrise and sunset for this analysis since what we really care about is how many hours of 
# meaningful light in the sky for activity and circadian rhythm purposes. 

# Location record used for all sunrise/sunset lookups in this notebook.
city = LocationInfo(
    name="Berkeley",
    region="USA",
    timezone="America/Los_Angeles",
    latitude=37.8,
    longitude=-122.27,
)

# Echo the configured location so the reader can verify it at a glance.
print(
    f"Information for {city.name}, {city.region}\n",
    f"Timezone: {city.timezone}\n",
    f"Latitude: {city.latitude:.02f}; Longitude: {city.longitude:.02f}\n",
    sep="",
)

# NOTE(review): `date` is not referenced by any other visible cell -- confirm it is still needed.
date = dt.date(2021, 3, 22)

def Daylight_Calculator(city, in_date, tz):
    """Return the number of seconds between sunrise and sunset on one date.

    Parameters
    ----------
    city
        Forwarded unchanged as the first positional argument of astral's
        ``sun``; the caller in this notebook passes ``city.observer``.
    in_date
        Forwarded to ``sun`` as ``date=``.
    tz
        Forwarded to ``sun`` as ``tzinfo=``.

    Returns
    -------
    int
        ``sunset - sunrise`` in seconds, where each instant is truncated to a
        whole epoch second before subtracting.
    """
    # sun() yields a mapping with (at least) 'dawn', 'sunrise', 'noon', 'sunset' and
    # 'dusk' entries; only sunrise and sunset are used here.
    sun_times = sun(city, date=in_date, tzinfo=tz)

    sunrise_epoch, sunset_epoch = (
        int(sun_times[event].timestamp()) for event in ('sunrise', 'sunset')
    )
    return sunset_epoch - sunrise_epoch

# Seconds of daylight for each tracked day, computed from that row's Date.
df['Daylight'] = df['Date'].apply(
    lambda day: Daylight_Calculator(city.observer, day, city.timezone)
)

Information for Berkeley, USA
Timezone: America/Los_Angeles
Latitude: 37.80; Longitude: -122.27



In [152]:
# Engineer a sleep column to have total sleep seconds 

def time_convert(x):
    """Convert a clock-style duration string into a total number of seconds.

    Accepted forms are ``'H'``, ``'H:MM'`` and ``'H:MM:SS'``; components that
    are not given count as zero.

    Parameters
    ----------
    x
        The cell value to convert. Normally a string; missing cells reach this
        function as ``None`` or float ``NaN``.

    Returns
    -------
    int
        Total seconds. ``0`` is returned for values that cannot represent a
        duration: anything that is not a string (including ``None``/``NaN``),
        an empty or whitespace-only string, and strings with more than two
        colons.

    Raises
    ------
    ValueError
        If a component of an otherwise well-formed string is not an integer.
    """
    # Missing cells are not strings and have no .count()/.split(); treat them like
    # the other unparseable inputs instead of crashing the whole column conversion.
    if not isinstance(x, str):
        return 0

    fields = x.strip().split(':')
    if len(fields) > 3 or fields == ['']:
        return 0

    # Pad with zeros so 'H' and 'H:MM' share the 'H:MM:SS' arithmetic.
    h, m, s = (list(map(int, fields)) + [0, 0])[:3]
    return (h * 60 + m) * 60 + s

# Total sleep in seconds, derived element-wise from the HoursSleep strings.
df['SecondsSleep'] = df['HoursSleep'].map(time_convert)

In [153]:
# Add a 0/1 indicator column derived from SecondCoffeeTime.
# A literal "0" entry is treated as "no time recorded" and normalised to NaN first;
# the flag is 1 wherever a value remains and 0 where it is missing.
df['SecondCoffeeTime'] = df['SecondCoffeeTime'].replace({"0": np.nan})
df['CaffeinCategorical'] = np.where(df['SecondCoffeeTime'].notna(), 1, 0)


In [154]:
# Add a 0/1 indicator column for whether a work start time was logged that day.
# A literal "0" entry is treated as "no time recorded" and normalised to NaN first;
# the flag is 1 wherever a value remains and 0 where it is missing.
df['WorkStartTime'] = df['WorkStartTime'].replace({"0": np.nan})
df['WorkCategorical'] = np.where(df['WorkStartTime'].notna(), 1, 0)


In [83]:
# Spot-check five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,Date,DayOfWeek,Alcohol,BreakfastHealth,WalkRunDistance,LunchHealth,RoomClean,HoursSocial,HoursArt,DinnerHealth,SleepTime,WakeTime,HoursSleep,WhereSleep,PartnerContact,MinutesMusic,MorningQuality,FocusLevel,AnxietyLevel,HealthLevel,Motivationlevel,HappyLevel,Drugs,Weight,HoursTVGames,FinishBook,Illness,WorkStartTime,WorkEndTime,SecondCoffeeTime,CovidPoints,SecondsSleep,CaffeinCategorical,WorkCategorical
482,2/10/20,Monday,1.0,6.0,0.0,7.0,8.0,12.0,1.0,4.0,2:16,7:20,6:40,Home,1.0,20.0,3.0,5.0,7.0,3.0,5.0,5.0,0.0,,2.3,,4.0,8:00,17:22,,0,24000,0,1
316,8/28/19,Wednesday,4.0,5.0,0.0,5.0,3.0,12.0,10.0,5.0,4:00,10:00,7:00,Traveling,0.0,0.0,5.0,7.0,5.0,5.0,7.0,6.0,0.0,,0.0,,0.0,,0,22:00,0,25200,1,0
567,5/5/20,Tuesday,3.0,8.0,1.0,5.0,8.0,6.0,0.5,7.0,0:00,8:20,7:20,Home,0.0,40.0,3.0,6.0,6.0,5.0,6.0,,0.0,,2.0,,0.0,8:30,17:40,,0,26400,0,1
46,12/1/18,Saturday,5.0,9.0,0.0,5.0,5.0,5.0,2.0,5.0,4:00,4:00,6:00:00,Home,0.0,360.0,,6.0,6.0,,,7.0,0.0,,2.0,,0.0,,0,,0,21600,0,0
220,5/24/19,Friday,0.0,5.0,0.0,5.0,6.0,10.0,0.0,4.0,1:30,8:10,6:10,Home,0.0,30.0,,8.0,7.0,,,7.0,0.0,161.8,3.0,,0.0,,1,,0,22200,0,0


# Regression

In [155]:
# Use linear regression to predict daily happiness.  Find days that are outliers and see if there was 
# anything particularly odd about those days. 

# Do the same for anxiety
# https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
# Seed and hold-out fraction passed to train_test_split further down.
random_state, test_size = 42, 0.2



In [156]:
# Point df back at the frame saved as df_clean before modelling.
# NOTE(review): df_clean was bound with `df_clean = df`, i.e. it names the same object
# rather than a snapshot, so it already carries the engineered columns (Daylight,
# SecondsSleep, CaffeinCategorical, WorkCategorical) -- confirm that is intended.
df = df_clean
df.columns

Index(['Date', 'DayOfWeek', 'Alcohol', 'BreakfastHealth', 'WalkRunDistance',
       'LunchHealth', 'RoomClean', 'HoursSocial', 'HoursArt', 'DinnerHealth',
       'SleepTime', 'WakeTime', 'HoursSleep', 'WhereSleep', 'PartnerContact',
       'MinutesMusic', 'MorningQuality', 'FocusLevel', 'AnxietyLevel',
       'HealthLevel', 'Motivationlevel', 'HappyLevel', 'Drugs', 'Weight',
       'HoursTVGames', 'FinishBook', 'Illness', 'WorkStartTime', 'WorkEndTime',
       'SecondCoffeeTime', 'CovidPoints', 'Daylight', 'SecondsSleep',
       'CaffeinCategorical', 'WorkCategorical'],
      dtype='object')

In [None]:
# Assumptions of Linear Regression

# 1. that the variables are independent from each other
# 2. that the variables are normally distributed
# 3. there is a linear relationship between target variable(s) and regressors (input/independent variables)

# The hypothesis is that: Happiness on a given day is predictable based on other aspects of that day, 
# particularly variables I can control such as my health choices and my activities.


In [None]:
# One-Hot Encode all the categorical variables before splitting
#
# Candidates listed for encoding; only WhereSleep is implemented so far (TODO for the rest):
#   - Day of week
#   - Hours of sleep
#   - WhereSleep
#   - PartnerContact
#   - Drugs
#   - SecondCoffeeTime

# WhereSleep: one indicator column per distinct value, named 'WhereSleep_<value>'.
df_sleep = pd.get_dummies(df.WhereSleep, prefix='WhereSleep')

# Append the indicator columns to the working frame. The raw WhereSleep column is left
# in place here; it is removed together with the other non-numeric columns in the
# train/test split cell.
df = pd.concat([df, df_sleep], axis=1)

In [None]:
# Test/Train Split

# Columns removed before modelling (dates, clock times and raw categorical fields).
excluded_columns = [
    'Date', 'DayOfWeek', 'SleepTime', 'WakeTime', 'HoursSleep', 'WhereSleep',
    'PartnerContact', 'Drugs', 'FinishBook', 'WorkStartTime', 'WorkEndTime',
    'SecondCoffeeTime',
]
df = df.drop(columns=excluded_columns)

# the target variables will be first happiness (HappyLevel) and then anxiety (AnxietyLevel)
target_column = 'HappyLevel'
X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# K-Fold Cross-Validation will replace test/train split



In [None]:
# Scale variables after test/train split
# Citation

# Only scale the input variables unless
# Citation

# Types of scaling
# Citation

# Scaling depends on normal distributions of input data...except it doesn't really.  
# How to tell if the input data was not normal? 



In [None]:
# Fill NaN values with the column median.
# The medians are learned from the TRAINING data only and then applied to both the
# training and the test set, so no information from the held-out rows leaks into the
# preprocessing.

# Numeric columns whose missing values are imputed
avg_columns = ['Alcohol', 'WalkRunDistance', 'HoursSocial', 'HoursArt', 'MinutesMusic', 'Weight', 
        'HoursTVGames', 'BreakfastHealth', 'LunchHealth', 'RoomClean', 'DinnerHealth', 'MorningQuality', 
        'FocusLevel', 'AnxietyLevel', 'HealthLevel', 'Motivationlevel']

# train_test_split hands back slices of X; take independent copies so the assignments
# below modify exactly these frames (no chained / in-place writes on a slice).
X_train = X_train.copy()
X_test = X_test.copy()

# Blank or whitespace-only strings count as missing. to_numeric then guarantees a numeric
# dtype so that median() is well defined; it raises if a non-numeric string remains.
for features in (X_train, X_test):
    features[avg_columns] = (
        features[avg_columns]
        .replace(r'^\s*$', np.nan, regex=True)
        .apply(pd.to_numeric)
    )

train_medians = X_train[avg_columns].median()
X_train[avg_columns] = X_train[avg_columns].fillna(train_medians)
X_test[avg_columns] = X_test[avg_columns].fillna(train_medians)

# Target: same rule -- the training median fills gaps in both splits.
# (The *_col_mean names are kept for compatibility; the values are medians.)
y_train_col_mean = y_train.median()
y_test_col_mean = y_train_col_mean
y_train = y_train.fillna(y_train_col_mean)
y_test = y_test.fillna(y_test_col_mean)




In [None]:
# Training the model
# fit() returns the estimator itself, so construction and fitting can be chained.
lreg = LinearRegression().fit(X_train, y_train)

In [None]:
# measure how well the model does: mean squared error on the held-out rows

pred = lreg.predict(X_test)
squared_errors = np.square(pred - y_test)
mse = np.mean(squared_errors)
print('MSE: ' + str(mse))

# Calculate adjusted MSE


In [None]:
# What do MSE and Adjusted RMSE tell us about the model performance?  What is a good value here? 



In [None]:
# When should we use MSE or Adjusted RMSE?  What are the problems we avoid by using one or the other? 



In [None]:

# Calculating the coefficients: one row per training feature with its fitted weight

feature_names = pd.DataFrame(X_train.columns)
coeff = feature_names.assign(**{'Coefficient Estimate': pd.Series(lreg.coef_)})
coeff

In [None]:
# How should we interpret the coefficients? 
# https://blog.minitab.com/en/adventures-in-statistics-2/how-to-interpret-regression-analysis-results-p-values-and-coefficients


In [None]:
# How do we tell if the coefficients are misleading us? 



In [None]:
# How to tell if there are outliers skewing any of the variables



In [None]:
# What are Lasso, Ridge and Elastic Net and when should each one be used instead of basic linear regression? 



In [None]:
# Is the coefficient the same as the slope? 

# But with scaled variables and with variables that have different scales this doesn't necessarily tell us how 
# big the effect is? 



In [None]:
# How do we tell if variables are collinear (i.e. overlap in the information they're carrying)?



In [None]:
# If you needed to recommend to Gray one variable to focus on to improve happiness and another to focus on to 
# avoid destroying it, what would each of these be? 