In [None]:
import os
import re
import string
from collections import Counter
from datetime import date, datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

# Task 1: Explore a small dataset
### Task 1A: Exploration

In [None]:
# Short, code-friendly names for the survey columns, in the order in which
# they appear in the CSV file.
ODI_COLUMN_NAMES = [
    "program", "ml_course", "ir_course", "stat_course", "db_course", "gender",
    "chocolate", "birthday", "neighbors", "standed_up", "stress",
    "local_dm_comp", "random_number", "bedtime_yesterday", "happy1", "happy2",
]

# Read the raw survey (semicolon separated) and swap the original headers
# for the short names so later cells can reference columns easily.
df = pd.read_csv("data/ODI-2020.csv", sep=';')
df.columns = ODI_COLUMN_NAMES
display(df)

### Clean

In [None]:
# Clean up the open "program" question: the same master can be written as
# CLS / computational science / Computational Science / Master Computational Science, etc.

# Start again from the raw file so the cleaning below always works on the
# original answers, also when this cell is run more than once.
df = pd.read_csv("data/ODI-2020.csv", sep=';')

# rename to normal column names for easier referencing later
df.columns=["program", "ml_course", "ir_course", "stat_course", "db_course", "gender",
                "chocolate", "birthday", "neighbors", "standed_up", "stress", "local_dm_comp", "random_number",
                "bedtime_yesterday", "happy1", "happy2"]
display(df)


def normalize_program(name):
    """Map a lower-cased, punctuation-free program answer onto a short label.

    Full names ("computational", "artificial", "computer science",
    "quantitative risk management") are matched as substrings. The
    abbreviations ("cls", "ai", "cs", "qrm") are matched as whole words only,
    so that e.g. "econometrics" is not mistaken for "cs".

    Parameters
    ----------
    name : str
        Program answer, already lower-cased and stripped of punctuation.

    Returns
    -------
    str
        "CLS", "AI", "CS" or "QRM" when the answer matches one of those
        programs, otherwise ``name`` unchanged.
    """
    words = name.split()
    if "computational" in name or "cls" in words:
        return "CLS"
    if "artificial" in name or "ai" in words:
        return "AI"
    if "computer science" in name or "cs" in words:
        return "CS"
    if "quantitative risk management" in name or "qrm" in words:
        return "QRM"
    return name


print("Unique progams:", len(df.program.unique()))
print("Changing everything to lowercase")
# The .str accessor leaves missing answers as NaN instead of raising.
df["program"] = df["program"].str.lower()

print("Unique progams:", len(df.program.unique()))
print("Deleting all punctuation")
df["program"] = df["program"].str.translate(str.maketrans('', '', string.punctuation))
print("Unique progams:", len(df.program.unique()))


print("\n\n\n")

# Assign the mapped column back to the frame. Setting a value on the row
# objects yielded by iterrows() only changes a temporary copy and leaves
# the DataFrame itself untouched.
df["program"] = df["program"].map(normalize_program, na_action="ignore")

print("Unique progams:", len(df.program.unique()))
print(df.program.unique())

In [None]:
# Clean birthday
# Work on a copy of the survey that carries an extra, still empty, 'age' column;
# the original frame `df` stays as it is.
df_new = df.assign(age=np.nan)

def calculateAge(birthDate, today=None):
    """Return the age in completed years of someone born on ``birthDate``.

    Parameters
    ----------
    birthDate : datetime.date
        Date of birth.
    today : datetime.date, optional
        Date at which the age is evaluated. Defaults to ``date.today()``;
        pass an explicit date to get a reproducible result.

    Returns
    -------
    int
        Difference in calendar years, minus one when the birthday has not
        yet occurred in the year of ``today``.
    """
    if today is None:
        today = date.today()
    # Tuple comparison is True (counts as 1) when the birthday is still to
    # come this year, so one year is subtracted in that case.
    birthday_still_ahead = (today.month, today.day) < (birthDate.month, birthDate.day)
    return today.year - birthDate.year - birthday_still_ahead

# Parse every free-text birthday into a date and derive the age from it.
# Rows that cannot be parsed are skipped: they keep their raw birthday text
# and a NaN age. Each row is handled independently, so a value parsed for
# one row can never leak into a later row that failed to parse.
for index, row in df.iterrows():
    raw_birthday = row["birthday"]
    # Missing (NaN) or otherwise non-text answers cannot be split.
    if not isinstance(raw_birthday, str):
        continue

    line = re.split("-|/|th|of|''|' ' ", raw_birthday)
    if len(line) == 1:
        # Nothing was split off: try dots as the separator instead.
        line = line[0].split(".")

    try:
        if len(line) == 1:
            # Only a single fragment: accept it when it is a year in the
            # 1980s or 1990s and use 1 January of that year.
            if not (line[0].startswith('199') or line[0].startswith('198')):
                continue
            year = int(line[0])
            if not (year > 1900 and year < 2010):
                continue
            date_str = date(year, 1, 1)

        elif len(line) == 3 and ' ' not in line:
            # Three fragments are read as day, month, year; the month may
            # have been written out as "march ".
            day, month, year = [3 if part == "march " else int(part) for part in line]

            # For entries of '94' in stead of '1994'
            if year > 60 and year < 1900:
                year = int('19' + str(year))

            date_str = date(year, month, day)

        else:
            # Any other number of fragments is not understood.
            continue

    except ValueError:
        # Non-numeric fragment or impossible calendar date.
        continue

    age = calculateAge(date_str)

    # Exclude wrong data
    if age == 0:
        continue

    # .loc writes into df_new itself; chained indexing such as
    # df_new.birthday[index] may write to a temporary copy instead.
    df_new.loc[index, "birthday"] = date_str
    df_new.loc[index, "age"] = age

In [None]:
# Further cleaning

# ml_course
# array(['no', 'yes', 'unknown'], dtype=object)

# ir_course
# array(['unknown', '0', '1'], dtype=object) where 0 is no, 1 is yes

# stat_course
# array(['mu', 'sigma', 'unknown'], dtype=object) mu is yes, sigma is no

# db_course
# array(['ja', 'nee', 'unknown'], dtype=object)

# gender
# array(['female', 'male', 'unknown'], dtype=object)

# chocolate
# array(['fat', 'unknown', 'neither', 'I have no idea what you are talking about', 'slim'], dtype=object)

# neighbors, stress and random_number
# convert values to integers if possible, else 'nan'


def to_int_or_nan(value, lower=None, upper=None):
    """Return ``int(value)``, or NaN when no usable integer can be made of it.

    Parameters
    ----------
    value : object
        Raw answer taken from the survey.
    lower, upper : int, optional
        Inclusive bounds; an integer outside them is turned into NaN.
        ``None`` (the default) means no bound on that side.

    Returns
    -------
    int or float
        The converted integer, or ``np.nan``.
    """
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "not a number"; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan
    if lower is not None and number < lower:
        return np.nan
    if upper is not None and number > upper:
        return np.nan
    return number


# neighbors
cleaned_neighbors = [to_int_or_nan(nr) for nr in df_new['neighbors']]

# replace old column by cleaned one
df_new['neighbors'] = cleaned_neighbors

# standed_up = array(['no', 'unknown', 'yes'], dtype=object)

# stress
# values below 0 or above 100 are discarded
cleaned_stress = [to_int_or_nan(nr, lower=0, upper=100) for nr in df_new['stress']]

# replace old column by cleaned one
df_new['stress'] = cleaned_stress

# local_dm_comp
# TODO

# random_number
# Note: check for outliers, they are highly influencing the mean
# Read from df_new, like the other columns, so that the cleaned list always
# has exactly as many entries as the frame it is written back to.
cleaned_random = [to_int_or_nan(nr) for nr in df_new['random_number']]

# replace old column by cleaned one
df_new['random_number'] = cleaned_random


# bedtime_yesterday
# for now only accepting completely numeric (+ ":") entries, that is 151 of 280 entries


def has_only_digits_and_colons(text):
    """Return True when every character of ``text`` is a decimal digit or ':'."""
    return all(char == ':' or char.isdecimal() for char in text)


def to_time_or_nan(item):
    """Turn an 'HH:MM' string into a ``datetime.time``; anything else gives NaN.

    Only strings of exactly five characters that contain a ':' are handed to
    the parser. Strings that still fail to parse (e.g. '25:99') and non-string
    items (the NaN markers of rejected entries) yield ``np.nan``.
    """
    if not isinstance(item, str) or ':' not in item or len(item) != 5:
        return np.nan
    try:
        return datetime.strptime(item, '%H:%M').time()
    except ValueError:
        return np.nan


# first convert entries to strings and drop everything that holds other
# characters than digits and ':'
cleaned_bedtime_strings = [
    text if has_only_digits_and_colons(text) else np.nan
    for text in (str(value) for value in df_new['bedtime_yesterday'])
]

# convert strings to time objects, else make it nan
time_objects = [to_time_or_nan(item) for item in cleaned_bedtime_strings]

# replace old column by cleaned one
df_new['bedtime_yesterday'] = time_objects

    
# happy1 & happy2
# Add the columns 'sun' and 'food' to the dataframe.
# sun:  1 = 'sun' or 'weather' occurs in happy1 or happy2, 0 = it does not
# food: 1 = 'food' or 'choc' occurs in happy1 or happy2, 0 = it does not
# Checking for 'corona' was considered as well, but it was mentioned only
# 4 times, so it was left out.
sun = []
food = []
c = 0  # not incremented anywhere in this section
for first_answer, second_answer in zip(df_new['happy1'], df_new['happy2']):
    # Compare case-insensitively; str() also turns missing answers into text.
    answers = (str(first_answer).lower(), str(second_answer).lower())

    mentions_sun = any('sun' in answer or 'weather' in answer for answer in answers)
    sun.append(1 if mentions_sun else 0)

    mentions_food = any('food' in answer or 'choc' in answer for answer in answers)
    food.append(1 if mentions_food else 0)

df_new['sun'] = sun
df_new['food'] = food


In [None]:
# Show df_new so the columns cleaned and added in the cells above can be inspected.
display(df_new)

### Explore

In [None]:
# Make sure the folder the figures are saved to exists.
# exist_ok=True creates it when missing and does nothing otherwise, which
# replaces the separate "does it exist?" check and keeps the cell re-runnable.
os.makedirs('results/odi/', exist_ok=True)


def show_save_histogram(df, column_name):
    """Draw a histogram of one column of ``df`` and save it as a PNG.

    The image is written to ``results/odi/<column_name>.png``.
    """
    df.hist(column=column_name)
    target_file = "results/odi/" + column_name + ".png"
    plt.savefig(target_file)



def make_bar_chart(df, column_name):
    """Draw a bar chart of how often each value occurs in one column of ``df``.

    A new figure is opened with one bar per distinct value, labelled with that
    value, and saved to ``results/odi/<column_name>.png``.
    """
    occurrences = Counter(df[column_name])
    bar_heights = occurrences.values()
    bar_positions = range(len(bar_heights))

    plt.figure()
    plt.bar(bar_positions, bar_heights)
    plt.xticks(bar_positions, occurrences.keys())

    target_file = "results/odi/" + column_name + ".png"
    plt.savefig(target_file)

# Distribution of the ages derived from the birthday column.
show_save_histogram(df_new, "age")

# Gender counts, taken from df_new like every other figure and statistic in
# this section.
make_bar_chart(df_new, "gender")


In [None]:
# Descriptives: mean and standard deviation of the numeric columns of df_new


def _report_mean_and_std(label, column):
    """Print the mean and standard deviation of ``df_new[column]`` and return both.

    ``label`` is the name used in the printed lines.
    """
    values = df_new.loc[:, column]
    mean_value = values.mean()
    print(f"mean {label}: {mean_value}")
    std_value = values.std()
    print(f"standard deviation {label}: {std_value}")
    return mean_value, std_value


# Age
mean_age, std_age = _report_mean_and_std("age", "age")

# Neighbors
mean_neighbors, std_neighbors = _report_mean_and_std("neighbors", "neighbors")

# Stress
mean_stress, std_stress = _report_mean_and_std("stress", "stress")

# Random number
mean_random, std_random = _report_mean_and_std("random", "random_number")