In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re 
from datetime import date, datetime
import numpy as np
import random
import copy

In [None]:
plt.style.use('seaborn')
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 16}

plt.rc('font', **font)
plt.rcParams.update({'font.size': 16})


# Task 1: Explore a small dataset
### Task 1A: Exploration

In [2]:
df = pd.read_csv("data/ODI-2020.csv", sep=';')

# rename to normal column names for easier referencing later
df.columns=["program", "ml_course", "ir_course", "stat_course", "db_course", "gender",
                "chocolate", "birthday", "neighbors", "standed_up", "stress", "local_dm_comp", "random_number",
                "bedtime_yesterday", "happy1", "happy2"]
display(df)

Unnamed: 0,program,ml_course,ir_course,stat_course,db_course,gender,chocolate,birthday,neighbors,standed_up,stress,local_dm_comp,random_number,bedtime_yesterday,happy1,happy2
0,Computational Science,no,unknown,mu,ja,female,fat,05-08-1998,0,no,0,4,227,22:30,Good weather,Sporting
1,Quantitative Risk Management,no,0,mu,nee,male,unknown,1996,2,no,85,25,65,23:05,chill,coffee
2,MSc Computational Science,no,0,mu,ja,female,neither,12th June,4,no,50,10,3,10pm,Waking up early and preparing for the day by p...,Using the early morning time to study
3,MSc Finance - QRM,no,0,mu,nee,female,neither,19-06-1994,0,no,10,99,2,23,eating good food,spending time with friends
4,BA,yes,0,mu,ja,male,neither,29-09-1993,0,no,60,80,7,22:00,Sun is shining,Intimate contact with friends and family
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,Mechenaical Engineering,yes,1,mu,nee,male,fat,17-09-1993,2,no,69,5,6,1:00,no corona,no corona 2
276,CS,no,0,mu,ja,male,fat,28/10,0,no,10,100,7,21:45,Netflix,Chill
277,Artificial Intelligence,yes,1,mu,ja,male,neither,26-07-1994,12,yes,60,75,42,13:37,Finishing all tasks,Finding an extra package of ice cream
278,Artificial Intelligence,no,0,mu,nee,female,neither,3/4/1998,1,no,70,2,69,00:00,Food,Orgasm


### Clean

In [None]:
# clean up open questions: master can be CLS/computational science/Computational Science/Master Computational Science, etc.
import string

df = pd.read_csv("data/ODI-2020.csv", sep=';')

# rename to normal column names for easier referencing later
df.columns=["program", "ml_course", "ir_course", "stat_course", "db_course", "gender",
                "chocolate", "birthday", "neighbors", "standed_up", "stress", "local_dm_comp", "random_number",
                "bedtime_yesterday", "happy1", "happy2"]
display(df)


print("Unique progams:", len(df.program.unique()))
print("Changing everything to lowercase")
df.program = df.program.map(str.lower)

print("Unique progams:", len(df.program.unique()))
print("Deleting all punctuation")
# text = text.translate()

df.program = df.program.map(lambda s: s.translate(str.maketrans('','', string.punctuation)))
print("Unique progams:", len(df.program.unique()))


print("\n\n\n")

for index, row in df.iterrows():
    if "computational" in row.program or "cls" in row.program:
        row.program = "CLS"
    elif "artificial" in row.program or "ai" in row.program:
        row.program = "AI"
    elif "computer science" in row.program or "cs" in row.program:
        row.program = "CS"
    elif "quantitative risk management" in row.program or "qrm" in row.program:
        row.program = "QRM"
    
print("Unique progams:", len(df.program.unique()))
print(df.program.unique())

In [None]:
# Clean birthday
df_new = df.copy()
df_new["age"] = np.nan

def calculateAge(birthDate): 
    today = date.today() 
    return today.year - birthDate.year - ((today.month, today.day) < (birthDate.month, birthDate.day)) 

for index,row in df_new.iterrows():
    line = re.split("-|/|th|of|''|' ' ", row["birthday"])
    if len(line) == 1:
        line = line[0].split(".")

    if len(line) == 1:
        if line[0].startswith('199') or line[0].startswith('198'):
            year = int(line[0])

            if year > 1900 and year < 2010:
                date_str = date(year, 1, 1)
                age = calculateAge(date_str)
    
    elif len(line) == 3 and ' ' not in line:
        for i in range(len(line)): 
            if line[i] == "march ": 
                line[i] = 3
        line = [int(i) for i in line] 
        year, month, day = line[2], line[1], line[0]
        
        # For entries of '94' in stead of '1994'
        if year > 60 and year < 1900:
            year = int('19' + str(year))
            
         # Calculate date with year - month - day
        try:
            date_str = date(year, month, day)
            age = calculateAge(date_str)
            
            # Exclude wrong data
            if age == 0:
                continue
    
        except ValueError:
            date_str = np.nan
            age = np.nan
            continue
        
    # Add new column of 'age'
    df_new.birthday[index] = date_str
    df_new.age[index] = age

In [None]:
# Further cleaning

# ml_course
# array(['no', 'yes', 'unknown'], dtype=object)

# ir_course
# array(['unknown', '0', '1'], dtype=object) where 0 is no, 1 is yes

# stat_course
# array(['mu', 'sigma', 'unknown'], dtype=object) mu is yes, sigma is no

# db_course
# array(['ja', 'nee', 'unknown'], dtype=object)

# gender
# array(['female', 'male', 'unknown'], dtype=object)

# chocolate
# array(['fat', 'unknown', 'neither', 'I have no idea what you are talking about', 'slim'], dtype=object)

# neighbors
# convert values to integers if possible, else 'nan'
cleaned_neighbors = []
for nr in df_new['neighbors']:
    try:
        cleaned_neighbors.append(int(nr))
    except:
        cleaned_neighbors.append(np.nan)

# replace old column by cleaned one
df_new['neighbors'] = cleaned_neighbors

# standed_up = array(['no', 'unknown', 'yes'], dtype=object)

# stress
cleaned_stress = []
for nr in df_new['stress']:
    try:
        new = int(nr)
        if new < 0 or new > 100:
            cleaned_stress.append(np.nan)
        else:
            cleaned_stress.append(new)
    except:
        cleaned_stress.append(np.nan)

# replace old column by cleaned one
df_new['stress'] = cleaned_stress

# local_dm_comp
# TODO

# random_number
# Note: check for outliers, they are highly influencing the mean
cleaned_random = []
for index, row in df.iterrows():
    try:
        cleaned_random.append(int(row['random_number']))
    except:
        cleaned_random.append(np.nan)

# replace old column by cleaned one
df_new['random_number'] = cleaned_random


# bedtime_yesterday
# for now only accepting completely numeric (+ ":") entries, that is 151 of 280 entries
# first convert entries to strings
cleaned_bedtime_strings = []
for index,row in df_new.iterrows():
    incorrect = False
    value = str(row['bedtime_yesterday'])
    for letter in value:
        if letter != ':':
            try:
                int(letter)
            except:
                incorrect = True
    if not incorrect:
        cleaned_bedtime_strings.append(value)
    else:
        cleaned_bedtime_strings.append(np.nan)

# convert strings to time objects, else make it nan
time_objects = []
c = 0
for item in cleaned_bedtime_strings:
    if item != item:
        time_objects.append(np.nan)
    elif ':' in item and len(item) == 5:
        try:
            time_objects.append(datetime.strptime(item, '%H:%M').time())
            c += 1
        except:
            time_objects.append(np.nan)
    else:
        time_objects.append(np.nan)

    
# replace old column by cleaned one
df_new['bedtime_yesterday'] = time_objects

    
# happy1 & happy 2 
# voeg columns 'sun' en 'food' toe aan dataframe, 
# sun: 1 = 'sun' of 'weather' komt voor in happy1 of happy2, 0 = komt niet voor
# food: 1 = 'food' of 'choc' komt voor in happy1 of happy2, 0 = komt niet voor
# ik heb ook overwogen te checken of 'corona' veel genoemd wordt, maar dat werd maar 4x genoemd dus niet erin gedaan
sun = []
food = []
c = 0
for index,row in df_new.iterrows():
    if 'sun' in str(row['happy2']).lower() or 'weather' in str(row['happy2']).lower() or 'sun' in str(row['happy1']).lower() or 'weather' in str(row['happy1']).lower():
        sun.append(1)
    else:
        sun.append(0)
    if 'food' in str(row['happy1']).lower() or 'food' in str(row['happy2']).lower() or 'choc' in str(row['happy1']).lower() or 'choc' in str(row['happy2']).lower():
        food.append(1)
    else:
        food.append(0)

df_new['sun'] = sun
df_new['food'] = food

In [None]:
# Cut into stress categories
# df_new['stress_band'] = pd.cut(df_new['stress'], 6)

In [None]:
df_new.loc[ df_new['stress'] <= 10, 'stress'] = 0
df_new.loc[(df_new['stress'] > 10) & (df_new['stress'] <= 20), 'stress'] = 1
df_new.loc[(df_new['stress'] > 20) & (df_new['stress'] <= 30), 'stress'] = 2
df_new.loc[(df_new['stress'] > 30) & (df_new['stress'] <= 40), 'stress'] = 3
df_new.loc[(df_new['stress'] > 40) & (df_new['stress'] <= 50), 'stress'] = 4
df_new.loc[(df_new['stress'] > 50) & (df_new['stress'] <= 60), 'stress'] = 5
df_new.loc[(df_new['stress'] > 60) & (df_new['stress'] <= 70), 'stress'] = 6
df_new.loc[(df_new['stress'] > 70) & (df_new['stress'] <= 80), 'stress'] = 7
df_new.loc[(df_new['stress'] > 80) & (df_new['stress'] <= 90), 'stress'] = 8
df_new.loc[ df_new['stress'] > 90, 'stress'] = 9

In [None]:
# convert time objects to integers


In [None]:
display(df_new)

### Explore

In [None]:
import os
if not os.path.exists('results/odi/'):
    os.makedirs('results/odi/')
    
from collections import Counter


def show_save_histogram(df, column_name):
    """ Makes and saves a histogram of a specific df column. """
    df.hist(column=column_name)
    plt.savefig("results/odi/" + column_name + ".png")



def make_bar_chart(df, column_name):
    """ Makes a bar chart given a data frame column. """
    
    freqs = Counter(df[column_name])
    xvals = range(len(freqs.values()))
    
    plt.figure()
    plt.bar(xvals, freqs.values())
    plt.xticks(xvals, freqs.keys())
    
    plt.savefig("results/odi/" + column_name + ".png")

show_save_histogram(df_new, "age")


make_bar_chart(df, "gender")


In [None]:
# Descriptives

# Age
mean_age = df_new.loc[:,"age"].mean()
print(f"mean age: {mean_age}")
std_age = df_new.loc[:,"age"].std()
print(f"standard deviation age: {std_age}")

# Neighbors
mean_neighbors = df_new.loc[:,"neighbors"].mean()
print(f"mean neighbors: {mean_neighbors}")
std_neighbors = df_new.loc[:,"neighbors"].std()
print(f"standard deviation neighbors: {std_neighbors}")

# Stress
mean_stress = df_new.loc[:,"stress"].mean()
print(f"mean stress: {mean_stress}")
std_stress = df_new.loc[:,"stress"].std()
print(f"standard deviation stress: {std_stress}")

# Random number
mean_random = df_new.loc[:,"random_number"].mean()
print(f"mean random: {mean_random}")
std_random = df_new.loc[:,"random_number"].std()
print(f"standard deviation random: {std_random}")

### Explore correlations with gender

In [None]:
# Average stress for each gender
ax = plt.subplot()
ax.set_ylabel('Average stress levels')
ax.set_title('Average stress levels for each gender')
df_new.groupby('gender').mean()['stress'].plot(kind='bar', 
                                               figsize=(5, 6), 
                                               ax = ax,
                                               color="maroon");

In [None]:
# Average age for each gender
ax = plt.subplot()
ax.set_ylabel('Average age')
ax.set_title('Average age for each gender')
df_new.groupby('gender').mean()['age'].plot(kind='bar', 
                                               figsize=(5, 6), 
                                               ax = ax,
                                               color="maroon");

In [None]:
# Convert chocolate categories
try:
    df_new['gender'] = df_new['gender'].map( {'female': 0, 'male': 1, 'unknown': 2} ).astype(int)
    df_new['chocolate'] = df_new['chocolate'].map( {'fat': 0, 'neither': 1, 'I have no idea what you are talking about':2, 'slim':3, 'unknown': 4} ).astype(int)
except ValueError:
    print("already converted to int")
    
col_male, col_female = [], []
for index, row in df_new.iterrows():
    if row["gender"] == 1:
        col_male.append(1)
    else:
        col_male.append(0)
    
    if row["gender"] == 0:
        col_female.append(1)
    else:
        col_female.append(0)
    
try:
    df_new.insert(loc=0, column='male', value=pd.Series(col_male))
    df_new.insert(loc=0, column='female', value=pd.Series(col_female))   
except ValueError:
    print("Already added col")


plot = df_new.groupby('program').agg('sum')[['female', 'male']].plot(kind='bar', 
                                                                    fontsize=14, 
                                                                    figsize=(15, 6),
                                                                    stacked=True, 
                                                                    color=['black', 'grey'],
                                                                    title="program based on gender");

In [None]:
plot = df_new.groupby('chocolate').agg('sum')[['female', 'male']].plot(kind='bar', 
                                                                    fontsize=14, 
                                                                    figsize=(5, 6),
                                                                    stacked=True, 
                                                                    color=['black', 'grey'],
                                                                    title="program based on gender");

In [None]:
plot = df_new.groupby('stress').agg('sum')[['female', 'male']].plot(kind='bar', 
                                                                    fontsize=14, 
                                                                    figsize=(15, 6),
                                                                    stacked=True, 
                                                                    color=['black', 'grey'],
                                                                    title="program based on gender");

In [None]:
plot = df_new.groupby('age').agg('sum')[['female', 'male']].plot(kind='bar', 
                                                                    fontsize=14, 
                                                                    figsize=(12, 6),
                                                                    stacked=False, 
                                                                    color=['black', 'grey'],
                                                                    title="program based on gender");

In [None]:
df_new.groupby('gender')['age'].mean() # 0 = female, 1 = male, 2 = unknown



In [None]:
df_new.groupby('age').agg('sum')[['female', 'male']]

In [None]:
display(df_new)

In [3]:
# train & return a classifier
def train(df):
    return False
    
# use a given classifier and test its performance, return a performance measure
def test(df, classifier):
    outcome = 5
    return outcome

# k-fold cross validation, takes as an argument a pandas dataframe
def kfold(df):

    # determine amount of folds
    k = 2

    # keep track of created datasets
    folds = []

    # substract k subsets
    for i in range(k):
        drop_indices = np.random.choice(df.index, int((1/k)*len(df)), replace=False)
        folds.append(df.drop(drop_indices))
        df.drop(drop_indices, inplace=True)

    # keep track of performances
    performances = []
    
    # for each subset (fold)
    for j in range(len(folds)):

        # take this fold as test set
        test_data = folds[j]

        # take remaining groups as training set
        train_dataframes = []
        
        for k in range(len(folds)):
            if k != j:
                train_dataframes.append(folds[k])
                
        train_data = pd.concat(train_dataframes)
        
        # train & test
        solution = train(train_data)
        performances.append(test(test_data, solution))
        
        
    # compute overall performance
    print(performances)
    return mean(performances)

kfold(df)

2
[5, 5]


AttributeError: 'list' object has no attribute 'mean'