In [1]:
import pandas as pd
import numpy as np
from random import uniform, randint

In [None]:
# Load the raw survey answers; the file is semicolon-separated.
df = pd.read_csv('ODI-2020.csv', sep=';')

# Manual cleaning of the data due to the small size of the dataset

# Analysis of the first column: What programme are you in
PROGRAMME_COL = 'What programme are you in?'

# Abbreviations and the programme name they stand for.  They are expanded only
# where they occur as a whole word: a plain substring replacement would also
# hit longer words ("econometrics" and "bioinformatics" contain "cs", "mba"
# contains "ba", "brain" contains "ai", "theoretical" contains "eor") and
# silently push those answers into the wrong programme.
# NOTE(review): 'cs' used to be expanded to 'business analytics', which looks
# like a copy-paste slip; it is mapped to 'computer science' here -- confirm.
PROGRAMME_ABBREVIATIONS = {
    'qrm': 'quantitative risk management',
    'cs': 'computer science',
    'eor': 'econometrics',
    'ba': 'business analytics',
    'ai': 'artificial intelligence',
    'cls': 'computational science',
    'cps': 'computational science',
}

# (keyword, canonical name) pairs, applied top to bottom: every answer that
# contains the keyword is replaced by the canonical name, so for an answer
# matching several keywords the order below decides the outcome.
# The former 'econometrics and operations research' rule is omitted: such
# answers are already collapsed to 'econometrics' by the rule above it, so it
# could never match.
PROGRAMME_KEYWORDS = [
    # Simplifying programme names
    ('quantitative risk management', 'quantitative risk management'),
    ('econometrics', 'econometrics'),
    ('computer science', 'computer science'),
    ('computational science', 'computational science'),
    ('bioinformatics', 'bioinformatics'),
    ('digital business and innovation', 'business administration'),
    ('business analytics', 'business analytics'),
    ('artificial intelligence', 'artificial intelligence'),
    ('information science', 'information sciences'),
    ('information studies', 'information studies'),
    ('language', 'human language technology'),
    # Treating special cases
    ('parallel and distributed computer systems', 'parallel and distributed computer systems'),
    ('finance and technology', 'finance and technology'),
    ('master of finance', 'finance'),
    ('movement', 'human movement sciences'),
    ('datascience', 'data science'),
    ('forensic', 'forensic science'),
]


def _clean_programme(names):
    """Normalise free-text programme answers to a small set of names.

    Parameters
    ----------
    names : pandas.Series
        Raw answers (strings; missing values are passed through untouched).

    Returns
    -------
    pandas.Series
        A new Series with the same index.  Answers are lower-cased,
        whitespace-normalised, abbreviations are expanded and every answer
        containing one of the PROGRAMME_KEYWORDS is replaced by its canonical
        name.  Answers matching no keyword keep their normalised text.
    """
    # Basic symbol replacement and whitespace normalisation.  '&' is padded
    # with spaces so that "ai&health" still exposes "ai" as a separate word;
    # the padding is collapsed again right after.
    names = names.str.lower()
    names = names.str.replace('&', ' and ', regex=False)
    names = names.str.replace(r'\s+', ' ', regex=True).str.strip()
    names = names.str.replace('exchange', 'erasmus', regex=False)

    # Expand abbreviations first so the keyword rules below also see them.
    for abbreviation, full_name in PROGRAMME_ABBREVIATIONS.items():
        whole_word = r'\b' + abbreviation + r'\b'
        names = names.str.replace(whole_word, full_name, regex=True)

    # Collapse every answer mentioning a keyword to its canonical name.
    # na=False keeps missing answers out of the mask instead of breaking it.
    for keyword, canonical in PROGRAMME_KEYWORDS:
        mentions_keyword = names.str.contains(keyword, regex=False, na=False)
        names = names.mask(mentions_keyword, canonical)

    return names


# Assign the whole column at once instead of df[col][mask] = value: chained
# assignment raises SettingWithCopyWarning and does not write back to df when
# pandas Copy-on-Write is active.
df[PROGRAMME_COL] = _clean_programme(df[PROGRAMME_COL])

# Collect the final programme list and the number of students per programme
program = df[PROGRAMME_COL].unique().tolist()
count = df[PROGRAMME_COL].value_counts()

# Recode the course questions to yes/no answers.
# Each entry is (column, token to look for, replacement); the replacements are
# applied in the listed order, case-insensitively, on every occurrence of the
# token inside an answer.
_YES_NO_RECODING = (
    ('Have you taken a course on information retrieval?', '1', 'yes'),
    ('Have you taken a course on information retrieval?', '0', 'no'),
    ('Have you taken a course on statistics?', 'mu', 'yes'),
    ('Have you taken a course on statistics?', 'sigma', 'no'),
    ('Have you taken a course on databases?', 'ja', 'yes'),
    ('Have you taken a course on databases?', 'nee', 'no'),
)
for _question, _token, _answer in _YES_NO_RECODING:
    df[_question] = df[_question].str.replace(_token, _answer, case=False)

# Categorical columns used for the yes/no analysis: the programme, the four
# course questions and gender.  The selection is taken from df as it is at
# this point in the script.
df2 = df.loc[:, [
    'What programme are you in?',
    'Have you taken a course on machine learning?',
    'Have you taken a course on information retrieval?',
    'Have you taken a course on statistics?',
    'Have you taken a course on databases?',
    'What is your gender?',
]]

# Discard answers that contain a dash (e.g. a negative number).  Casting to
# str first keeps missing answers from putting NaN into the mask, which `~`
# cannot invert; it also lets the filter run if the column was parsed as
# numbers.  .copy() makes df an independent frame so the column added below
# does not trigger SettingWithCopyWarning.
_has_dash = df['What is your stress level (0-100)?'].astype(str).str.contains('-', regex=False)
df = df[~_has_dash].copy()

# Anything that is not a number becomes NaN.
df['Stress level'] = pd.to_numeric(df['What is your stress level (0-100)?'], errors='coerce')

# Drop answers outside the 0-100 scale of the question.  The previous test,
# (<= 0) & (>= 100), could never be true, so nothing was ever removed.  The
# end points 0 and 100 are valid answers and are kept; NaN rows are left in
# place, as before.
drop = df[(df['Stress level'] < 0) | (df['Stress level'] > 100)].index
df = df.drop(index=drop)

# Only use the students that have answered all questions with yes/no:
# 'unknown' is turned into a missing value and every row with a missing value
# is dropped.  np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df2 = df2.replace('unknown', np.nan).dropna()

# One-hot encode the remaining answers without the programme column;
# drop_first=True leaves out the first level of each question.
df3 = df2.drop(columns=['What programme are you in?'])
df4 = pd.get_dummies(df3, drop_first=True)

# Replacing the time column with useable integers
_bedtime_col = "Time you went to be Yesterday"

# Only the first two characters of each cell are kept (the hour part).  A lone
# "?" becomes a real missing value.  The result is assigned back to the column:
# calling replace(..., inplace=True) on df[col] works on a temporary object and
# is not guaranteed to change df (it does not under pandas Copy-on-Write).
df[_bedtime_col] = df[_bedtime_col].str[0:2].replace("?", np.nan)

# Removes rows containing letters.  The former pattern "[a-z, A, ]" also
# matched commas and spaces (discarding values such as "1," that the strip
# below is meant to repair) and missed upper-case letters other than "A".
# dropna() then removes every row that has a missing value in ANY column.
_has_letter = df[_bedtime_col].str.contains("[A-Za-z]", na=False)
df = df[~_has_letter].dropna().copy()

# Strips away unwanted characters & converts to numbers; anything that still
# is not a number becomes NaN.
df[_bedtime_col] = df[_bedtime_col].str.strip(' :,?')
df['Bedtime Integer'] = pd.to_numeric(df[_bedtime_col], errors='coerce')

# Hours above 12 are shifted down by 12 (13 -> 1, ..., 24 -> 12); 0 stays 0.
# A single .loc on df is used so the update is written to df itself.
_after_noon = df['Bedtime Integer'].gt(12)
df.loc[_after_noon, 'Bedtime Integer'] -= 12

# Final modelling frame: gender, stress level and bedtime only, with the
# categorical column(s) one-hot encoded (first level dropped).
df_new = pd.get_dummies(
    df.loc[:, ["What is your gender?", "Stress level", "Bedtime Integer"]],
    drop_first=True,
)
df_new