In [1]:
import numpy as np
import seaborn as sns

import pandas as pd
# Show full cell contents and every column when a frame is displayed.
# None means "no limit"; -1 for max_colwidth has been deprecated since
# pandas 1.0 in favour of None (requires pandas >= 1.0).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from tabulate import tabulate

import scipy

import plotly.express as px
import plotly
import statsmodels.stats.api as sms

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Raw export: semicolon-separated, with decimal commas.
data_raw = pd.read_csv("ufm_samlet_18nov2019.csv", delimiter=";", decimal=",")

# Keep only the rows whose institution label is "Uddannelsen på landsplan".
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of data_raw (chained-assignment pitfall).
data = data_raw.loc[data_raw["hovedinsttx"] == "Uddannelsen på landsplan"].copy()
#data = data.loc[(data["afbrud_n"] > 100)]

# Scale the salary columns by 1000.
# NOTE(review): presumably the file stores salaries in thousands - confirm.
for salary_column in ["maanedloen_nyudd", "maanedloenp75_nyudd", "maanedloenp25_nyudd"]:
    data[salary_column] = data[salary_column].astype(float) * 1000

# Løn, Relevans 
Vi ser på sammenhængen mellem lønnen, og hvad eleverne mener om uddannelserne.

In [10]:
def nice_plot(x, y, show_us = False, df = None, highlights = None):
    """Show an interactive scatter plot of column ``x`` against column ``y``.

    The figure is a plotly express scatter with a LOWESS trendline and
    marginal histograms on both axes. Two translucent bands are drawn behind
    the points: a vertical band covering the interval returned by
    ``DescrStatsW(df[x]).tconfint_mean()`` and a horizontal band covering the
    same interval for ``y`` (statsmodels' default ``alpha`` is used).

    Parameters
    ----------
    x, y : str
        Column names in ``df`` to plot on the horizontal / vertical axis.
    show_us : bool, default False
        When True, draw an outlined ellipse around every programme listed in
        ``highlights``.
    df : pandas.DataFrame, optional
        Frame to plot. Must contain ``x``, ``y`` and a ``"Titel"`` column.
        Defaults to the notebook-level ``data_løn``.
    highlights : dict, optional
        Mapping ``{Titel: outline colour}`` of programmes to mark when
        ``show_us`` is True. Defaults to ``{"Økonomi": "Yellow"}``.
        Example with several programmes:
        ``{"Økonomi": "Yellow", "Fysik": "Orange", "Anvendt kemi": "Red"}``.
        Titles that are not present in ``df`` are skipped.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if df is None:
        # Looked up at call time, so the function may be defined before the
        # cell that builds data_løn has been run.
        df = data_løn
    if highlights is None:
        highlights = {"Økonomi": "Yellow"}

    x_konf = sms.DescrStatsW(df[x]).tconfint_mean()
    y_konf = sms.DescrStatsW(df[y]).tconfint_mean()

    x_min, x_max = df[x].min(), df[x].max()
    y_min, y_max = df[y].min(), df[y].max()

    title = "{0} vs. {1}".format(x, y)

    # Shared look of the two confidence bands: borderless, translucent,
    # drawn underneath the data points.
    band_style = dict(line=dict(color=None, width=0), fillcolor="LightSkyBlue", opacity=0.5, layer="below")

    fig = px.scatter(df, x=x, y=y, hover_data=['Titel'], marginal_y="histogram", marginal_x="histogram", trendline='lowess', title=title)
    # Vertical band: confidence interval of the mean of x, full y range.
    fig.add_shape(plotly.graph_objs.layout.Shape(type='rect', x0=x_konf[0], x1=x_konf[1], y0=y_min, y1=y_max, **band_style))
    # Horizontal band: confidence interval of the mean of y, full x range.
    fig.add_shape(plotly.graph_objs.layout.Shape(type='rect', x0=x_min, x1=x_max, y0=y_konf[0], y1=y_konf[1], **band_style))

    if show_us:
        # Half-axes of the marker ellipse, as a fraction of each data range.
        size_x = (x_max - x_min) / 40
        size_y = (y_max - y_min) / 20

        for titel, outline_color in highlights.items():
            match = df.loc[df["Titel"] == titel]
            if match.empty:
                # The programme is not in this frame; nothing to mark.
                continue
            center_x = match[x].values[0]
            center_y = match[y].values[0]
            fig.add_shape(plotly.graph_objs.layout.Shape(type='circle', x0=center_x - size_x, x1=center_x + size_x, y0=center_y - size_y, y1=center_y + size_y, line_color=outline_color))

    # Anchor every shape to the main scatter axes rather than the marginals.
    fig.update_shapes(dict(xref='x', yref='y'))
    fig.show()

In [11]:
sort_by = ['stress_eksamen_likert']

# Columns carried into the analysis frame ("maanedloen_nyudd" is left out).
parameters = [
    "Titel",
    "afbrud",
    "ledighed_nyudd",
    "stress_daglig_likert",
    "stress_eksamen_likert",
    "kvalitet_likert",
    "relevans_overens_udd_job_likert",
    "undervisere_faglighed_likert",
    "socialtmotivation_likert",
    "tidsforbrug_p50",
]

# Raw column name -> short label used in tables and plots.
column_dict = {"Titel": "Titel", "ensom_likert": "ensom", "relevans_overens_udd_job_likert": "relevans", "afbrud": "frafald 1. år", "ledighed_nyudd": "ledighed", "stress_daglig_likert": "stress", "kvalitet_likert": "kvalitet", "undervisere_faglighed_likert": "faglighed", "maanedloen_nyudd": "månedsløn", "tidsforbrug_p50": "tidsforbrug", "stress_eksamen_likert": "eksamensstress", "socialtmotivation_likert": "socialt"}

# Difference between the p75 and p25 salary columns. It is written to `data`
# itself and is not listed in `parameters`, so it does not end up in data_løn.
data["l_diff"] = data["maanedloenp75_nyudd"] - data["maanedloenp25_nyudd"]

# Sort, keep the selected columns, drop rows with any missing value, and
# switch to the short labels. This builds a new frame; `data` is not modified.
data_løn = (
    data.sort_values(by=sort_by)[parameters]
    .dropna()
    .rename(columns=column_dict)
)

# Independent copy, so that enabling the mean-centering below cannot alter
# data_løn as a side effect.
data_løn_means_removed = data_løn.copy()
#for parameter in data_løn.columns.values[1:]:
#    data_løn_means_removed[parameter] = data_løn_means_removed[parameter] - data_løn_means_removed[parameter].mean() 

# The frame is sorted ascending by `sort_by`, so these are the ten lowest and
# ten highest programmes for that column.
smallest = data_løn.head(10)
largest = data_løn.tail(10)

# Aggregate numeric columns only: "Titel" is text, and pandas >= 2.0 raises
# when mean()/corr() are asked to reduce a text column.
data_løn_numeric = data_løn.select_dtypes(include="number")

print(sort_by, len(data_løn))
print(tabulate([data_løn_numeric.mean(), data_løn_numeric.median(), data_løn_numeric.std()], headers=data_løn_numeric.columns))
corr = data_løn_numeric.corr()
corr.style.background_gradient(cmap='coolwarm')

['stress_eksamen_likert'] 244
  frafald 1. år    ledighed    stress    eksamensstress    kvalitet    relevans    faglighed    socialt    tidsforbrug
---------------  ----------  --------  ----------------  ----------  ----------  -----------  ---------  -------------
        9.37295    12.0697   3.51594           3.00488     4.17012     3.65316      4.4857      3.82832       38.0246
        8          12        3.53              2.97        4.155       3.64         4.505       3.85          38
        6.27722     7.56857  0.254053          0.326687    0.276233    0.273534     0.194443    0.25665        4.43743


Unnamed: 0,frafald 1. år,ledighed,stress,eksamensstress,kvalitet,relevans,faglighed,socialt,tidsforbrug
frafald 1. år,1.0,-0.114193,0.351384,0.275258,-0.486787,-0.251887,-0.356977,-0.173333,-0.0453171
ledighed,-0.114193,1.0,-0.203172,-0.0971037,-0.213886,-0.384305,0.0662817,-0.137094,-0.315755
stress,0.351384,-0.203172,1.0,0.777615,-0.218726,-0.0965902,-0.204638,0.0385148,-0.156202
eksamensstress,0.275258,-0.0971037,0.777615,1.0,-0.141241,0.00463951,-0.169286,0.173803,-0.0505138
kvalitet,-0.486787,-0.213886,-0.218726,-0.141241,1.0,0.551215,0.677587,0.401041,0.263258
relevans,-0.251887,-0.384305,-0.0965902,0.00463951,0.551215,1.0,0.31932,0.277762,0.358081
faglighed,-0.356977,0.0662817,-0.204638,-0.169286,0.677587,0.31932,1.0,0.122947,-0.0177385
socialt,-0.173333,-0.137094,0.0385148,0.173803,0.401041,0.277762,0.122947,1.0,0.330883
tidsforbrug,-0.0453171,-0.315755,-0.156202,-0.0505138,0.263258,0.358081,-0.0177385,0.330883,1.0


In [12]:
data_løn.loc[data_løn["stress"] - data_løn["eksamensstress"] < 0]

Unnamed: 0,Titel,frafald 1. år,ledighed,stress,eksamensstress,kvalitet,relevans,faglighed,socialt,tidsforbrug
200,Bæredygtig energi,5.0,12.0,3.37,3.4,4.51,3.62,4.33,4.04,44.0
2449,Grafisk kommunikation,2.0,19.0,3.37,3.45,4.36,4.19,4.53,3.83,48.0
812,Visuel kommunikation,2.0,16.0,3.51,3.54,4.59,4.17,4.61,4.34,42.0
2526,Tv- og medietilrettelæggelse,4.0,14.0,3.67,3.86,4.13,3.53,4.24,4.14,37.0
386,Lærer fra Den frie Lærerskole,7.0,19.0,4.11,4.83,4.63,4.15,4.69,4.67,35.0


In [13]:
# Mean / median / std rows for the numeric columns of `smallest`. The text
# column "Titel" is excluded first: pandas >= 2.0 raises when asked to
# average a text column.
smallest_numeric = smallest.select_dtypes(include="number")
print(tabulate([smallest_numeric.mean(), smallest_numeric.median(), smallest_numeric.std()], headers=smallest_numeric.columns))
smallest

  frafald 1. år    ledighed    stress    eksamensstress    kvalitet    relevans    faglighed    socialt    tidsforbrug
---------------  ----------  --------  ----------------  ----------  ----------  -----------  ---------  -------------
        5.1        12.2      3                 2.376       4.161          3.607     4.449      3.744           40.75
        4.5         9.5      2.985             2.415       4.17           3.595     4.405      3.805           42.5
        4.58136     8.77876  0.158465          0.114232    0.162785       0.301     0.177792   0.317497         7.2005


Unnamed: 0,Titel,frafald 1. år,ledighed,stress,eksamensstress,kvalitet,relevans,faglighed,socialt,tidsforbrug
3181,Arkitekt,4.0,11.0,2.74,2.1,4.13,3.53,4.46,4.12,45.0
1614,Veterinærmedicin,1.0,6.0,2.89,2.26,4.43,3.66,4.74,4.03,52.0
2593,Designledelse,10.0,27.0,2.97,2.38,4.21,3.29,4.53,3.76,35.0
1804,Odontologi,0.0,3.0,2.84,2.38,4.35,4.17,4.4,4.13,47.0
1699,Jura,1.0,5.0,3.21,2.41,4.09,3.73,4.37,3.43,33.0
1777,Farmaci,1.0,8.0,3.04,2.42,4.15,3.81,4.41,3.85,45.0
1252,Lyd- og musikteknologi,13.0,13.0,3.0,2.43,4.2,3.25,4.2,3.27,40.0
3211,Farmaceutisk videnskab,5.0,4.0,3.1,2.45,3.85,3.9,4.25,3.6,45.5
1100,Tværkulturelle studier,6.0,23.0,2.96,2.45,4.19,3.4,4.73,3.9,35.0
898,Amerikanske studier,10.0,22.0,3.25,2.48,4.01,3.33,4.4,3.35,30.0


In [14]:
# Mean / median / std rows for the numeric columns of `largest`. The text
# column "Titel" is excluded first: pandas >= 2.0 raises when asked to
# average a text column.
largest_numeric = largest.select_dtypes(include="number")
print(tabulate([largest_numeric.mean(), largest_numeric.median(), largest_numeric.std()], headers=largest_numeric.columns))
largest

  frafald 1. år    ledighed    stress    eksamensstress    kvalitet    relevans    faglighed    socialt    tidsforbrug
---------------  ----------  --------  ----------------  ----------  ----------  -----------  ---------  -------------
        11.3       10.2      3.918             3.84        4.094        3.614       4.4        4.051          37.45
         6.5       10.5      3.97              3.73        4.08         3.555       4.35       3.97           37
         8.2334     4.61399  0.173192          0.366818    0.322773     0.25782     0.210608   0.265265        2.16603


Unnamed: 0,Titel,frafald 1. år,ledighed,stress,eksamensstress,kvalitet,relevans,faglighed,socialt,tidsforbrug
1172,Jordbrugsteknolog,17.0,10.0,3.84,3.62,4.08,3.65,4.43,3.96,38.0
1178,Journalist,6.0,11.0,3.63,3.62,4.08,3.58,4.22,4.22,37.0
1231,Geofysik og rumteknologi,4.0,11.0,3.85,3.62,4.56,3.93,4.76,4.18,41.0
23,Automationsteknolog,23.0,3.0,3.97,3.62,3.79,3.48,4.27,3.9,41.0
2391,Produktionsteknolog,18.0,10.0,3.99,3.72,3.87,3.36,4.24,3.84,38.0
2484,Kommunikation,6.0,8.0,3.97,3.74,4.22,3.69,4.43,3.82,35.0
1249,Landinspektørvidenskab,4.0,4.0,3.98,3.82,4.0,3.45,4.56,3.98,37.0
2526,Tv- og medietilrettelæggelse,4.0,14.0,3.67,3.86,4.13,3.53,4.24,4.14,37.0
28,Energiteknolog,24.0,12.0,4.17,3.95,3.58,3.32,4.16,3.8,35.5
386,Lærer fra Den frie Lærerskole,7.0,19.0,4.11,4.83,4.63,4.15,4.69,4.67,35.0


In [21]:
testing_for = "ledighed"

# One plot per numeric column of data_løn, each against `testing_for`.
# The column list is taken from the dtypes rather than from
# data_løn.mean().index, which fails on pandas >= 2.0 because of the text
# column "Titel".
for term in data_løn.select_dtypes(include="number").columns:
    nice_plot(testing_for, term, show_us=False)

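Ideas for further plots:
- Frafald vs.
    - ledighed
    - stress
    - kvalitet
    - d (TODO: incomplete entry, name the intended variable)


- ensom vs. stress
- kvalitet vs. relevans
- ensom vs. frafald

Note: "ensom" is not a column of `data_løn` yet; "ensom_likert" has to be added to `parameters` before these two plots can be made.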

# Pairwise scatter plots of the numeric columns only. "Titel" is text, so it
# is used as the hover label instead of being drawn as a matrix dimension.
matrix_columns = data_løn.select_dtypes(include="number").columns.tolist()
fig = px.scatter_matrix(data_løn, dimensions=matrix_columns, hover_name="Titel")
fig.show()