> `modelling_quant(data, be_predicted, to_predict)` fits a regression model on the chosen data and displays the associated results (shapes of X and Y, coefficient of determination) in the Streamlit app. It also automatically one-hot encodes the qualitative variables.

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import streamlit as st

def modelling_quant(data: pd.DataFrame, be_predicted, to_predict) -> None:
    """Fit a gradient-boosting regressor and report its fit in Streamlit.

    Qualitative features (columns whose dtype is ``object``) are one-hot
    encoded; every other feature is passed to the model unchanged.

    Parameters
    ----------
    data
        Source table containing the target and the feature columns.
    be_predicted
        Label of the quantitative column to predict (the target).
    to_predict
        Iterable of column labels used as explanatory features.

    Returns
    -------
    None
        Results are written to the Streamlit app with ``st.write``: the shape
        of the encoded feature matrix, the shape of the target, and the
        coefficient of determination rounded to 6 decimals.
    """
    # Rows with a missing target cannot be used for training, so they are
    # always dropped; a target with many NAs therefore leaves little data
    # and is likely to give a poor model.
    df = data[data[be_predicted].notna()]
    Y = df[be_predicted]

    # Split the requested features into qualitative and quantitative ones.
    # Materialise the iterable first so it can be traversed twice.
    features = list(to_predict)
    qual_feat = [var for var in features if df[var].dtype == "object"]
    quant_feat = [var for var in features if df[var].dtype != "object"]

    X = df[quant_feat].copy()
    if qual_feat:
        # The default encoder output is sparse; densify it explicitly rather
        # than relying on the `sparse=False` keyword, which newer
        # scikit-learn releases no longer accept.
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = encoder.fit_transform(df[qual_feat]).toarray()

        # One readable name per dummy column: "<feature>_<category>".
        dummy_names = [
            f"{feat}_{cat}"
            for feat, cats in zip(qual_feat, encoder.categories_)
            for cat in cats
        ]

        # Reuse df's index: after dropping NA rows it is no longer 0..n-1,
        # so a frame with a default index would be aligned to the wrong
        # rows (or filled with NaN) when combined with the other features.
        X_qual = pd.DataFrame(encoded, index=df.index, columns=dummy_names)
        X = pd.concat([X, X_qual], axis=1)

    # scikit-learn expects feature names to be of a single type.
    X.columns = X.columns.astype(str)

    # Define and fit the model.
    model = HistGradientBoostingRegressor()
    reg = model.fit(X, Y)

    # Report the results. The score is computed on the training data itself,
    # so it measures goodness of fit, not out-of-sample performance.
    st.write("Shape of X (after one hot encoding): ", X.shape)
    st.write("Shape of Y (after removing NAs):", Y.shape)
    R_2 = round(reg.score(X, Y), 6)
    st.write("Determination coefficient: ", R_2)

<br>
<br>

> `modelling_qual(data, be_predicted, to_predict)` is a very similar function, but its predicted variable is qualitative (a classifier is fitted instead of a regressor, and the mean accuracy is displayed).

In [9]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import streamlit as st

def modelling_qual(data: pd.DataFrame, be_predicted, to_predict) -> None:
    """Fit a gradient-boosting classifier and report its fit in Streamlit.

    Qualitative features (columns whose dtype is ``object``) are one-hot
    encoded; every other feature is passed to the model unchanged.

    Parameters
    ----------
    data
        Source table containing the target and the feature columns.
    be_predicted
        Label of the qualitative column to predict (the target).
    to_predict
        Iterable of column labels used as explanatory features.

    Returns
    -------
    None
        Results are written to the Streamlit app with ``st.write``: the shape
        of the encoded feature matrix, the shape of the target, and the mean
        accuracy rounded to 6 decimals.
    """
    # Rows with a missing target cannot be used for training, so they are
    # always dropped; a target with many NAs therefore leaves little data
    # and is likely to give a poor model.
    df = data[data[be_predicted].notna()]
    Y = df[be_predicted]

    # Split the requested features into qualitative and quantitative ones.
    # Materialise the iterable first so it can be traversed twice.
    features = list(to_predict)
    qual_feat = [var for var in features if df[var].dtype == "object"]
    quant_feat = [var for var in features if df[var].dtype != "object"]

    X = df[quant_feat].copy()
    if qual_feat:
        # The default encoder output is sparse; densify it explicitly rather
        # than relying on the `sparse=False` keyword, which newer
        # scikit-learn releases no longer accept.
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = encoder.fit_transform(df[qual_feat]).toarray()

        # One readable name per dummy column: "<feature>_<category>".
        dummy_names = [
            f"{feat}_{cat}"
            for feat, cats in zip(qual_feat, encoder.categories_)
            for cat in cats
        ]

        # Reuse df's index: after dropping NA rows it is no longer 0..n-1,
        # so a frame with a default index would be aligned to the wrong
        # rows (or filled with NaN) when combined with the other features.
        X_qual = pd.DataFrame(encoded, index=df.index, columns=dummy_names)
        X = pd.concat([X, X_qual], axis=1)

    # scikit-learn expects feature names to be of a single type.
    X.columns = X.columns.astype(str)

    # Define and fit the model.
    model = HistGradientBoostingClassifier()
    clf = model.fit(X, Y)

    # Report the results. The score is computed on the training data itself,
    # so it measures goodness of fit, not out-of-sample performance.
    st.write("Shape of X (after one hot encoding): ", X.shape)
    st.write("Shape of Y (after removing NAs):", Y.shape)
    accuracy = round(clf.score(X, Y), 6)
    st.write("Mean accuracy: ", accuracy)