In [1]:
'''Read the data '''
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy 

def data(train_path="/kaggle/input/dat1582/training_data.csv",
         test_path="/kaggle/input/dat1582/test_data.csv"):
    """Load the training and test CSV files and return them sorted by ``id``.

    Args:
        train_path: Location of the training CSV (defaults to the Kaggle input path).
        test_path: Location of the test CSV (defaults to the Kaggle input path).

    Returns:
        A tuple ``(train_data, test_data, all_data)`` of DataFrames, each
        sorted by the ``id`` column.
    """
    training = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # NOTE(review): the original built ``all_data`` from the *training* file
    # listed twice (the test file was never part of it). That content is kept
    # unchanged here, but the already-loaded frame is reused instead of parsing
    # the same CSV two more times. Confirm whether train + test was intended.
    all_data = pd.concat([training, training])

    return (
        training.sort_values(by="id"),
        test.sort_values(by="id"),
        all_data.sort_values(by="id"),
    )

# Load the frames once at notebook level; later cells read these globals.
train_data, test_data, all_data = data() 

In [2]:
'''Transform data with pipeline'''
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def preprop():
    """Build the column-wise preprocessing step shared by the models.

    Numeric columns are median-imputed and standardized; categorical columns
    are imputed with the most frequent value and one-hot encoded (categories
    unseen during fitting are ignored at transform time).

    Returns:
        An unfitted ``ColumnTransformer`` with a ``"num"`` and a ``"cat"`` branch.
    """
    # "neutrophils" appeared twice in the original list; it is kept once so the
    # same column is not passed to the model as two identical features.
    num_attribs = ["rcount", "hematocrit", "neutrophils", "sodium", "glucose",
                   "bloodureanitro", "creatinine", "secondarydiagnosisnonicd9",
                   "respiration", "bmi", "pulse"]

    cat_attribs = ["facid", "dialysisrenalendstage", "asthma",
                   "irondef", "pneum", "substancedependence",
                   "psychologicaldisordermajor", "depress",
                   "psychother", "fibrosisandother",
                   "malnutrition", "hemo", "gender"]

    num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

    cat_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore"))

    preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])
    # The original fell off the end of the function, so the caller received
    # None instead of the transformer.
    return preprocessing
# Notebook-level preprocessing object; handed to the model wrappers further down
# (e.g. random_forrest(preprocessing)).
preprocessing = preprop() 

In [None]:
"""Linear regression - not a good performance"""
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

class lin_reg():
    """Helper around a preprocessing + ``LinearRegression`` pipeline.

    The fitted pipeline is not stored on the instance: ``train`` returns it and
    the other methods take it as an argument.
    """

    @staticmethod
    def lin_reg():
        """No-op placeholder kept for backward compatibility.

        The original definition took no ``self`` and therefore raised a
        ``TypeError`` when called on an instance; as a static method it can be
        called from both the class and an instance.
        """
        pass

    def train(self, data):
        """Fit a linear-regression pipeline on ``data`` and return it.

        Args:
            data: Input handed to ``transform_data``.

        Returns:
            The fitted pipeline.
        """
        # NOTE(review): ``transform_data`` is not defined anywhere in the visible
        # notebook; it must come from another cell/module. It is expected to
        # return (preprocessing, labels, features) in that order - confirm.
        preprocessing, labels, data = transform_data(data)
        model = make_pipeline(preprocessing, LinearRegression())

        print("training with length:", len(data))
        model.fit(data, labels)
        return model

    def predict(self, data, lin_reg):
        """Return the predictions of the fitted pipeline ``lin_reg`` for ``data``."""
        print("predict with length + ", len(data))
        return lin_reg.predict(data)

    def to_csv(self, data, pred):
        """Write ``id`` / ``lengthofstay`` pairs to ``./output/linear_reg2.csv``.

        Args:
            data: DataFrame with an ``id`` column, row-aligned with ``pred``.
            pred: Sequence of predictions, one per row of ``data``.
        """
        import os

        # Read the id column directly instead of scanning every cell of every row.
        submission = pd.DataFrame({'id': data["id"].tolist(),
                                   'lengthofstay': pred})
        os.makedirs("./output", exist_ok=True)
        submission.to_csv("./output/linear_reg2.csv", index=False)

    def validate(self, data, lin_reg):
        """Print RMSE and R^2 of ``lin_reg`` on ``data``.

        Args:
            data: DataFrame that contains the ``lengthofstay`` label column.
            lin_reg: Fitted pipeline used for prediction.
        """
        labels = data["lengthofstay"].copy()
        print("validate with length ", len(data))
        pred = lin_reg.predict(data)
        # Square root taken manually: the ``squared=False`` keyword is deprecated
        # and removed in newer scikit-learn releases.
        print(mean_squared_error(labels, pred) ** 0.5)
        print(r2_score(labels, pred))
    

    
#my_lin_reg = lin_reg() 

#lin_reg = my_lin_reg.train(all_data)
#my_lin_reg.validate(training,lin_reg)
#pred = my_lin_reg.predict(test,lin_reg)
#to_csv(test,pred)

In [None]:
'''Random Forest '''
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import joblib 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



class random_forrest:
    """Grid-searches, evaluates and exports a random-forest regression pipeline.

    Instance fields:
        model: The fitted ``GridSearchCV`` (or a model restored by
            ``load_model``); ``None`` until one of the two has run.
        preprocessing: Transformer used as the first pipeline step.
    """

    def __init__(self, preprocessing):
        self.model = None
        self.preprocessing = preprocessing

    def load_model(self, path):
        """Restore a previously saved model from ``path`` into ``self.model``."""
        # joblib files are pickles: only load files from a trusted source.
        self.model = joblib.load(path)

    def train(self, train_data):
        """Fit the preprocessing + random-forest pipeline via grid search.

        Args:
            train_data: DataFrame containing the features and the
                ``lengthofstay`` label column.
        """
        training_data = train_data.drop("lengthofstay", axis=1)
        training_labels = train_data["lengthofstay"].copy()

        full_pipeline = Pipeline([
            ("preprocessing", self.preprocessing),
            ("random_forest", RandomForestRegressor(random_state=42)),
        ])
        # Author's note (translated): so far 10 max features worked best.
        param_grid = [
            {'random_forest__max_features': [16]}
        ]
        self.model = GridSearchCV(full_pipeline, param_grid, cv=2,
                                  scoring='neg_root_mean_squared_error', verbose=2)
        self.model.fit(training_data, training_labels)

    def measure(self, data):
        """Print RMSE and R^2 of the model on ``data`` and plot a comparison.

        Args:
            data: DataFrame containing the features and ``lengthofstay``.
        """
        print("measuring...")
        # Evaluate on the frame that was passed in; the original silently read
        # the notebook-level ``train_data`` and ignored this argument.
        X = data.drop("lengthofstay", axis=1)
        y = data["lengthofstay"].copy()

        pred = self.model.predict(X)

        # Square root taken manually: ``squared=False`` is deprecated/removed in
        # newer scikit-learn releases.
        print("mean squared: ", mean_squared_error(y, pred) ** 0.5)
        print("r2: ", r2_score(y, pred))
        self.compare(data, pred)

    def compare(self, data, pred, start=2000, stop=2100):
        """Scatter-plot predictions (red) against labels (blue) for a row window.

        Args:
            data: DataFrame with ``id`` and ``lengthofstay`` columns.
            pred: Predictions, positionally aligned with the rows of ``data``.
            start: First row position of the plotted window (default 2000).
            stop: Row position one past the end of the window (default 2100).
        """
        # Positional window so predictions, labels and ids stay aligned.
        x = data["id"].iloc[start:stop]
        labels = data["lengthofstay"].iloc[start:stop]
        pred = pred[start:stop]

        plt.scatter(x, pred, color="red", alpha=0.5)
        plt.scatter(x, labels, color="blue", alpha=0.5)
        # Axis labels
        plt.xlabel('X-Koordinaten')
        plt.ylabel('Y-Koordinaten')

        # Title
        plt.title('Streudiagramm')

        # Show the figure
        plt.show()

    def to_csv(self, data, name="rf"):
        """Predict for ``data`` and write rounded results to ``./output/<name>.csv``.

        Args:
            data: DataFrame with an ``id`` column plus the model's input columns.
            name: File name without extension (default ``"rf"``).
        """
        import os

        rounded = [round(value) for value in self.model.predict(data)]
        # Read the id column directly instead of scanning every cell of every row.
        submission = pd.DataFrame({'id': data["id"].tolist(),
                                   'lengthofstay': rounded})
        os.makedirs("./output", exist_ok=True)
        submission.to_csv("./output/" + name + ".csv", index=False)


Best config: 
{'random_forest__max_depth': 36, 'random_forest__max_features': 16, 'random_forest__min_samples_leaf': 1} 
Result 5 folds:  
mean squared:  0.24090335119740258  
r2:  0.9895520308661191  

Result 2 Folds:  
mean squared:  0.24090335119740258  
r2:  0.9895520308661191

Result 10 Folds: 
mean squared:  0.24090335119740258
r2:  0.9895520308661191


In [None]:
import joblib
import os 
def train_and_save():
    """Train the random forest, report its scores, export predictions and save it.

    Uses the notebook-level ``preprocessing``, ``train_data`` and ``test_data``.
    Writes ``./output/final grid.csv`` and ``models/final.pkl``.

    Returns:
        The trained ``random_forrest`` wrapper, so callers can inspect it
        (the original returned nothing).
    """
    # Author's note (translated): 3 grids with cv 2 = 0.24126; and 0.98952
    rf = random_forrest(preprocessing)
    rf.train(train_data)
    rf.measure(train_data)
    # The original passed ``test``, which only exists as a local variable inside
    # data(); the notebook-level frame is ``test_data``.
    rf.to_csv(test_data, "final grid")

    os.makedirs("./models", exist_ok=True)
    joblib.dump(rf.model, "models/final.pkl")
    return rf

In [None]:
def printModelInfo(forest=None):
    """Print the grid-search results (best score first) and the best parameters.

    Args:
        forest: Object whose ``model`` attribute exposes ``cv_results_`` and
            ``best_params_``. When omitted, the notebook-level ``rf`` is used,
            which is what the original relied on.
    """
    if forest is None:
        # NOTE(review): no notebook-level ``rf`` is defined in the visible
        # cells; pass the wrapper explicitly to avoid a NameError.
        forest = rf
    cv_res = pd.DataFrame(forest.model.cv_results_).sort_values(
        by="mean_test_score", ascending=False)
    print(cv_res)
    print(forest.model.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score


class ensemble:
    """Voting ensemble (random forest + SVC) behind a preprocessing step.

    Instance fields:
        pipeline: The fitted ``Pipeline``; ``None`` until ``train`` has run.
        preprocessing: Transformer used as the first pipeline step.
    """

    def __init__(self, prep):
        self.pipeline = None
        self.preprocessing = prep

    def train(self, data):
        """Fit the preprocessing + voting-classifier pipeline on ``data``.

        Args:
            data: DataFrame containing the features and ``lengthofstay``.
        """
        print("training...")
        # Use the frame that was passed in; the original ignored this argument
        # and always read the notebook-level ``train_data``.
        X = data.drop("lengthofstay", axis=1)
        y = data["lengthofstay"].copy()

        # NOTE(review): these are classifiers, so every distinct value of
        # ``lengthofstay`` is treated as a class label.
        voting_clf = VotingClassifier(
            estimators=[
                #('lr', LogisticRegression(random_state=42,max_iter=1000)),
                ('rf', RandomForestClassifier(random_state=42)),
                ('svc', SVC(random_state=42))
            ]
        )

        self.pipeline = Pipeline([
            ('prep', self.preprocessing),
            ('voting_classifier', voting_clf)
        ])

        self.pipeline.fit(X, y)

    def measure(self, data):
        """Print RMSE and R^2 of the fitted pipeline on ``data``.

        Args:
            data: DataFrame containing the features and ``lengthofstay``.
        """
        print("measuring...")
        # Evaluate on the argument rather than on the global ``train_data``.
        X = data.drop("lengthofstay", axis=1)
        y = data["lengthofstay"].copy()

        pred = self.pipeline.predict(X)

        # Square root taken manually: ``squared=False`` is deprecated/removed in
        # newer scikit-learn releases.
        print("mean squared: ", mean_squared_error(y, pred) ** 0.5)
        print("r2: ", r2_score(y, pred))

    def compare(self, data, start=2000, stop=2100):
        """Scatter-plot predictions (red) against labels (blue) for a row window.

        Args:
            data: DataFrame with ``id``, ``lengthofstay`` and the feature columns.
            start: First row position of the plotted window (default 2000).
            stop: Row position one past the end of the window (default 2100).
        """
        print("comparing....")
        features = data.drop("lengthofstay", axis=1)
        pred = self.pipeline.predict(features)

        # Positional window so predictions, labels and ids stay aligned.
        x = data["id"].iloc[start:stop]
        labels = data["lengthofstay"].iloc[start:stop]
        pred = pred[start:stop]

        plt.scatter(x, pred, color="red", alpha=0.5)
        plt.scatter(x, labels, color="blue", alpha=0.5)
        # Axis labels
        plt.xlabel('X-Koordinaten')
        plt.ylabel('Y-Koordinaten')

        # Title
        plt.title('Streudiagramm')

        # Show the figure
        plt.show()

    def to_csv(self, data, name="ensemble"):
        """Predict for ``data`` and write rounded results to ``./output/<name>.csv``.

        Args:
            data: DataFrame with an ``id`` column plus the pipeline's input columns.
            name: File name without extension (default ``"ensemble"``).
        """
        import os

        rounded = [round(value) for value in self.pipeline.predict(data)]
        # Read the id column directly instead of scanning every cell of every row.
        submission = pd.DataFrame({'id': data["id"].tolist(),
                                   'lengthofstay': rounded})
        os.makedirs("./output", exist_ok=True)
        submission.to_csv("./output/" + name + ".csv", index=False)

In [None]:
#e = ensemble(preprocessing)
#e.train(train_data)
#e.measure(train_data)
#e.compare(train_data)

In [3]:
# Install gradio into a directory under /kaggle/working rather than the default
# site-packages (original note: "install forever" - presumably so the install
# survives between sessions; confirm). A later cell appends this directory to
# sys.path before importing gradio.
!pip install gradio --target=/kaggle/working/mysitepackages


Collecting gradio
  Downloading gradio-3.44.4-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Downloading altair-5.1.1-py3-none-any.whl (520 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m520.6/520.6 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi (from gradio)
  Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio-client==0.5.1 (from gradio)
  Downloading gradio_client-0.5.1-py3-none-any.whl (298 kB)
[2K     [

In [5]:
#load model and install gradio 
def load_model(path="/kaggle/input/final-random-forrest/final.pkl"):
    """Load a persisted model with joblib and return it.

    Args:
        path: Location of the saved model file. Defaults to the Kaggle input
            path the notebook was written against.

    Returns:
        The object stored in the file.
    """
    import joblib
    # joblib files are pickles: only load files from a trusted source.
    return joblib.load(path)
# Load the persisted model once; predict() further down reads this notebook-level name.
model = load_model()

#%pip install gradio > /dev/null
# Make the directory that was used as the pip --target location importable
import sys
sys.path.append('/kaggle/working/mysitepackages')

# Imported only after the sys.path change above
import gradio as gr 

In [6]:
 

import pandas as pd
# Re-read the raw training CSV. This rebinds the notebook-level train_data
# (not sorted by id here); predict() takes its template row from this frame.
train_data = pd.read_csv("/kaggle/input/dat1582/training_data.csv")
def predict(bmi,sex,asthma,iron,creat):
    """Predict the length of stay for one patient described by a few inputs.

    The first row of the notebook-level ``train_data`` serves as a template;
    only the five columns below are overridden, every other column keeps the
    template's value.

    Args:
        bmi: Value for the ``bmi`` column.
        sex: Value for the ``gender`` column (the UI offers "M" / "F").
        asthma: Value for the ``asthma`` column.
        iron: Value for the ``irondef`` column.
        creat: Value for the ``creatinine`` column.

    Returns:
        The model's prediction truncated to an ``int``.
    """
    # ``assign`` returns a new frame, so ``train_data`` is never written to and
    # no SettingWithCopy warning is triggered. The original also hard-coded
    # gender to "M" and ignored ``sex``.
    standard_data = train_data.iloc[[0]].assign(
        bmi=bmi,
        gender=sex,
        asthma=asthma,
        irondef=iron,
        creatinine=creat,
    )

    pred = model.predict(standard_data)
    ergebnis = int(float(pred[0]))

    return ergebnis
  



# Manual check of predict() with sample inputs; the cell output shows the result.
predict(22.5,"M",False,False,0.5)

9

In [7]:
# Doesn't work on Kaggle, but works on Hugging Face Spaces: https://huggingface.co/spaces/JonasFeierabend/DAT158


# Set the minimum, maximum, and default values for the sliders

bmi_min, bmi_max, bmi_default = 15, 50, 25


# Create the interface. The input order must match the parameter order of
# predict(bmi, sex, asthma, iron, creat).
iface = gr.Interface(
    fn=predict, 
    inputs=[
        gr.components.Slider(minimum=bmi_min, maximum=bmi_max, value=bmi_default, label="BMI"),
        gr.components.Dropdown(choices=["M", "F"],label="Geschlecht",value="M"),
        gr.components.Checkbox(label="Asthma"),
        gr.components.Checkbox(label="Irondefizit"),
        # This slider feeds the "creatinine" feature (the label used to read "Creatine").
        gr.components.Slider(minimum=0.1, maximum=2.5, value = 0.5, label="Creatinine")
    ], 
    outputs=gr.components.Textbox(label="Prediction"),
    # Title and description previously came from an unrelated diabetes demo and
    # described inputs, data and a model that this notebook does not use.
    title="Length of Stay Predictor",
    description="""Enter BMI, sex, asthma, iron deficiency and creatinine to predict the length of stay.
    Model: Random Forest Regressor""",
)

# Launch the interface. share=True also requests a temporary public URL.
iface.launch(share=True)


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://6945474e46d8079325.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


