In [None]:
!pip install pandas
!pip install numpy
!pip install sklearn

In [1]:
import pandas as pd 
import numpy as np

#Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

#Modello
from sklearn.tree import DecisionTreeClassifier

#Metrica
from sklearn.metrics import accuracy_score

In [2]:
# Read the training and test sets from CSV files located in the working directory.
train_data, test_data = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [3]:
# Feature engineering on the columns that hold dates
def date_features(df):
    """Add calendar features derived from 'ScheduledDay' and 'AppointmentDay'.

    The frame is modified in place; nothing is returned.

    Both source columns are parsed with ``pd.to_datetime(errors='coerce')``,
    so unparseable values become NaT and the derived features become NaN
    for those rows.

    Columns added:
        Appointment_day_of_week: weekday of the appointment (Monday = 0).
        Appointment_day:         day of the month of the appointment.
        Appointment_month:       month of the appointment.
        Appointment_year:        year of the appointment.
        Appointment_awaiting:    whole calendar days between the scheduling
                                 date and the appointment date.

    Args:
        df: DataFrame containing 'ScheduledDay' and 'AppointmentDay' columns.
    """
    df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'], errors='coerce')
    df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'], errors='coerce')

    appointment = df['AppointmentDay']
    df['Appointment_day_of_week'] = appointment.dt.dayofweek
    df['Appointment_day'] = appointment.dt.day
    df['Appointment_month'] = appointment.dt.month
    df['Appointment_year'] = appointment.dt.year

    # Subtracting day-of-year values is only valid when both dates fall in the
    # same calendar year (30 Dec -> 2 Jan would yield -362 instead of 3).
    # Compare the actual calendar dates instead: strip any timezone (keeping
    # the wall-clock date) and the time of day, then take the day difference.
    appointment_date = appointment.dt.tz_localize(None).dt.normalize()
    scheduled_date = df['ScheduledDay'].dt.tz_localize(None).dt.normalize()
    df['Appointment_awaiting'] = (appointment_date - scheduled_date).dt.days

In [4]:
# Encoder
def feature_encoder(df):
    """Replace selected categorical columns with integer codes, in place.

    The 'No-show', 'Gender' and 'Appointment_year' columns are each passed
    through ``LabelEncoder.fit_transform`` and the encoded values are written
    back into ``df``. The encoding is fitted on the frame that is passed in.
    Nothing is returned.

    Args:
        df: DataFrame containing the three columns listed above.
    """
    for column in ('No-show', 'Gender', 'Appointment_year'):
        # fit_transform refits from scratch, so a fresh encoder per column
        # yields the same codes as reusing one instance.
        df[column] = LabelEncoder().fit_transform(df[column])

In [5]:
# Imputer
def feature_imputer(df):
    """Fill missing values in ``df`` in place.

    Numeric columns (int64 / float64) are filled with the column mean and
    object / bool columns with the most frequent value (the smallest one on
    ties). A column that contains only missing values is left unchanged,
    because there is nothing to derive a fill value from.

    The previous implementation only constructed an imputing
    ColumnTransformer and discarded it without fitting or applying it, so no
    value was ever imputed.

    Args:
        df: DataFrame to impute; modified in place.

    Returns:
        None. The notebook passes this function's return value as a Pipeline
        step, so the return value is deliberately kept as None.
    """
    cat_features = df.select_dtypes(include=['object', 'bool']).columns.tolist()
    num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    for column in num_features:
        # Skip complete columns so their dtype is never touched.
        if df[column].isna().any():
            df[column] = df[column].fillna(df[column].mean())

    for column in cat_features:
        if df[column].isna().any():
            # mode() ignores missing values and returns candidates sorted,
            # so the first entry is the most frequent value.
            most_frequent = df[column].mode(dropna=True)
            if not most_frequent.empty:
                df[column] = df[column].fillna(most_frequent.iloc[0])

In [6]:
# Preprocess the training data.
# NOTE(review): each helper mutates train_data in place and returns None, so the
# preprocessing happens while this list of steps is being built; the resulting
# Pipeline only holds None steps and is not what transforms the data.
train_pipe = Pipeline(steps=[
    (step_name, preprocess(train_data))
    for step_name, preprocess in (
        ('date', date_features),
        ('encoder', feature_encoder),
        ('imputer', feature_imputer),
    )
])

In [7]:
# Preprocess the test data.
# NOTE(review): each helper mutates test_data in place and returns None, so the
# preprocessing happens while this list of steps is being built; the resulting
# Pipeline only holds None steps and is not what transforms the data.
test_pipe = Pipeline(steps=[
    (step_name, preprocess(test_data))
    for step_name, preprocess in (
        ('date', date_features),
        ('encoder', feature_encoder),
        ('imputer', feature_imputer),
    )
])

In [8]:
# Columns removed before training: the 'No-show' target, plus identifiers,
# raw date columns and other columns not used as model inputs.
cols_to_drop = [
    'No-show',
    'PatientId',
    'AppointmentID',
    'AppointmentDay',
    'ScheduledDay',
    'Age',
    'Neighbourhood',
]

In [9]:
# Build the model inputs (every column not listed in cols_to_drop) and the
# single-column 'No-show' target frames for the training and test sets.
X_train = train_data.drop(columns=cols_to_drop)
X_test = test_data.drop(columns=cols_to_drop)
y_train = train_data['No-show'].to_frame()
y_test = test_data['No-show'].to_frame()

In [20]:
# Instantiate the DecisionTreeClassifier: a single decision tree (not a forest)
# with maximum depth 10, using information gain ('entropy') as the split
# criterion and a fixed random_state so results are repeatable.
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=42,
)

In [21]:
# Train the tree; ravel() flattens the single-column target frame into a 1-D label array.
decision_tree.fit(X=X_train, y=y_train.to_numpy().ravel())

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [23]:
# Predict on both splits; only the test predictions are scored below.
train_prediction = decision_tree.predict(X=X_train)
test_prediction = decision_tree.predict(X=X_test)

# Fraction of test rows whose predicted label matches the true 'No-show' label.
accuracy = accuracy_score(y_true=y_test, y_pred=test_prediction)
print('Decision Tree accuracy score: {:.3f}'.format(accuracy))

Decision Tree accuracy score: 0.800
