In [41]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score, recall_score
rcParams["figure.figsize"] = 10, 5
%matplotlib inline

In [22]:
def GetOccupancyData(filename):
    occupancy = pd.read_csv(filename)
    occupancy['date'] = pd.to_datetime(occupancy['date'])
    occupancy['date_day'] = occupancy['date'].dt.day
    occupancy['date_hour'] = occupancy['date'].dt.hour
    occupancy = occupancy.drop('date', axis=1)
    swap_list = ["date_day","date_hour","Temperature", "Humidity", "Light", "CO2", "HumidityRatio","Occupancy"]
    occupancy = occupancy.reindex(columns=swap_list)
    y = occupancy.iloc[:,7]
    X = occupancy.iloc[:,:7]
    return X,y

In [31]:
def GetBanknoteData():
    banknote = pd.read_csv("data_banknote_authentication.txt")
    y = banknote.iloc[:,4]
    X = banknote.iloc[:,:4]
    return X,y

In [42]:
class MLProblem:
    def __init__(self, performance_measure, train_size, validation_size, test_size):
        self.performance_measure = performance_measure
        self.train_size = train_size
        self.validation_size = validation_size
        self.test_size = test_size
        
    def PerformanceMeasure(self, y_true, y_pred):
        if self.performance_measure == 'r2_score':
            return r2_score(y_true, y_pred)
        elif self.performance_measure == 'f1_score':
            return f1_score(y_true, y_pred)
        elif self.performance_measure == 'precision_score':
            return precision_score(y_true, y_pred)
        elif self.performance_measure == 'recall_score':
            return recall_score(y_true, y_pred)
        
    def SplitData(self, X, y):
        # Now we'll use 'train_test_split' from sklearn
        # to split the data into training and testing sets
        random_state = 0
        self.validation_size = self.validation_size / (1-self.test_size)
        
        # 'train_test_split' convinience function
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=random_state)
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=self.validation_size, random_state=random_state)
        return X_train,y_train,X_validation,y_validation,X_test,y_test
    
    def StandarizeData(self, sc, X):
        X_std = sc.transform(X)
        return X_std;
    
    def TrainModels(self, models, X_train, y_train):
        for model in models:
            model.fit(X_train, y_train)
    
    def SelectModel(self, models, X_test, y_test):
        best_model = models[0]
        best_score = -1
        for model in models:
            y_pred = model.predict(X_test)
            score = self.PerformanceMeasure(y_test, y_pred)
            if score > best_score:
                best_model = model
                best_score = score
        return best_model, best_score
    
    def ValidateModel(self, model, X_validation, y_validation):
        y_pred = model.predict(X_validation)
        validation_measure = self.PerformanceMeasure(y_validation, y_pred)
        return validation_measure
    
    def Solve(self, X, y): 
        X_train,y_train,X_validation,y_validation,X_test,y_test = self.SplitData(X, y)
        sc = StandardScaler()
        sc.fit(X_train)
        
        X_train_std = self.StandarizeData(sc, X_train)
        X_validation_std = self.StandarizeData(sc, X_validation)
        X_test_std = self.StandarizeData(sc, X_test)
        
        lgr = LogisticRegression()
        svc = SVC(kernel='linear', C=0.5)
        models = [lgr, svc]
        self.TrainModels(models, X_train_std, y_train)
        best_model, model_score = self.SelectModel(models, X_validation_std, y_validation)
        y_pred = best_model.predict(X_test_std)
        return best_model, self.PerformanceMeasure(y_test, y_pred)

In [43]:
def SolveOccupancyProblem(performance_measure, train_size, validation_size, test_size):
    X,y = GetOccupancyData("datatraining.txt")
    X_2,y_2 = GetOccupancyData("datatest.txt")
    X_3, y_3 = GetOccupancyData("datatest2.txt")
    X_frames = [X,X_2,X_3]
    y_frames = [y,y_2,y_3]
    X = pd.concat(X_frames)
    y = pd.concat(y_frames)
    occupancy_problem = MLProblem(performance_measure, train_size, validation_size, test_size)
    return occupancy_problem.Solve(X,y)

In [44]:
def SolveBanknoteProblem(performance_measure, train_size, validation_size, test_size):
    X,y = GetBanknoteData()
    banknote_problem = MLProblem(performance_measure, train_size, validation_size, test_size)
    return banknote_problem.Solve(X,y)

Some examples:

In [26]:
SolveOccupancyProblem('r2_score', 0.7, 0.2, 0.1)

(SVC(C=0.5, kernel='linear'), 0.9338221196570192)

In [33]:
SolveBanknoteProblem('r2_score', 0.7, 0.2, 0.1)

(LogisticRegression(), 0.8512931034482758)

In [34]:
SolveOccupancyProblem('f1_score', 0.7, 0.2, 0.1)

(SVC(C=0.5, kernel='linear'), 0.9745293466223698)

In [35]:
SolveBanknoteProblem('f1_score',0.7,0.2,0.1)

(LogisticRegression(), 0.9586776859504132)

# Banknote problem

For the banknote problem, we assume this ML model will be used in an ATM. The banknotes inserted into the ATM may not be of the same quality as the ones used for training the model, therefore we need to use just 70% percent of our data to train the model, so that it can recognize folded and old banknotes. The remaining data will be used for validate and test our ML model, so they will have 20% and 10% percent of the data respectively. The metric used compare the performance across different ML models will be `precision_score()` because we want our model to not fail in the false positives rather than not fail in the true positives banknotes. This is because, is more important to reject a false banknote than accept a true one.

In [39]:
SolveBanknoteProblem('precision_score', 0.7,0.2,0.1)

(LogisticRegression(), 0.9206349206349206)

# Occupancy problem

For the occupancy problem, we assume this ML model will be used in a university classrooms. The amount of students that use the classroom in each class will vary each term so we will use just 60% of the data to train our model, as we don't want it to learn the specific amount of people present in the classroom in different hours. The remaining data will be used to validate and test our model, then they will have both 20% of the data. As it will be used for university classrooms, there will be certain specific hours where it will be used, hence we need just the date and hour in our dataset to train the model. Finally, the metric used to compare the performance across different models will be `recall_score()` because we want our model to not fail in the true positives samples rather than the false positives ones, as it's more important to turn on the lights when it's needed than when it's not.

In [45]:
SolveOccupancyProblem('recall_score', 0.6, 0.2, 0.2)

(LogisticRegression(), 0.9958027282266527)

# Validación cruzada

In [56]:
percentages = [[0.8, 0.1, 0.1], [0.65,0.2,0.15], [0.8, 0.15,0.05], [0.9, 0.05, 0.05], [0.6,0.2,0.2], [0.6,0.3,0.1], [0.6,0.1,0.3]]

In [59]:
banknote_accuracy = []
occupancy_accuracy = []
for p in percentages:
    banknote_accuracy.append(SolveBanknoteProblem('precision_score', p[0], p[1], p[2])[1])
    occupancy_accuracy.append(SolveOccupancyProblem('recall_score', p[0], p[1], p[2])[1])
    
print("ML for banknote problem average result: ", np.average(banknote_accuracy))
print("ML for occupancy problem average result: ", np.average(occupancy_accuracy))

ML for banknote problem average result:  0.9285410657163905
ML for occupancy problem average result:  0.9935472497002144
