In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
import pickle

In [2]:
# Load the iris data as a pair of arrays: `f` holds the measurements, `t` the class labels.
(f, t) = load_iris(return_X_y=True)

In [3]:
# Append the label vector as one extra trailing column next to the measurements.
# column_stack turns the 1-D `t` into a column before stacking.
dataset = np.column_stack((f, t))

In [4]:
# Column labels for the stacked array: four measurements, then the class label last.
columns = [
    "sepal length (cm)", "sepal width (cm)",
    "petal length (cm)", "petal width (cm)",
] + ["category"]

In [5]:
# Wrap the stacked array in a labelled DataFrame (rebinds `dataset`).
dataset = pd.DataFrame(data=dataset, columns=columns)

In [6]:
# Peek at three random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
dataset.sample(3, random_state=42)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),category
78,6.0,2.9,4.5,1.5,1.0
28,5.2,3.4,1.4,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0


In [7]:
# Regression setup: predict sepal length from every other column
# (the remaining measurements plus the numeric class label).
target = dataset["sepal length (cm)"]
features = dataset.drop(columns=["sepal length (cm)"])

##### Train/test split

In [8]:
# Hold out 25% of the rows for evaluation; the seed makes the split repeatable.
(trainX, testX, trainy, testy) = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

#### Let's try building a simple linear model on this raw (unpreprocessed) dataset

In [9]:
# Baseline: ordinary least squares on the unscaled training split.
linear_model = LinearRegression()
linear_model = linear_model.fit(trainX, trainy)  # fit() returns the estimator itself

In [10]:
# Score of the baseline on the held-out rows (R^2 for scikit-learn regressors).
linear_model.score(X=testX, y=testy)

0.8644174403751941

##### A simple linear model performs quite well without any preprocessing.
Let's save this model before moving forward.

In [11]:
# Persist the baseline model fitted on the unscaled features.
model_name = "models/iris_model.sav"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open(...) passed to dump() is never explicitly closed.
with open(model_name, 'wb') as model_file:
    pickle.dump(linear_model, model_file)

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Let's try rescaling the data to see what it looks like.
# MinMaxScaler rescales each column independently (min-max scaling, not
# standardisation). The result comes back as a plain array, so the DataFrame
# built from it has integer column labels 0..3.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling; fit it on the training split only
# if the resulting score is meant to be reported.
featuresScaled = pd.DataFrame(MinMaxScaler().fit_transform(features))

# Column 3 is the scaled "category" label (the fourth column of `features`).
# Drop it without mutating in place so only the three measurements remain.
featuresScaled = featuresScaled.drop(columns=3)

# Summary statistics as the cell's last expression, so they are actually displayed.
featuresScaled.describe()

In [13]:
# Same 75/25 split and seed as for the unscaled data, applied to the scaled features.
(trainXScaled, testXScaled, trainyScaled, testyScaled) = train_test_split(
    featuresScaled,
    target,
    test_size=0.25,
    random_state=42,
)

In [14]:
# Fresh linear regression trained on the scaled measurements (rebinds `linear_model`).
linear_model = LinearRegression()
linear_model.fit(trainXScaled, trainyScaled)
# Score on the scaled held-out rows; as the last expression it is the cell's output.
linear_model.score(X=testXScaled, y=testyScaled)

0.8613069732650587

In [15]:
# Display the predictor frame (every column of `dataset` except the
# sepal-length target) to check which columns it still holds.
features

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),category
0,3.5,1.4,0.2,0.0
1,3.0,1.4,0.2,0.0
2,3.2,1.3,0.2,0.0
3,3.1,1.5,0.2,0.0
4,3.6,1.4,0.2,0.0
...,...,...,...,...
145,3.0,5.2,2.3,2.0
146,2.5,5.0,1.9,2.0
147,3.0,5.2,2.0,2.0
148,3.4,5.4,2.3,2.0


In [16]:
# Remove the class label so only the three flower measurements remain as predictors.
features_processed = features.drop(columns="category")

# Same 75/25 split as before. The seed matches the earlier splits so the score
# and the saved model are reproducible across runs instead of changing each time.
X_train_pro, X_test_pro, y_train_pro, y_test_pro = train_test_split(
    features_processed,
    target,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Re-fit the existing estimator object on the measurement-only training split.
linear_model.fit(X=X_train_pro, y=y_train_pro)

LinearRegression()

In [18]:
# Score of the re-fitted model on the measurement-only held-out rows.
linear_model.score(X=X_test_pro, y=y_test_pro)

0.8341379983934725

In [19]:
# Persist the model fitted on the measurement-only features.
model_name = "models/iris_final_model.pkl"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open(...) passed to dump() is never explicitly closed.
with open(model_name, 'wb') as model_file:
    pickle.dump(linear_model, model_file)

#### Let's try an SVM and a bunch of other models

In [40]:
from sklearn.svm import SVR

# Support-vector regressor with default settings, trained on the scaled split.
svm = SVR().fit(trainXScaled, trainyScaled)
svm


SVR()

In [41]:
# Score of the support-vector regressor on the scaled held-out rows.
svm.score(X=testXScaled, y=testyScaled)

0.830665538889931