## Using partial_fit()

Our goal is to find a model that can incorporate new training data without retraining the entire model from scratch.

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn import metrics
import seaborn as sns
import random

from os import listdir
from os.path import isfile, join

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import csv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import pickle

In [6]:
# SGDClassifier has a partial_fit() method.
# SGDClassifier can work as Logistic Regression or as an SVM - it depends on the "loss" parameter.
from sklearn.linear_model import SGDClassifier

In [2]:
# Load the labelled feature table; the cells below slice it into
# initial-training, additional-training and test subsets.
data_file = 'labels_and_features_20.csv'
df_work = pd.read_csv(data_file)

Let's pretend that we initially have only part of our dataset and that some additional data arrives later.

In [14]:
# Split the table by row position to mimic data arriving in stages.
n_init = 1700    # rows available for the first training run
n_extra = 100    # rows that "arrive" afterwards

init_data = df_work.iloc[:n_init]                         # 1700 samples
additional_data = df_work.iloc[n_init:n_init + n_extra]   # 100 samples
test_data = df_work.iloc[n_init + n_extra:]               # all remaining rows (197 samples per the original note)

In [15]:
# Names of the columns passed to the scaler and the classifier as features.
fs = [
    'q_Imax',
    'Imax_over_Imean',
    'Imax_over_Imean_local',
    'fluctuation_strength',
    'low_q_ratio',
    'high_q_ratio',
    'Imax_over_Ilowq',
    'Imax_over_Ihighq',
    'Ilowq_over_Ihighq',
]

In [39]:
# --- Stage 1: fit the scaler and the classifier on the initial batch -------
# The scaler is fitted on the initial data only and then reused unchanged for
# the additional and test data, so every batch gets the same standardisation.
scaler = preprocessing.StandardScaler()
scaler.fit(init_data[fs])

# The test set is scored twice below, so scale it once up front.
X_test = scaler.transform(test_data[fs])
y_test = test_data['bad_data']

# SGDClassifier with the 'log' loss is used here as a logistic regression.
# NOTE(review): newer scikit-learn releases appear to spell this loss
# 'log_loss' - confirm against the installed version before changing it.
logreg = SGDClassifier(loss='log', random_state=101)
logreg.fit(scaler.transform(init_data[fs]), init_data['bad_data'])
print('The model after initial training:', logreg.coef_ )

scores = logreg.score(X_test, y_test)
print('Accuracy on testing set', scores, sep='\n')
print('\n')

# --- Stage 2: round-trip the fitted model through pickle -------------------
# Saving and reloading mimics picking the model up again in a later session.
# `logreg2` is an independent copy; `logreg` keeps the initial-fit weights.
model_path = 'my_dumped_classifier.pkl'
with open(model_path, 'wb') as dump_file:
    pickle.dump(logreg, dump_file)
with open(model_path, 'rb') as load_file:
    logreg2 = pickle.load(load_file)

# --- Stage 3: incremental update with partial_fit() ------------------------
# Only the additional batch is passed in; the initial data is not revisited.
logreg2.partial_fit(
    scaler.transform(additional_data[fs]),
    additional_data['bad_data'],
    classes=[True, False],
)

print('The model after additional training:', logreg2.coef_ )
scores = logreg2.score(X_test, y_test)
print('Accuracy on testing set', scores, sep='\n')

The model after initial training: [[ -3.94161337 -11.0369695   -0.63127216   4.37763106  -8.30183662
    8.30183662   7.29089711   2.24619321  -4.13462304]]
Accuracy on testing set
0.989847715736


The model after additional training: [[ -3.71695557 -11.08426576  -1.57611988   3.69125606  -8.52898932
    8.52898932   5.51380811   3.03528943  -2.92655993]]
Accuracy on testing set
0.994923857868
