# Exemplo extendendo a classe

In [1]:
from baseLayer import base
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('reddit_posts.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
X = df[['gilded', 'num_comments', 'num_crossposts', 'ups']]
y = df['subreddit']

In [3]:
len(X[X["ups"] < 5])

0

In [4]:
len(X[X["gilded"] < 5])

1947

In [5]:
class ZeroTransform(base.LayerFitTransformSklearn):
    def _function_fit(self,X,y,limir=2000):
        response = list()
        for column in X.columns:
            if len(X[X[column] < 5]) < limir:
                response.append(column)
        return {"column_name":response}
    
    def _function_transform(self,X,column_name):
        for column in column_name:
            X[column_name] = 0
        return X

In [6]:
zero_transform = ZeroTransform()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
ss = StandardScaler()

# Fit scaler to training data
ss.fit(X_train)

# Transform train and test X data and save as new variables
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# Instantiate our logistic regression object
logreg = LogisticRegression()

# Fit logistic regression to scaled training data
logreg.fit(X_train_scaled, y_train)

# Score logistic regression on scaled testing data
logreg.score(X_test_scaled, y_test)

0.8524590163934426

In [9]:
pipeline = Pipeline([
    ('ss', StandardScaler()), # tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

0.8524590163934426

### Adicionado o novo transform no pipeline

In [10]:
pipeline = Pipeline([
    ('zero',zero_transform),
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

0.47540983606557374

### Fazendo teste de grid

In [11]:
parameters = {
    'zero__limir': (0,500,1000,2000,3000),
}

In [12]:
gs = GridSearchCV(pipeline, parameters)

In [13]:
# Fit the grid search model to the training data
gs.fit(X_train, y_train)

# Score the grid search model with the testing data
gs.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

0.47540983606557374

In [17]:
gs.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
gs.best_params_

{'zero__limir': 0}

### Fazendo pickle do modelo

Verifique o arquivo LoadGridExample

In [15]:
import pickle

In [16]:
with open("grid_pickle_test", 'wb') as handle:
    pickle.dump(gs, handle, protocol=pickle.HIGHEST_PROTOCOL)