# Exemplo extendendo a classe

In [1]:
from baseLayer import base
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('reddit_posts.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
X = df[['gilded', 'num_comments', 'num_crossposts', 'ups']]
y = df['subreddit']

In [3]:
class ZeroTransform(base.LayerTransformSklearn):
    def _function_transform(self,X,column_name="ups"):
        X[column_name] = 0
        return X

In [4]:
zero_transform = ZeroTransform()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
ss = StandardScaler()

# Fit scaler to training data
ss.fit(X_train)

# Transform train and test X data and save as new variables
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# Instantiate our logistic regression object
logreg = LogisticRegression()

# Fit logistic regression to scaled training data
logreg.fit(X_train_scaled, y_train)

# Score logistic regression on scaled testing data
logreg.score(X_test_scaled, y_test)

0.8524590163934426

In [7]:
pipeline = Pipeline([
    ('ss', StandardScaler()), # tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

0.8524590163934426

### Adicionado o novo transform no pipeline

In [8]:
pipeline = Pipeline([
    ('zero',zero_transform),
    ('ss', StandardScaler()),# tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0


0.6905737704918032

### Testando com dois transform

In [9]:
class ZeroTransformColumn(base.LayerTransformSklearn):
    def _function_transform(self,X,column_name="ups",**paramets):
        X[column_name] = 0
        
        X[paramets['column']] = 0
        return X

In [10]:
zero2 = ZeroTransformColumn(column='ups')

In [11]:
pipeline2 = Pipeline([
    ('zero',zero_transform),
    ('zero2',zero2),
    ('ss', StandardScaler()),# tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline2.fit(X_train, y_train)

# Score pipeline on testing data
pipeline2.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[paramets['column']] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

0.6905737704918032

### Anulando duas colunas

In [12]:
pipeline2 = Pipeline([
    ('zero',zero_transform),
    ('zero2', ZeroTransformColumn(column='num_comments')),
    ('ss', StandardScaler()),# tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline2.fit(X_train, y_train)

# Score pipeline on testing data
pipeline2.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[paramets['column']] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

0.5409836065573771

### Fazendo teste de grid

In [13]:
parameters = {
    'zero__column_name': ('gilded', 'num_comments', 'num_crossposts', 'ups'),
}

In [14]:
gs = GridSearchCV(pipeline, parameters)

In [15]:
# Fit the grid search model to the training data
gs.fit(X_train, y_train)

# Score the grid search model with the testing data
gs.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

0.5409836065573771

In [16]:
gs.best_params_

{'zero__column_name': 'gilded'}