# Exemplo de como usar BaseLayer

In [1]:
from baseLayer import base
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [2]:
iris = datasets.load_iris()

### Exemplo usando simple função


In [3]:
def zero_guilded(X):
    X["ups"] = 0
    return X

In [4]:
zero_transform = base.SimpleLayerPipeline(function=zero_guilded)

### Testetando pipeline

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [6]:
df = pd.read_csv('reddit_posts.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
X = df[['gilded', 'num_comments', 'num_crossposts', 'ups']]
y = df['subreddit']

In [7]:
X

Unnamed: 0,gilded,num_comments,num_crossposts,ups
0,64,5117,10,139149
1,1,844,1,20255
2,1,4745,0,91308
3,0,1718,0,19882
4,2,1948,5,89392
...,...,...,...,...
1945,0,148,0,8945
1946,0,389,0,8941
1947,0,130,0,8942
1948,0,218,0,8946


In [8]:
y

0       1
1       0
2       1
3       0
4       1
       ..
1945    0
1946    0
1947    0
1948    0
1949    0
Name: subreddit, Length: 1950, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
ss = StandardScaler()

# Fit scaler to training data
ss.fit(X_train)

# Transform train and test X data and save as new variables
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# Instantiate our logistic regression object
logreg = LogisticRegression()

# Fit logistic regression to scaled training data
logreg.fit(X_train_scaled, y_train)

# Score logistic regression on scaled testing data
logreg.score(X_test_scaled, y_test)

0.8524590163934426

In [11]:
pipeline = Pipeline([
    ('ss', StandardScaler()), # tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

0.8524590163934426

### Adicionando o soma

In [12]:
pipeline = Pipeline([
    ('soma',zero_transform),
    ('ss', StandardScaler()),# tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Score pipeline on testing data
pipeline.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["ups"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["ups"] = 0


0.6905737704918032

### Testando o gridsearch

In [13]:
def zero_column(X,column_name="gilded"):
    X[column_name] = 0
    return X

In [14]:
zero_transform = base.SimpleLayerPipeline(function=zero_column)

In [15]:
pipeline = Pipeline([
    ('zero',zero_transform),
    ('ss', StandardScaler()),# tuple is (name, Transformer)
    ('logreg', LogisticRegression())
])


In [16]:
parameters = {
    'zero__column_name': ('gilded', 'num_comments', 'num_crossposts', 'ups'),
}


In [17]:
gs = GridSearchCV(pipeline, parameters)

In [18]:
# Fit the grid search model to the training data
gs.fit(X_train, y_train)

# Score the grid search model with the testing data
gs.score(X_test, y_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column_name] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

0.6823770491803278

In [19]:
gs.best_params_


{'zero__column_name': 'num_crossposts'}