# Simple Pipeline with Skikit-Learn

In [112]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [113]:
df1 = {'Social_media_followers':
      [1000000, np.nan, 2000000, 1310000, 1700000, np.nan,
       4100000,1600000,2200000,2340000],
'Sold_out':[1,0,0,1,0,0,0,1,0,1]}

In [114]:
df1 =pd.DataFrame(data=df1)
df1

Unnamed: 0,Social_media_followers,Sold_out
0,1000000.0,1
1,,0
2,2000000.0,0
3,1310000.0,1
4,1700000.0,0
5,,0
6,4100000.0,0
7,1600000.0,1
8,2200000.0,0
9,2340000.0,1


In [115]:
X1 = df1[['Social_media_followers']]
X1

Unnamed: 0,Social_media_followers
0,1000000.0
1,
2,2000000.0
3,1310000.0
4,1700000.0
5,
6,4100000.0
7,1600000.0
8,2200000.0
9,2340000.0


In [116]:
y1 = df1[["Sold_out"]]
y1

Unnamed: 0,Sold_out
0,1
1,0
2,0
3,1
4,0
5,0
6,0
7,1
8,0
9,1


In [117]:
from sklearn.model_selection import train_test_split

In [118]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, 
                                      test_size = 0.3, random_state= 42)

In [119]:
X1_train.shape

(7, 1)

In [120]:
X1_test.shape

(3, 1)

In [121]:
from sklearn. impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

In [122]:
from sklearn. linear_model import LogisticRegression

lr = LogisticRegression()

In [123]:
from sklearn.pipeline import make_pipeline

In [124]:
pipe1 = make_pipeline(imputer,lr)

In [125]:
pipe1.fit(X1_train,y1_train)

In [126]:
pipe1.score(X1_train,y1_train)

0.5714285714285714

In [127]:
pipe1.score(X1_test,y1_test)

0.3333333333333333

In [128]:
pipe1.named_steps.simpleimputer.statistics_

array([2007142.85714286])

In [129]:
pipe1.named_steps.logisticregression.coef_

array([[-1.87962018e-06]])

# Advanced Pipeline

In [130]:
d2 = {'Genre': ['Rock', 'Metal', 'Bluegrass', 'Rock', np.nan, 'Rock', 
                'Rock', np.nan,'Metal','Bluegrass'],
'Social_media_followers':[1000000, np.nan, 2000000, 1310000, 1700000, 
                          np.nan, 2300000,3200000,4100000,1900000],
'Sold_out':[1,0,0,1,0,0,0,1,0,1]}

In [131]:
df = pd.DataFrame(data=d2)
df

Unnamed: 0,Genre,Social_media_followers,Sold_out
0,Rock,1000000.0,1
1,Metal,,0
2,Bluegrass,2000000.0,0
3,Rock,1310000.0,1
4,,1700000.0,0
5,Rock,,0
6,Rock,2300000.0,0
7,,3200000.0,1
8,Metal,4100000.0,0
9,Bluegrass,1900000.0,1


In [132]:
X = df.iloc[:, 0:2]
X

Unnamed: 0,Genre,Social_media_followers
0,Rock,1000000.0
1,Metal,
2,Bluegrass,2000000.0
3,Rock,1310000.0
4,,1700000.0
5,Rock,
6,Rock,2300000.0
7,,3200000.0
8,Metal,4100000.0
9,Bluegrass,1900000.0


In [133]:
y = df.iloc[:, 2]
y

0    1
1    0
2    0
3    1
4    0
5    0
6    0
7    1
8    0
9    1
Name: Sold_out, dtype: int64

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
                                                    random_state =23)

In [135]:
num_cols = ['Social_media_followers']

cat_cols = ['Genre']

In [136]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [137]:
num_pipeline = Pipeline(steps = [
                ('impute', SimpleImputer(strategy='mean' )),
                ('scale', StandardScaler())
])

In [138]:
from sklearn.preprocessing import OneHotEncoder

In [139]:
cat_pipeline = Pipeline(steps = [
('impute', SimpleImputer(strategy='most_frequent')),
('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', 
                                  sparse_output=False))
])

In [140]:
from sklearn. compose import ColumnTransformer

In [141]:
col_transformer = ColumnTransformer(transformers = [
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
],
    remainder = 'drop',
    n_jobs = -1
)

In [142]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [143]:
pipe_final = make_pipeline(col_transformer, dtc)

In [144]:
pipe_final.fit(X_train, y_train)

In [44]:
pipe_final.score(X_test,y_test)

0.3333333333333333

# Pipeline Practise

In [146]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [147]:
# Random dataset generation
np.random.seed(42)
data = np.random.rand(149, 3)  # 3 numeric columns
categories = np.random.choice(['A', 'B', 'C'], 149)  # Categorical feature
target = np.random.randint(0, 2, 149)  # Binary target: 0 or 1

In [148]:
# Introduce some missing values in numeric data
data[5:10, 1] = np.nan
data[50:55, 2] = np.nan

In [149]:
# Create DataFrame
df = pd.DataFrame(data, columns=['Feature1', 'Feature2', 'Feature3'])
df['Category'] = categories  # Add categorical column
df['Target'] = target  # Add target column
df

Unnamed: 0,Feature1,Feature2,Feature3,Category,Target
0,0.374540,0.950714,0.731994,B,1
1,0.598658,0.156019,0.155995,C,1
2,0.058084,0.866176,0.601115,C,1
3,0.708073,0.020584,0.969910,A,1
4,0.832443,0.212339,0.181825,B,0
...,...,...,...,...,...
144,0.618218,0.101123,0.084107,A,1
145,0.700969,0.072763,0.821860,A,0
146,0.706242,0.081349,0.084838,B,0
147,0.986640,0.374271,0.370642,A,1


In [150]:
# Define preprocessing for numeric columns (Imputation + Scaling)
numeric_features = ['Feature1', 'Feature2', 'Feature3']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  
    # Fill missing values with mean
    ('scaler', StandardScaler())  
    # Standardize the numeric features
])

In [151]:
# Define preprocessing for categorical columns (OneHotEncoding)
categorical_features = ['Category']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  
    # Encode categorical features
])

In [152]:
# Combine numeric and categorical pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),  
    # Apply numeric transformations
    ('cat', categorical_transformer, categorical_features)  
    # Apply categorical transformations
])

In [153]:
# Advanced Features Pipeline 
features_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  
    # Preprocess both numeric and categorical features
    ('pca', PCA(n_components=2))  
    # Reduce dimensions with PCA
])

In [154]:
"""
# Target Pipeline (For binary classification)
target_pipeline = Pipeline(steps=[
    ('label_encoder', LabelEncoder())  
    # Encode target labels if needed
])
"""

"\n# Target Pipeline (For binary classification)\ntarget_pipeline = Pipeline(steps=[\n    ('label_encoder', LabelEncoder())  \n    # Encode target labels if needed\n])\n"

In [155]:
X = df.drop('Target', axis=1)
y = df['Target']


In [156]:
# Fit and transform the features using the features pipeline
X_transformed = features_pipeline.fit_transform(X)

# Fit and transform the target using the target pipeline
# y_transformed = target_pipeline.fit_transform(y)


In [157]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Check the transformed feature data and target labels

In [158]:
print(X_train[:5])  
# Display some of the transformed feature data (after scaling, PCA)

[[-0.11943263 -0.41733872]
 [ 0.03878385 -0.29716875]
 [-0.25914689  1.70755956]
 [ 1.87299254  1.52270719]
 [ 1.87959031 -0.96829956]]


In [159]:
print(y_train[:5]) 
# Display some of the transformed target labels

22    0
15    1
65    1
11    1
42    1
Name: Target, dtype: int32


In [160]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [161]:
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

In [107]:
y_pred = clf.predict(X_test)

In [108]:
acc_score = accuracy_score(y_pred,y_test)
acc_score

0.6

In [109]:
cm = confusion_matrix(y_pred,y_test)
cm

array([[10,  4],
       [ 8,  8]], dtype=int64)