<a href="https://colab.research.google.com/github/Mulat-K/Machine-Learning-Mastery-with-Python/blob/main/AMLWWP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Automate Machine Learning
 Work ows with Pipelines***

# ***Data Preparation and Modeling Pipeline***

In [10]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd

# Load data
filename = '/content/sample_data/pima-indians-diabetes.data.csv'  # Ensure correct file path
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names, header=None)

# Convert all values to numeric (if not already)
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')  # Converts non-numeric values to NaN

# Handle missing values (NaNs) by replacing them with the mean of each column
dataframe.fillna(dataframe.mean(), inplace=True)

# Convert to numpy array
array = dataframe.values
X = array[:, 0:8]  # Features
Y = array[:, 8]    # Target

# Create pipeline
estimators = []
estimators.append(('standardize', StandardScaler()))  # Standardize the data
estimators.append(('lda', LinearDiscriminantAnalysis()))  # Apply Linear Discriminant Analysis
model = Pipeline(estimators)

# Evaluate pipeline using KFold cross-validation
kfold = KFold(n_splits=10, random_state=7, shuffle=True)  # Ensure shuffle=True to avoid warnings
results = cross_val_score(model, X, Y, cv=kfold)

# Print the mean accuracy of the cross-validation
print("Mean accuracy:", results.mean())


Mean accuracy: nan


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 144, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 472, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 1199, in score
    return self.steps[-1][1].score(Xt, y, **score_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 572, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 216, in 

# ***Feature Extraction and Modeling Pipeline***

In [13]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler

# Load data (this time let pandas use the header from the CSV)
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
dataframe = read_csv(filename)  # <- No header=None, no names=...
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# Create feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

# Create pipeline
estimators = []
estimators.append(('scaler', StandardScaler()))  # Scale for PCA and LogisticRegression
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression(max_iter=1000)))  # More iterations
model = Pipeline(estimators)

# Evaluate pipeline
kfold = KFold(n_splits=10, random_state=7, shuffle=True)  # shuffle=True fixes the earlier warning
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())


0.7747607655502392
