In [1]:
import mlflow

print(mlflow.get_tracking_uri())


file:///c:/Users/meetp/%23PYTHON%20FILES/Customer%20Segmentation%20Clustering/src/FinalModel/mlruns


### Pipline Creation 

In [27]:
from sklearn.pipeline import Pipeline
from src.components.data_tranformation import Date_Encoding
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler 
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
import pickle

data = pd.read_csv(r'C:\Users\meetp\#PYTHON FILES\Customer Segmentation Clustering\artifacts\marketing_cleaned.csv')
df = data.copy()

# Define pipeline steps with RobustScaler applied to the whole dataset
pipeline = Pipeline([
    ('date_transformer', Date_Encoding()),  # Convert datetime to numeric
    ('preprocessor', ColumnTransformer([
        ('categorical', OneHotEncoder(), ['Education', 'Marital_Status']),  # One-hot encoding for categorical columns
    ], remainder='passthrough')),  # Remainder='passthrough' preserves the columns not specified in transformers
    ('scaler', RobustScaler()),  # RobustScaler applied to the whole dataset
    ('pca', PCA(n_components=2)),  # PCA with 2 components
    ('kmeans', KMeans(n_clusters=3, random_state=0, init = 'k-means++'))  # KMeans clustering with 3 clusters
])


pipeline.fit_transform(df)

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

pca_components = pipeline.named_steps['pca'].transform(pipeline.named_steps['scaler'].transform(pipeline.named_steps['preprocessor'].transform(pipeline.named_steps['date_transformer'].transform(df))))

# Create a DataFrame with PCA components
pca_df = pd.DataFrame(data=pca_components, columns=["PCA1", "PCA2"])

# Add cluster labels to the DataFrame
pca_df['Cluster'] = pipeline.named_steps['kmeans'].labels_

# Plot the clusters
plt.figure(figsize=(8, 8))
sns.scatterplot(x="PCA1", y="PCA2", hue="Cluster", data=pca_df, palette='viridis')
plt.title("Clustering using K-Means Algorithm")
plt.show()

In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

kmeans_labels = pipeline.named_steps['kmeans'].labels_

print(silhouette_score(pca_df, kmeans_labels))
print(calinski_harabasz_score(pca_df, kmeans_labels))
print(davies_bouldin_score(pca_df, kmeans_labels))

In [None]:
with open(r'kmeans_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

### Prediction

#### 1) singel input (Prediction)

In [None]:
from datetime import datetime
import pandas as pd

# Load the entire pipeline
with open(r'kmeans_pipeline.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

# Creating a dictionary with column names and random values
df2 = {
    'Age': [32],
    'Education': ['Graduation'],
    'Marital_Status': ['Partner'],
    'Income': [50000.00],
    'Kidhome' : [2],
    'Teenhome' : [1],
    'Dt_Customer': [datetime(2023, 5, 15)],  # Example date
    'Recency': [25],
    'MntWines': [10],
    'MntFruits': [23],
    'MntMeatProducts': [50],
    'MntFishProducts': [25],
    'MntSweetProducts': [8],
    'MntGoldProds': [5],
    'NumDealsPurchases': [8],
    'NumWebPurchases': [15],
    'NumCatalogPurchases': [16],
    'NumStorePurchases': [23],
    'NumWebVisitsMonth': [8],
    'AcceptedCmp3': [0],
    'AcceptedCmp4': [0],
    'AcceptedCmp5': [0],
    'AcceptedCmp1': [1],
    'AcceptedCmp2': [0],
    'Response': [1]
}

# Create a DataFrame
test = pd.DataFrame(df2)

cluster = loaded_pipeline.predict(test)[0]
cluster


#### 2) Multi Input Prediction

In [None]:
import pandas as pd
import pickle

data = pd.read_excel(r'C:\Users\meetp\Downloads\!PYTHON FILES\MLops-Project\artifacts\marketing_cleaned.xlsx')
df = data.copy()

with open(r'kmeans_pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

cluster = pipeline.predict(data)
cluster


#### Getting Essential data components for evaluation using pipline

In [None]:
kmeans_labels = pipeline.named_steps['kmeans'].labels_
pca_components = pipeline.named_steps['pca'].transform(pipeline.named_steps['scaler'].transform(pipeline.named_steps['preprocessor'].transform(pipeline.named_steps['date_transformer'].transform(df))))
pca_df = pd.DataFrame(data=pca_components, columns=["PCA1", "PCA2"])
pca_df['Cluster'] = pipeline.named_steps['kmeans'].labels_


from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

print(silhouette_score(pca_df, kmeans_labels))
print(calinski_harabasz_score(pca_df, kmeans_labels))
print(davies_bouldin_score(pca_df, kmeans_labels))

In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

print(silhouette_score(pca_df, kmeans_labels))
print(calinski_harabasz_score(pca_df, kmeans_labels))
print(davies_bouldin_score(pca_df, kmeans_labels))