In [None]:
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from src.churn import logging




In [None]:
class DataTransformation:
    """Hold a churn DataFrame with its train/test split and clean it for modelling.

    The constructor stores the frame and splits it. The cleaning and
    feature-engineering pipeline lives in :meth:`transform_data`, a static
    method that can be called either as ``DataTransformation.transform_data(df)``
    or on an instance.
    """

    _TARGET_COLUMN = 'churn_risk_score'

    # Columns coerced to numeric (unparseable entries become NaN).
    _NUMERICAL_COLUMNS = [
        'age', 'days_since_last_login', 'avg_time_spent',
        'avg_transaction_value', 'avg_frequency_login_days',
        'points_in_wallet', 'churn_risk_score',
    ]

    # Columns forced to the generic ``object`` dtype.
    _CATEGORICAL_COLUMNS = [
        'gender', 'region_category', 'membership_category',
        'joined_through_referral', 'preferred_offer_types',
        'medium_of_operation', 'internet_option', 'used_special_discount',
        'offer_application_preference', 'past_complaint',
        'complaint_status', 'feedback',
    ]

    # Categorical columns that are ordinal-encoded and KNN-imputed.
    _KNN_IMPUTED_COLUMNS = [
        'gender', 'region_category', 'joined_through_referral',
        'medium_of_operation', 'preferred_offer_types',
    ]

    # Raw churn score -> class label. Score 1 is included so that it is not
    # silently turned into NaN by ``Series.map`` (unmapped keys map to NaN).
    _TARGET_MAPPING = {
        -1: 0,
        0: 0,
        1: 0,
        2: 0,
        3: 1,
        4: 2,
        5: 2,
    }

    _RENAME_MAPPING = {
        'avg_frequency_login_days': 'frequency',
        'avg_transaction_value': 'monetary',
        'days_since_last_login': 'recency',
    }

    def __init__(self, df, test_size=0.7, random_state=42):
        """Store ``df`` and split it into ``self.train`` / ``self.test``.

        Args:
            df: The DataFrame to split.
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to ``train_test_split``.
        """
        self.df = df
        # NOTE(review): the default of 0.7 puts 70% of the rows in the test
        # split and only 30% in the training split. It is kept for backward
        # compatibility -- confirm that 0.3 was not intended.
        self.train, self.test = train_test_split(
            self.df, test_size=test_size, random_state=random_state
        )

    @staticmethod
    def transform_data(df, reference_date=None):
        """Clean ``df``, impute missing values and derive model features.

        Steps, in order: dtype coercion, numeric imputation, categorical
        imputation, feature engineering, target re-binning, dropping the raw
        date/time columns and renaming the frequency / monetary / recency
        columns.

        Args:
            df: Input DataFrame. It is not modified; all work happens on a copy.
            reference_date: ``datetime`` against which ``tenure_months`` is
                measured. Defaults to 2024-05-17.

        Returns:
            A new, transformed DataFrame.

        Raises:
            KeyError: If one of the expected input columns is missing.
        """
        if reference_date is None:
            reference_date = datetime(2024, 5, 17)

        # Work on a copy so the caller's frame is never partially mutated.
        df = df.copy()

        df = DataTransformation._coerce_dtypes(df)
        df = DataTransformation._impute_numeric(df)
        df = DataTransformation._impute_categorical(df)
        df = DataTransformation._engineer_features(df, reference_date)

        target = DataTransformation._TARGET_COLUMN
        df[target] = df[target].map(DataTransformation._TARGET_MAPPING)

        df = df.drop(columns=['joining_date', 'last_visit_time'])
        return df.rename(columns=DataTransformation._RENAME_MAPPING)

    @staticmethod
    def _coerce_dtypes(df):
        """Cast numeric, categorical and date/time columns to their dtypes."""
        numerical = DataTransformation._NUMERICAL_COLUMNS
        categorical = DataTransformation._CATEGORICAL_COLUMNS

        df[numerical] = df[numerical].apply(pd.to_numeric, errors='coerce')
        df['last_visit_time'] = pd.to_datetime(df['last_visit_time'], format='%H:%M:%S')
        df[categorical] = df[categorical].astype('object')
        df['joining_date'] = pd.to_datetime(df['joining_date'])
        return df

    @staticmethod
    def _impute_numeric(df):
        """Fill numeric gaps (target excluded) with an IterativeImputer."""
        feature_columns = df.select_dtypes(include='number').columns.drop(
            DataTransformation._TARGET_COLUMN
        )

        # Impute in standardised space, then map back to the original units.
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df[feature_columns])
        imputed = IterativeImputer(random_state=42).fit_transform(scaled)
        df[feature_columns] = scaler.inverse_transform(imputed)
        return df

    @staticmethod
    def _impute_categorical(df):
        """Ordinal-encode selected categorical columns and KNN-impute them."""
        columns = DataTransformation._KNN_IMPUTED_COLUMNS

        # 'Unknown' gender is treated as a missing value to be imputed.
        df['gender'] = df['gender'].replace('Unknown', np.nan)

        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        df[columns] = encoder.fit_transform(df[columns])

        # NOTE(review): distance-weighted KNN averages neighbour codes, so an
        # imputed value may be fractional rather than a valid category code.
        imputer = KNNImputer(n_neighbors=5, metric='nan_euclidean', weights='distance')
        df[columns] = imputer.fit_transform(df[columns])

        df[columns] = df[columns].astype('object')
        return df

    @staticmethod
    def _engineer_features(df, reference_date):
        """Add tenure, visit-hour and ratio features."""
        joined = df['joining_date'].dt
        df['tenure_months'] = (
            (reference_date.year - joined.year) * 12
            + (reference_date.month - joined.month)
        ).astype('int64')
        df['visit_hour'] = df['last_visit_time'].dt.hour.astype('int64')
        df['login_spend_ratio'] = df['avg_time_spent'] / df['avg_frequency_login_days']
        df['login_transaction_ratio'] = df['avg_frequency_login_days'] / df['avg_transaction_value']

        # A zero denominator yields +/-inf, which scikit-learn transformers
        # reject; convert those to NaN so they are handled as missing values.
        ratio_columns = ['login_spend_ratio', 'login_transaction_ratio']
        df[ratio_columns] = df[ratio_columns].replace([np.inf, -np.inf], np.nan)
        return df

    

    
    
    