In [1]:
import os

In [2]:
%pwd

'd:\\Machine Learning\\VisaEligibilityPredictor-AU\\research'

In [3]:
# Step up to the project root so that `src.*` imports and relative config
# paths resolve. The guard keeps this cell idempotent: once `src` is visible
# from the working directory, re-running the cell must not climb any further.
if not os.path.isdir("src"):
    os.chdir("../")

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes its split artifacts into.
        file_path: Location of the CSV file the stage reads.

    Note: the dataclass does not convert or validate its inputs, so the
    fields hold whatever the caller passes in.
    """

    root_dir: Path
    file_path: Path

In [6]:
from src.visaPrediction.utils import *
from src.visaPrediction.constants import *


class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config=CONFIG_FILE_PATH,
        schema=SCHEMA_FILE_PATH,
        params=PARAMS_FILE_PATH,
    ):
        """Read the three YAML files and create the artifact root directory.

        Args:
            config: Path of the main configuration YAML.
            schema: Path of the schema YAML.
            params: Path of the parameters YAML.
        """
        # Loaded in this fixed order: config, schema, params.
        self.config = load_yaml_file(config)
        self.schema = load_yaml_file(schema)
        self.params = load_yaml_file(params)

        # The loaded config is accessed by attribute; `artifact_root` is the
        # top-level output directory.
        artifact_root = self.config.artifact_root
        create_directory([artifact_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's `root_dir` as a side effect.

        Returns:
            DataTransformationConfig populated from the `data_transformation`
            section of the loaded config; values are passed through unchanged.
        """
        section = self.config.data_transformation

        create_directory([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            file_path=section.file_path,
        )

In [29]:
import os
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


class DataTransformation:
    """Clean the raw CSV, encode features and target, and persist the splits.

    `config` must expose `file_path` (the CSV to read) and `root_dir`
    (the directory the artifacts are written to).
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def clean_data(self):
        """Load the CSV at `config.file_path` and normalise the text column.

        Drops `rejection_reason` (presumably because it is only known once
        the decision is made -- confirm) and rewrites `documents_submitted`
        so commas become spaces and the text is lower-cased.

        Returns:
            pandas.DataFrame: the cleaned frame.
        """
        df = pd.read_csv(self.config.file_path).drop(columns='rejection_reason')

        df['documents_submitted'] = (
            df['documents_submitted']
            # TfidfVectorizer rejects NaN documents, so a missing entry is
            # treated as "no documents listed" instead of crashing later.
            .fillna('')
            # The comma is a literal separator, not a pattern.
            .str.replace(',', ' ', regex=False)
            .str.lower()
        )
        return df

    @staticmethod
    def _build_preprocessor():
        """Return a fresh, unfitted ColumnTransformer for the feature columns.

        Columns not listed here are dropped (ColumnTransformer's default
        `remainder='drop'`).
        """
        categorical_features = ['gender', 'visa_type']

        return ColumnTransformer(
            transformers=[
                # A bare column name (not a list) hands the vectorizer the
                # 1-D sequence of strings it expects.
                ('text', TfidfVectorizer(), 'documents_submitted'),
                ('num', StandardScaler(), ['age']),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ]
        )

    def _load_features_and_target(self):
        """Return `(X, y)`: the cleaned feature frame and the integer-encoded target."""
        df = self.clean_data()

        # Encoding the target only maps class names to integers, so fitting
        # it on every row does not leak information about the features.
        df['visa_granted'] = LabelEncoder().fit_transform(df['visa_granted'])

        return df.drop(columns='visa_granted'), df['visa_granted']

    def transform_data(self):
        """Encode the whole dataset in one pass.

        The preprocessor is fitted on every row, so the result must not be
        split afterwards for model evaluation; `split_data` fits on the
        training rows only.

        Returns:
            tuple: `(x, y)` -- the transformed feature matrix and the
            encoded target Series.
        """
        X, y = self._load_features_and_target()
        x = self._build_preprocessor().fit_transform(X)
        return x, y

    def split_data(self, test_size=0.3, random_state=42):
        """Split, transform and persist the train/validation sets.

        The raw rows are split first and the preprocessor is fitted on the
        training rows only, so TF-IDF vocabulary/IDF weights, scaling
        statistics and one-hot categories carry no information from the
        validation rows.

        Args:
            test_size: Fraction of rows held out for validation.
            random_state: Seed for the shuffle, so re-running the notebook
                reproduces the same split. Pass None for a random split.

        Writes `x_train.pkl`, `x_val.pkl`, `y_train.pkl`, `y_val.pkl` and
        the fitted `preprocessor.pkl` into `config.root_dir`.
        """
        X, y = self._load_features_and_target()

        X_train, X_val, y_train, y_val = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
        )

        preprocessor = self._build_preprocessor()
        x_train = preprocessor.fit_transform(X_train)
        x_val = preprocessor.transform(X_val)

        artifacts = {
            "x_train.pkl": x_train,
            "x_val.pkl": x_val,
            "y_train.pkl": y_train,
            "y_val.pkl": y_val,
            # Kept so later stages can apply the identical transformation.
            "preprocessor.pkl": preprocessor,
        }
        for file_name, payload in artifacts.items():
            joblib.dump(payload, os.path.join(self.config.root_dir, file_name))

        logger.info("Split data successfully")

In [30]:
# Run the data-transformation stage end to end. Exceptions are left to
# propagate so the notebook shows the original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transform = DataTransformation(data_transformation_config)
data_transform.split_data()

[2026-02-14 10:32:28,474: INFO: __init__: yaml file: config\config.yaml loaded successfully]
[2026-02-14 10:32:28,477: INFO: __init__: yaml file: schema.yaml loaded successfully]
[2026-02-14 10:32:28,482: INFO: __init__: yaml file: params.yaml loaded successfully]
[2026-02-14 10:32:28,487: INFO: __init__: File directory: artifact created successfully]
[2026-02-14 10:32:28,490: INFO: __init__: File directory: artifact/data_transformation created successfully]
[2026-02-14 10:32:28,730: INFO: 95242779: Split data successfully]
