In [1]:
import os
os.chdir('../')

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [3]:
from titanicSurvival.constants import *
from titanicSurvival.utils.common import  read_yaml, create_directories

In [4]:
class ConfigurationManager:
    def __init__(self,
                 config_file_path= CONFIG_FILE_PATH,
                 params_file_path= PARAMS_FILE_PATH):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])
    def get_data_transformation(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_tranformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            
        )
        return data_tranformation_config

In [8]:
import os
import urllib.request as request
import zipfile
from titanicSurvival.logging import logger
from titanicSurvival.utils.common import get_size
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder


In [14]:
def getAgeSubSection(age):
    #print(age)
    if age > 60:
        return 5
    elif age > 50:
        return 4
    elif age > 40:
        return 3
    elif age > 30:
        return 2
    #elif age > 20:
    #   return 2
    elif age > 15:
        return 1
    else:
        return 0
def assignCabin(row):
    if row.Cabin == 'X':
        if row.Pclass == 1 :
            return 'C'
        elif row.Pclass == 2:
            return 'F'
        else:
            return 'G'
    else:
        return row.Cabin

In [None]:
class DataTransformation:
    def __init__(self,config:DataTransformationConfig):
        self.config =config
    
    def combine_train_test_data(self):
        try:
            df = pd.read_csv(os.path.join('artifacts/data_ingestion/','train.csv'))
            target_df = pd.read_csv(os.path.join('artifacts/data_ingestion/','gender_submission.csv'))
            test_df = pd.read_csv(os.path.join('artifacts/data_ingestion/','test.csv'))
            test_df.insert(1,'Survived',target_df['Survived'])
            df = pd.concat([df, test_df])
            logger.info(f"Cocatenated train and test data")
            return df
        except Exception as e:
            raise e
    
    
    def create_features(self):

        df = self.combine_train_test_data()
        
        df['Sex'] = df['Sex'].fillna(df['Sex'].mode()[0]).map({'female':0, 'male': 1 })
        df['Age']=df['Age'].fillna(df['Age'].mode()[0])
        df['Age']=df['Age'].apply(lambda x: getAgeSubSection(x))
        df['Embarked']=df['Embarked'].fillna(df['Embarked'].mode()[0])
        df['Embarked'] = (df['Embarked'].map({'C':0, 'S':1, 'Q':2})).astype(int)
        
        df['Cabin']=df.apply(lambda row: assignCabin(row),axis=1)


        cabinEncode = OrdinalEncoder()
        df['Cabin']=(cabinEncode.fit_transform(df[['Cabin']])).astype(int)
        #print(df['Cabin'].isnull().sum())
        df_np = df.to_numpy()

        X_train,y_train = df_np[:-418,1:],df_np[:-418,0]
        X_test,y_test = df_np[-418:,1:],df_np[-418:,0]

        X_train_path = os.path.join(self.config.root_dir, 'train_features_x.csv')
        pd.DataFrame(X_train).to_csv(X_train_path,index=False)

        y_train_path = os.path.join(self.config.root_dir, 'train_features_y.csv')
        pd.DataFrame(y_train).to_csv(y_train_path,index=False)

        logger.info(f"Train feature files created: {X_train_path} \n and {y_train_path}")
        X_test_path = os.path.join(self.config.root_dir, 'test_features_x.csv')
        pd.DataFrame(X_test).to_csv(X_test_path,index=False)

        y_test_path = os.path.join(self.config.root_dir, 'test_features_y.csv')
        pd.DataFrame(y_test).to_csv(y_test_path,index=False)
        logger.info(f"Test feature files created: {X_test_path} \n and {y_test_path}")


            


In [20]:
try:
    config = ConfigurationManager()
    dat_transforamtion_config = config.get_data_transformation()
    data_transformation = DataTransformation(config=dat_transforamtion_config)
    data_transformation.create_features()
except Exception as e:
    raise e

[2025-03-29 21:27:45,732: INFO: common: yaml file: config\config.yaml loaded succesfully..]
[2025-03-29 21:27:45,737: INFO: common: yaml file: params.yaml loaded succesfully..]
[2025-03-29 21:27:45,741: INFO: common: Create Directory at :artifacts]
[2025-03-29 21:27:45,746: INFO: common: Create Directory at :artifacts/data_transformation]
[2025-03-29 21:27:45,777: INFO: 3684417095: Cocatenated train and test data]
0
[2025-03-29 21:27:45,961: INFO: 3684417095: Train feature files created: artifacts/data_transformation\train_features_x.csv 
 and artifacts/data_transformation\train_features_y.csv]
[2025-03-29 21:27:45,989: INFO: 3684417095: Test feature files created: artifacts/data_transformation\test_features_x.csv 
 and artifacts/data_transformation\test_features_y.csv]


  df['Cabin']=(cabinEncode.fit_transform(df[['Cabin']])).astype(int)
