In [13]:
import os
os.chdir('../')

In [14]:
%pwd

'c:\\Chandu\\WorkSpace\\Learnings\\Repos\\End-to-End-TitanicSurvivalProject'

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    """Settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer`` from the
    ``model_trainer`` section of the loaded configuration.
    """

    # Directory for this stage's outputs; ``ModelTrainer.fit_model`` writes
    # ``model.pkl`` into it.
    root_dir: Path
    # Value of the ``data_path`` config entry.
    # NOTE(review): not read anywhere in this notebook — confirm intended use.
    data_path: Path

In [16]:
from titanicSurvival.constants import *
from titanicSurvival.utils.common import  read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer(self) -> ModelTrainerConfig:
        """Create the trainer's root directory and return its settings.

        Returns:
            A ``ModelTrainerConfig`` filled from the ``model_trainer`` section
            of the loaded configuration.
        """
        trainer_section = self.config.model_trainer
        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
        )

In [18]:
import os
import urllib.request as request
import zipfile
from titanicSurvival.logging import logger
from titanicSurvival.utils.common import get_size
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report
from typing import Any

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
import joblib

In [21]:
class ModelTrainer:
    """Fits a logistic-regression model on the transformed feature CSVs.

    The fitted estimator is written to ``<config.root_dir>/model.pkl``.
    """

    # Directory holding the four transformed-feature CSVs read below.
    # NOTE(review): `config.data_path` is never consulted; presumably it is
    # meant to replace this hard-coded directory — confirm what that config
    # entry actually points at before switching.
    TRANSFORMED_DATA_DIR = 'artifacts/data_transformation/'

    def __init__(self, config: ModelTrainerConfig):
        """Store the stage configuration (only ``root_dir`` is read here)."""
        self.config = config

    def _read_split(self, file_name):
        """Read one CSV from the transformed-data directory into a DataFrame."""
        return pd.read_csv(os.path.join(self.TRANSFORMED_DATA_DIR, file_name))

    @staticmethod
    def _as_1d_target(y):
        """Flatten an (n, 1) label column to shape (n,); leave other shapes alone.

        Labels loaded through ``DataFrame.to_numpy()`` come back as a column
        vector, while scikit-learn estimators expect 1-D targets.
        """
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            return y.ravel()
        return y

    def get_transformed_data(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Load the train/test feature and label CSVs as NumPy arrays.

        Returns:
            ``(X_train, y_train, X_test, y_test)``, each the ``to_numpy()``
            form of the corresponding CSV (labels keep their 2-D shape).

        Raises:
            Whatever ``pd.read_csv`` raises, e.g. ``FileNotFoundError`` when a
            CSV is missing.
        """
        X_train_df = self._read_split('train_features_x.csv')
        y_train_df = self._read_split('train_features_y.csv')
        X_test_df = self._read_split('test_features_x.csv')
        y_test_df = self._read_split('test_features_y.csv')

        logger.info("Loaded transformed train and test data")
        print(X_train_df.head())
        return X_train_df.to_numpy(), y_train_df.to_numpy(), X_test_df.to_numpy(), y_test_df.to_numpy()

    def fit_model(self, X_train=None, y_train=None, X_test=None, y_test=None):
        """Train the classifier, log a test-set report and persist the model.

        The arrays passed in are the ones used for training and evaluation.
        When all four are omitted they are loaded with
        ``get_transformed_data``.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels, 1-D or a single column.
            X_test: Evaluation feature matrix.
            y_test: Evaluation labels, 1-D or a single column.

        Raises:
            ValueError: If only some of the four arrays are supplied.
        """
        supplied = [part is not None for part in (X_train, y_train, X_test, y_test)]
        if not any(supplied):
            X_train, y_train, X_test, y_test = self.get_transformed_data()
        elif not all(supplied):
            raise ValueError("Pass all four of X_train, y_train, X_test, y_test, or none of them")

        y_train = self._as_1d_target(y_train)
        y_test = self._as_1d_target(y_test)

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        logger.info(f"{ classification_report(y_test,lr.predict(X_test))}")

        joblib.dump(lr, os.path.join(self.config.root_dir, 'model.pkl'))
        logger.info(f"Model Save to path: {self.config.root_dir}")

            


In [22]:
# try:
#     config = ConfigurationManager()
#     modletrainer_config = config.get_model_trainer()
#     model_trainer=ModelTrainer(config=modletrainer_config)
#     X_train,y_train,X_test,y_test =model_trainer.get_transformed_data()
#     model_trainer.fit_model(X_train,y_train,X_test,y_test)
    
# except Exception as e:
#     raise e

In [23]:
from titanicSurvival.components.data_transformation import assignCabin, getAgeSubSection

In [31]:
def convert_input_data_exp_feature_vector(input_data, cabin_encoder=None):
    """Convert raw passenger records into a numeric feature matrix.

    Steps: fill missing ``Sex``/``Age``/``Embarked`` with the column mode of
    the given data, map ``Sex`` and ``Embarked`` to integer codes, bucket
    ``Age`` with ``getAgeSubSection``, reduce ``Cabin`` to its first letter
    ("X" when missing), pass each row through ``assignCabin``, ordinal-encode
    ``Cabin``, and drop ``PassengerId``, ``Name``, ``Ticket`` and ``Fare``.

    Args:
        input_data: Anything ``pd.DataFrame`` accepts that provides the
            columns named above. The caller's object is not modified.
        cabin_encoder: Optional already-fitted encoder exposing
            ``transform``. When omitted, a new ``OrdinalEncoder`` is fitted on
            this input, so Cabin codes depend on which cabin letters the input
            happens to contain.
            NOTE(review): for predictions to line up with training, the
            encoder fitted during data transformation should presumably be
            passed here — confirm how training encodes Cabin.

    Returns:
        The remaining columns as a 2-D NumPy array.

    Raises:
        A lookup error from ``mode()[0]`` if ``Sex``, ``Age`` or ``Embarked``
        is entirely missing; an error from ``astype(int)`` if ``Embarked``
        holds a value other than C, S or Q.
    """
    # pd.DataFrame(frame) does not copy a DataFrame argument by default, so
    # take an explicit copy before assigning columns; otherwise the caller's
    # frame is rewritten and a second call on it fails.
    df = pd.DataFrame(input_data).copy()

    df['Sex'] = df['Sex'].fillna(df['Sex'].mode()[0]).map({'female':0, 'male': 1 })
    df['Age'] = df['Age'].fillna(df['Age'].mode()[0])
    df['Age'] = df['Age'].apply(getAgeSubSection)
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Embarked'] = df['Embarked'].map({'C':0, 'S':1, 'Q':2}).astype(int)

    # Keep only the leading cabin letter; "X" stands in for a missing cabin.
    df['Cabin'] = df['Cabin'].fillna("X").map(lambda x: x[0])
    df['Cabin'] = df.apply(assignCabin, axis=1)
    print(f"Cabin data after assigned values: {df['Cabin'].head()}")

    if cabin_encoder is None:
        cabin_codes = OrdinalEncoder().fit_transform(df[['Cabin']])
    else:
        cabin_codes = cabin_encoder.transform(df[['Cabin']])
    df['Cabin'] = cabin_codes.astype(int)

    df = df.drop(columns=['PassengerId','Name','Ticket','Fare'])
    print(df.head())
    return df.to_numpy()

In [25]:
from io import StringIO
import pandas as pd
# The text is CSV, so it has to go through read_csv: handing the StringIO to
# pd.DataFrame does not parse the header or split the comma-separated fields.
df_eval = pd.read_csv(StringIO("""PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q"""))

In [34]:
import pandas as pd

# One hand-written passenger record, one single-element list per raw column.
data = dict(
    PassengerId=[893],
    Pclass=[3],
    Name=["Wilkes, Mrs. James (Ellen Needs)"],
    Sex=["female"],
    Age=[47],
    SibSp=[1],
    Parch=[0],
    Ticket=["363272"],
    Fare=[7],
    Cabin=[None],  # cabin is missing for this passenger
    Embarked=["S"],
)

df_eval = pd.DataFrame(data)
print(df_eval)


   PassengerId  Pclass                              Name  ... Fare  Cabin  Embarked
0          893       3  Wilkes, Mrs. James (Ellen Needs)  ...    7   None         S

[1 rows x 11 columns]


In [37]:
# Replace the hand-built df_eval with the contents of test.csv from the
# data-ingestion artifacts folder. The path is relative to the working
# directory, which the first cell moved one level up with os.chdir('../').
df_eval = pd.read_csv('./artifacts/data_ingestion/test.csv')
# Bare name as the last expression: the notebook displays the frame.
df_eval

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [49]:
# Reference labels for evaluation: every column of the file except the
# passenger id, kept both as a DataFrame and as a NumPy array.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission
# rather than ground-truth survival labels — confirm before reading the
# resulting report as model accuracy.
y_target = pd.read_csv('./artifacts/data_ingestion/gender_submission.csv').drop(columns=['PassengerId'])
y_target_np = y_target.to_numpy()

In [51]:
# Number of leading rows compared in the report below; None compares every row.
N_EVAL_ROWS = 10

config = ConfigurationManager()
modletrainer_config = config.get_model_trainer()

# Pass a copy so this cell can be re-run: the feature conversion must never
# get the chance to rewrite df_eval itself.
features = convert_input_data_exp_feature_vector(df_eval.copy())

# joblib.load unpickles the file, which can execute arbitrary code; only load
# model artifacts produced by this project.
model_load = joblib.load(os.path.join(modletrainer_config.root_dir, 'model.pkl'))
predicted_out = model_load.predict(features)

print(classification_report(
    y_target_np[:N_EVAL_ROWS],
    predicted_out[:N_EVAL_ROWS],
    target_names=['Not Survived', 'Survived'],
))

[2025-04-01 07:25:39,548: INFO: common: yaml file: config\config.yaml loaded succesfully..]
[2025-04-01 07:25:39,555: INFO: common: yaml file: params.yaml loaded succesfully..]
[2025-04-01 07:25:39,560: INFO: common: Create Directory at :artifacts]
[2025-04-01 07:25:39,566: INFO: common: Create Directory at :artifacts/model_trainer]


Cabin data after assigned values: 0    G
1    G
2    F
3    G
4    G
Name: Cabin, dtype: object
   Pclass  Sex  Age  SibSp  Parch  Cabin  Embarked
0       3    1    2      0      0      6         2
1       3    0    3      1      0      6         1
2       2    1    5      0      0      5         2
3       3    1    1      0      0      6         1
4       3    0    1      1      1      6         1
              precision    recall  f1-score   support

Not Survived       0.86      1.00      0.92         6
    Survived       1.00      0.75      0.86         4

    accuracy                           0.90        10
   macro avg       0.93      0.88      0.89        10
weighted avg       0.91      0.90      0.90        10

