In [1]:
!pip install hydra-core omegaconf 

Collecting hydra-core
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25ldone
[?25h  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144591 sha256=acd615fb7bb8cfdf8feb5074dda0464d303ea8a2c427caec0bc30fe4dd57e641
  Stored in directory: /teamspace/studios/this_studio/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88
Successfully built antlr4-python3-runtime
Installing collected packages: antlr4-python3-runtime, om

In [1]:
import yaml

yaml_content = """
data:
  raw_data_path: data/raw
  processed_data_path: data/processed
  train: data/raw/train.csv
  test: data/raw/test.csv
  id_column: PassengerId
  features: ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
  target_column: Survived
  test_size: 0.2
  random_state: 42

preprocessing:
  num_features: ["Age", "Fare", "SibSp", "Parch"]
  cat_features: ["Pclass", "Sex", "Embarked"]
  num_strategy: "median"
  cat_strategy: "most_frequent"

model:
  model_name: "RandomForest"
  model_path: models
  trained_model_path: ${pipeline.model.model_path}/${pipeline.model.model_name}
  params:
    n_estimators: 100
    max_depth: 10
    random_state: 42

evaluate:
  model_name: ${pipeline.model.model_name}
  model_path: ${pipeline.model.model_path}
  trained_model_path:  ${pipeline.model.trained_model_path}
  reports_path: reports
"""

# Save the YAML content to a file
with open("config.yaml", "w") as f:
    f.write(yaml_content)

In [2]:
# import hydra
# hydra.core.global_hydra.GlobalHydra.instance().clear()


In [3]:
from hydra import initialize, compose

# Initialize Hydra and load config
initialize(config_path=".", version_base=None)  
cfg = compose(config_name="config")  

# Access the configuration values
print(cfg.data.raw_data_path)  
print(cfg.model.model_name)    


data/raw
RandomForest


In [4]:
print(cfg.data.train)    


data/raw/train.csv


In [5]:
import pandas as pd
# Access the full file paths from config
train_file_path = cfg.data.train  
test_file_path = cfg.data.test   

In [6]:
print(train_file_path)
print(test_file_path)


data/raw/train.csv
data/raw/test.csv


In [7]:
# If CSVs are available:
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

# Drop unused column
train.drop("Cabin", axis=1, inplace=True)
test.drop("Cabin", axis=1, inplace=True)

In [8]:
X = train[cfg.data.features]
y = train[cfg.data.target_column]
X_test = test[cfg.data.features]

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=cfg.data.test_size,
    random_state=cfg.data.random_state
)


In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_transformer = SimpleImputer(strategy=cfg.preprocessing.num_strategy)
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy=cfg.preprocessing.cat_strategy)),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, list(cfg.preprocessing.num_features)),
    ("cat", cat_transformer, list(cfg.preprocessing.cat_features))
])


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_name = cfg.model.model_name  

if model_name == "RandomForest":
    model = RandomForestClassifier(
        n_estimators=cfg.model.params.n_estimators,
        max_depth=cfg.model.params.max_depth,
        random_state=cfg.model.params.random_state
    )
elif model_name == "LogisticRegression":
    model = LogisticRegression(max_iter=1000)
else:
    raise ValueError("Unsupported model specified in config.")


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model)
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_val)

# Evaluate
print(f"\nModel: {model_name}")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

# Debug: Confirm column names
print("X_train columns:", X_train.columns.tolist())
print("X_val columns:", X_val.columns.tolist())



Model: RandomForest
Accuracy: 0.8268156424581006
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       105
           1       0.83      0.73      0.78        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.82       179

X_train columns: ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Fare', 'Embarked']
X_val columns: ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Fare', 'Embarked']


In [25]:
# Predict on test data using the trained pipeline
test_predictions = pipeline.predict(X_test)
print("Test predictions complete.")


Test predictions complete.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Drop unused column
train.drop("Cabin", axis=1, inplace=True)
test.drop("Cabin", axis=1, inplace=True)

# Features and target
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
target = "Survived"

X = train[features]
y = train[target]
X_test = test[features]

# Define numerical and categorical columns
num_features = ["Age", "Fare", "SibSp", "Parch"]
cat_features = ["Pclass", "Sex", "Embarked"]

# Define transformers
num_transformer = SimpleImputer(strategy="median")
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
])

# Define two models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate each model
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    
    print(f"\nModel: {name}")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("Classification Report:\n", classification_report(y_val, y_pred))
    
    # Track best model
    best_pipeline = None
    best_model_name = None
    best_accuracy = 0
    
    # Train and evaluate each model
    for name, model in models.items():
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("classifier", model)
        ])
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
    
        if acc > best_accuracy:
            best_accuracy = acc
            best_pipeline = pipeline
            best_model_name = name
    
    # Predict on test data using the best-performing model
    test_predictions = best_pipeline.predict(X_test)
    
    print(f"\nTest predictions made using best model: {best_model_name}")




Model: Random Forest
Accuracy: 0.8268156424581006
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.76      0.78        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179


Test predictions made using best model: Random Forest

Model: Logistic Regression
Accuracy: 0.7988826815642458
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Test predictions made using best model: Random Forest
