In [39]:
import os
import pickle as pkl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [9]:
# IPython automagic: list the contents of the current working directory.
ls

01_data_ingestion.ipynb       04_data_training.ipynb
02_data_validation.ipynb      trials.ipynb
03_data_transformation.ipynb


In [10]:
# The notebook starts one level below the project root, but everything further
# down (`artifacts/...` paths, the `src.mlProject` import) is resolved relative
# to the root. Step up only while the root's `src` directory is not visible from
# the current directory, so re-running this cell does not keep climbing.
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()


/Users/gojuruakshith/YouTube-Fake-Thumbnail-Detector


In [27]:
# Load the transformed training split and look at its `clickbait` column
# as a flat NumPy array.
df = pd.read_csv("artifacts/data_transformation/train.csv")
df["clickbait"].values.ravel()

array([0, 1, 0, ..., 1, 1, 0], shape=(25600,))

In [34]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage."""

    # Directory the pickled model is written into.
    root_dir: Path
    # Location of the training-split CSV.
    train_data_path: Path
    # Location of the test-split CSV.
    test_data_path: Path
    # File name of the pickled model, joined onto ``root_dir``.
    model_name: str
    # Value forwarded to ``LogisticRegression(l1_ratio=...)``.
    l1_ratio: float
    # Name of the label column in the training data.
    target_column: str



In [36]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml,create_directories
class ConfigurationManager:
    """Reads the project's YAML settings and assembles per-stage config objects.

    The config, params and schema documents are loaded once in ``__init__`` and
    kept on the instance as ``config``, ``params`` and ``schema``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAM_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML documents with ``read_yaml``.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the hyper-parameter YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the settings object for the model-training stage.

        Pulls the ``model_trainer`` section of the config, the
        ``LogisticRegression`` section of the params and the ``TARGET_COLUMN``
        section of the schema, hands the trainer's ``root_dir`` to
        ``create_directories``, and packs the values into a
        ``ModelTrainerConfig``.

        Returns:
            ModelTrainerConfig: The populated, frozen settings object.
        """
        trainer_section = self.config.model_trainer
        logreg_params = self.params.LogisticRegression
        target_section = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        # NOTE(review): values are forwarded exactly as read from YAML, while
        # ModelTrainerConfig annotates the path fields as Path — confirm whether
        # coercion is wanted.
        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            l1_ratio=logreg_params.l1_ratio,
            target_column=target_section.name,
        )


In [49]:
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction.text import TfidfVectorizer

class ModelTrainer:
    """Fits a TF-IDF + logistic-regression pipeline on text and pickles it."""

    # Name of the text column fed to the vectorizer.
    FEATURE_COLUMN = "headline"

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training settings on the instance.

        Args:
            config: Supplies the training CSV path, output directory, model file
                name, ``l1_ratio`` and target column name.
        """
        self.config = config

    def train(self):
        """Fit the pipeline on the training CSV and write it to disk.

        Reads ``config.train_data_path``, takes ``FEATURE_COLUMN`` (cast to
        ``str``) as the input text and ``config.target_column`` as the labels,
        fits the pipeline, and pickles the fitted pipeline to
        ``<config.root_dir>/<config.model_name>``.

        The test split is deliberately not loaded here: nothing in training
        uses it, so reading it only costs I/O and would make training fail
        whenever that file is absent.
        """
        train_data = pd.read_csv(self.config.train_data_path)

        X_train = train_data[self.FEATURE_COLUMN].astype(str)
        y_train = train_data[self.config.target_column]

        # NOTE(review): whether `l1_ratio` takes effect without an explicit
        # `penalty` depends on the installed scikit-learn release — confirm
        # against that version's LogisticRegression documentation.
        model = LogisticRegression(
            l1_ratio=self.config.l1_ratio,
            random_state=42,
            solver="saga",
            max_iter=1000,
        )
        pipe = Pipeline([
            ("vectorizer", TfidfVectorizer(ngram_range=(1, 2), stop_words="english")),
            ("model", model),
        ])
        pipe.fit(X_train, y_train)

        # Persist the whole pipeline so the fitted vectorizer travels with the model.
        model_path = os.path.join(self.config.root_dir, self.config.model_name)
        with open(model_path, "wb") as f:
            pkl.dump(pipe, f)



In [50]:
# Build the trainer settings from the YAML files, then fit and persist the model.
# No try/except here: a handler that only re-raises adds nothing, so any failure
# is left to propagate with its original traceback.
config_manager = ConfigurationManager()
model_trainer_config = config_manager.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2026-01-07 21:31:41,842: INFO: common: YAML file loaded successfully: config/config.yaml]
[2026-01-07 21:31:41,844: INFO: common: YAML file loaded successfully: params.yaml]
[2026-01-07 21:31:41,845: INFO: common: YAML file loaded successfully: schema.yaml]
[2026-01-07 21:31:41,846: INFO: common: Created directory at: artifacts/model_trainer]
