In [1]:
import os

In [2]:
%pwd

'c:\\Users\\User\\Desktop\\PROJECTS\\Student-predictor\\student app\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\User\\Desktop\\PROJECTS\\Student-predictor\\student app'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    train_df: Path

@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    training_data: Path
    trained_model_path: Path


In [6]:
from studentApp.constants import *
from studentApp.utils.common import read_yaml, create_directories
import pandas as pd

In [7]:
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        
        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=Path(config.root_dir),
            train_df=Path(config.train_df),
          
        )

        return  data_transformation_config

    def get_training_config(self):
        training = self.config.training
        training_data_path = 'artifacts/data_ingestion/train.csv'  # Replace with the correct file path
        training_root_dir = 'artifacts'  # Replace with the desired root directory path

        create_directories([Path(training_root_dir)])

        training_config = TrainingConfig(
            root_dir=Path(training_root_dir),
            training_data=Path(training_data_path),
            trained_model_path=Path(training.trained_model_path),

            # Add other configuration attributes as needed
        )

        return training_config


In [8]:
import os
import sys
from dataclasses import dataclass

from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from studentApp.constants import *
from studentApp.utils.common import read_yaml, create_directories
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [9]:
class DataTransformation:
    def __init__(self, config):
        self.config = config
        self.train_df = None
        self.train = None
        self.X = None
        self.y = None

    def transformation_data(self):
        train_df = pd.read_csv(self.config.train_df)

        X = train_df.drop(columns=['math_score'], axis=1)
        y = train_df['math_score']

        # Create Column Transformer with 3 types of transformers
        num_features = X.select_dtypes(exclude="object").columns
        cat_features = X.select_dtypes(include="object").columns

        numeric_transformer = StandardScaler()
        oh_transformer = OneHotEncoder()

        preprocessor = ColumnTransformer(
            [
                ("OneHotEncoder", oh_transformer, cat_features),
                ("StandardScaler", numeric_transformer, num_features),
            ]
        )
        self.X = preprocessor.fit_transform(X)
        self.y = y

        print("Data transformation complete")


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso, Ridge
from pathlib import Path
import joblib
import os
import pickle




class Training:
    def __init__(self, config):
        self.config = config
        self.models = {
            "Linear Regression": LinearRegression(),
            "Lasso": Lasso(),
            "Ridge": Ridge(),
            "K-Neighbors Regressor": KNeighborsRegressor(),
            "Decision Tree": DecisionTreeRegressor(),
            "Random Forest Regressor": RandomForestRegressor(),
            "XGBRegressor": XGBRegressor(),
            "CatBoosting Regressor": CatBoostRegressor(verbose=False),
            "AdaBoost Regressor": AdaBoostRegressor()
        }
        self.metrics = {
            "Mean Squared Error": mean_squared_error,
            "R2 Score": r2_score
        }

    def save_model(self, model_name, model):
        save_directory = self.config.trained_model_path
        save_path = save_directory / "model.pkl"
        with open(save_path, "wb") as f:
            pickle.dump(model, f)
        print(f"The best model '{model_name}' has been saved.")

    def evaluate_model(self, model, X, y):
        scores = {}
        for metric_name, metric_func in self.metrics.items():
            y_pred = model.predict(X)
            score = metric_func(y, y_pred)
            scores[metric_name] = score
        return scores

    def train(self, X_train, y_train):
        best_model = None
        best_model_name = ""
        best_scores = None
        total_models = len(self.models)
        current_model = 1

        for model_name, model in self.models.items():
            model.fit(X_train, y_train)  # Train model

            # Evaluate the model using multiple metrics
            scores = self.evaluate_model(model, X_train, y_train)

            if best_scores is None or all(score > best_scores[metric] for metric, score in scores.items()):
                best_scores = scores
                best_model_name = model_name
                best_model = model

            # Display metrics for all models
            print(f"Model: {model_name}")
            for metric, score in scores.items():
                print(f"- {metric}: {score:.4f}")

            # Calculate and display percentage progress
            progress = current_model / total_models * 100
            print(f"Training Progress: {progress:.2f}%")
            current_model += 1

        if best_model is not None:
            self.save_model(best_model_name, best_model)
            print("\nBest Model Scores:")
            for metric, score in best_scores.items():
                print(f"- {metric}: {score:.4f}")
        else:
            print("No best model found.")

In [20]:
# Integration of DataTransformation and Training classes
try:

    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transformation_data()

    
    # data_transformation = DataTransformation(config)  # Create an instance of DataTransformation
    # data_transformation.transformation_data()  # Perform data transformation

    X_train = data_transformation.X
    y_train = data_transformation.y
    
    training_config = config.get_training_config()
    training = Training(config=training_config)
    # training = Training()  # Create an instance of Training
    training.train(X_train, y_train) 

except Exception as e:
    raise e

[2023-07-18 19:28:25,198: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-07-18 19:28:25,205: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-18 19:28:25,206: INFO: common: created directory at: artifacts]
[2023-07-18 19:28:25,213: INFO: common: created directory at: artifacts/data_ingestion]
Data transformation complete
[2023-07-18 19:28:25,288: INFO: common: created directory at: artifacts]
Model: Linear Regression
- Mean Squared Error: 29.1296
- R2 Score: 0.8678
Training Progress: 11.11%
Model: Lasso
- Mean Squared Error: 43.8834
- R2 Score: 0.8009
Training Progress: 22.22%
Model: Ridge
- Mean Squared Error: 29.0001
- R2 Score: 0.8684
Training Progress: 33.33%


Model: K-Neighbors Regressor
- Mean Squared Error: 34.0292
- R2 Score: 0.8456
Training Progress: 44.44%
Model: Decision Tree
- Mean Squared Error: 0.1176
- R2 Score: 0.9995
Training Progress: 55.56%
Model: Random Forest Regressor
- Mean Squared Error: 5.1365
- R2 Score: 0.9767
Training Progress: 66.67%
Model: XGBRegressor
- Mean Squared Error: 1.2184
- R2 Score: 0.9945
Training Progress: 77.78%
Model: CatBoosting Regressor
- Mean Squared Error: 10.0770
- R2 Score: 0.9543
Training Progress: 88.89%
Model: AdaBoost Regressor
- Mean Squared Error: 34.2818
- R2 Score: 0.8445
Training Progress: 100.00%
The best model 'Linear Regression' has been saved.

Best Model Scores:
- Mean Squared Error: 29.1296
- R2 Score: 0.8678


In [12]:
# try:

  

#     training_config = config.get_training_config()
#     training = Training(config=training_config)
#     training.train_generator()
#     training.train(
#         callback_list=callback_list
#     )
    
# except Exception as e:
#     raise e