In [1]:
import os

In [2]:
%pwd

'/home/karthik-111/professional/Data_Science/projects/CRSystem/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/home/karthik-111/professional/Data_Science/projects/CRSystem'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    original_data: str
    distances_data: str 
    ds_cols : str
    Scaling : str

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories,load_bin,save_bin,save_bin_data,save_bin_array

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            original_data= config.original_data,
            distances_data= config.distances_data,
            ds_cols=config.ds_cols  , 
            Scaling = config.Scaling
        )

        return data_transformation_config


In [8]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from typing import Any
import numpy as np

In [21]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.data = pd.read_csv(self.config.data_path)
        
    def original(self):
        save_bin_data(self.data, Path(os.path.join(self.config.root_dir, self.config.original_data)))
        Scaling= StandardScaler()
        Scaling.fit(self.data[["Total_Tuition_Cost"]])
        save_bin(Scaling,Path(os.path.join(self.config.root_dir, self.config.Scaling)))

    
    def transform(self):
        self.data["Online"]=self.data["Online"].apply(lambda x : 1 if x == "Online" else 0)
        self.data["Need_GRE"]=self.data["Need_GRE"].apply(lambda x : 1 if x == "Needed" else 0)
        self.data["Institution Type"]=self.data["Institution Type"].apply(lambda x : 1 if x == "Public" else 0)
        sc= StandardScaler()
        self.data[["Ranking","Total_Tuition_Cost","Median_Salary_10yr"]]=sc.fit_transform(self.data[["Ranking",
                                                                                       "Total_Tuition_Cost",
                                                                                       "Median_Salary_10yr"]])
    
    def distances(self):
        df  = self.data.copy()
        pd.set_option('future.no_silent_downcasting', True)
        df.drop(["LINK"],axis="columns",inplace=True)
        df.drop(["School Name"],axis="columns",inplace=True)
        df=pd.get_dummies(df)
        df.replace(False,0,inplace=True)
        df.replace(True,1,inplace=True)
        cos_sim = cosine_similarity(df)
        save_bin_array(cos_sim,Path(os.path.join(self.config.root_dir,self.config.distances_data)))
    
    def save_train_Data(self):
        df  = self.data.copy()
        df=df.drop(["State","Ranking","Min_Undergraduate_GPA",
                    "Median_Salary_10yr","LINK"],axis="columns")  
        dummies=pd.get_dummies(df.drop(["School Name"],axis="columns"))
        df = pd.concat([df["School Name"],dummies],axis="columns")
        df.replace(False,0,inplace=True)
        df.replace(True,1,inplace=True)  
        
        df.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)

        logger.info("Saved data for training")
        ds_cols  =np.array(df.columns.to_list()[1:])
        save_bin_array(ds_cols,Path(os.path.join(self.config.root_dir,self.config.ds_cols)))

        logger.info("Saved columns names")

In [22]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
except Exception as e:
    raise e

[2024-03-25 11:22:27,500: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-25 11:22:27,507: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-25 11:22:27,512: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-25 11:22:27,514: INFO: common: created directory at: artifacts]
[2024-03-25 11:22:27,516: INFO: common: created directory at: artifacts/data_transformation]


In [25]:
data_transformation.original()


[2024-03-25 11:23:29,639: INFO: common: binary file saved at: artifacts/data_transformation/original.joblib]
[2024-03-25 11:23:29,655: INFO: common: binary file saved at: artifacts/data_transformation/Scaling.joblib]


Unnamed: 0,School Name,State,City,Ranking,Online,Total_Tuition_Cost,Program_Years_Full_Time,Min_Undergraduate_GPA,Median_Salary_10yr,Need_GRE,Institution Type,LINK
0,Columbia University,NY,New York City,5,Offline,68160,1.0,3.7,132100,Not Needed,Private,
1,University of Texas-Austin,TX,Austin,45,Online,10000,2.0,3.0,115600,Needed,Public,
2,University of Southern California,CA,Los Angeles,17,Offline,47880,1.5,3.5,126300,Not Needed,Private,http://www.marshall.usc.edu
3,University of Miami,FL,Miami,87,Offline,66390,1.5,3.0,101000,Not Needed,Private,http://www.bus.miami.edu
4,Syracuse University,NY,Syracuse,113,Online,58956,1.5,3.0,109100,Not Needed,Private,https://requestinfo.onlinebusiness.syr.edu
...,...,...,...,...,...,...,...,...,...,...,...,...
109,Georgia State University,GA,Atlanta,241,Offline,45000,1.0,3.0,90900,Not Needed,Public,http://specialized.robinson.gsu.edu
110,Pace University,NY,New York City,243,Offline,41250,1.0,3.0,111800,Not Needed,Private,
111,Pace University,NY,New York City,243,Online,31500,1.0,3.0,111800,Not Needed,Private,
112,Chapman University,CA,Orange,246,Offline,54095,2.0,3.0,102100,Needed,Private,http://www.chapman.edu


In [24]:
data_transformation.original()["LINK"]

[2024-03-25 11:22:39,566: INFO: common: binary file saved at: artifacts/data_transformation/original.joblib]
[2024-03-25 11:22:39,591: INFO: common: binary file saved at: artifacts/data_transformation/Scaling.joblib]


0                                             NaN
1                                             NaN
2                     http://www.marshall.usc.edu
3                        http://www.bus.miami.edu
4      https://requestinfo.onlinebusiness.syr.edu
                          ...                    
109           http://specialized.robinson.gsu.edu
110                                           NaN
111                                           NaN
112                        http://www.chapman.edu
113                                           NaN
Name: LINK, Length: 114, dtype: object

In [18]:
data_transformation.transform()

In [19]:
data_transformation.distances()

[2024-03-25 11:20:49,337: INFO: common: binary file saved at: artifacts/data_transformation/distances_Data.joblib]


In [20]:
data_transformation.save_train_Data()

[2024-03-25 11:20:49,780: INFO: 2166119786: Saved data for training]
[2024-03-25 11:20:49,785: INFO: common: binary file saved at: artifacts/data_transformation/ds_cols.joblib]
[2024-03-25 11:20:49,787: INFO: 2166119786: Saved columns names]
