In [1]:
import os

In [2]:
%pwd

'c:\\Users\\vloke\\Documents\\My_Project\\E-learning_recommender_system\\research'

In [3]:
# Step up from research/ to the project root exactly once. The guard keeps a
# re-run of this cell (or a kernel that was already started at the root) from
# climbing one directory further and breaking every relative path below.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\vloke\\Documents\\My_Project\\E-learning_recommender_system'

## 1. Update the Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConig:
    """Immutable bundle of the paths used by the data-transformation stage.

    Every value is copied from the `data_transformation` section of the
    project configuration by `ConfigurationManger.get_data_transformation_config`.
    """

    # Directory created for this stage before the config object is built.
    root_dir: Path
    # CSV read as the input of feature engineering.
    raw_data: str
    # CSV written by feature engineering (one row per course plus rating stats).
    fet_eng_data: str
    # CSV written by text preprocessing (adds the `combined_text` column).
    text_preprocess_data: str
    # CSV written by the final step with only the required feature columns.
    final_data: str
    # joblib file holding the fitted TfidfVectorizer.
    tf_idf_vectorizer: str
    # joblib file holding the TF-IDF matrix produced by the vectorizer.
    transformed_data: str

## 2. Update the Configuration Manager in src config

In [6]:
from E_learning_recommender_system.constants import CONFIG_FILE_PATH
from E_learning_recommender_system.utils.common import read_yaml,create_directories

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vloke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vloke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
class ConfigurationManger:
    """Loads the project configuration and hands out per-stage config entities."""

    def __init__(self,config_filepath = CONFIG_FILE_PATH):
        """Read the configuration file and create the artifacts root.

        Args:
            config_filepath: location of the configuration file passed to
                `read_yaml`; defaults to the project-wide CONFIG_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)
        # presumably creates the folder if it is missing - see utils.common
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self)->DataTransformationConig:
        """Build the DataTransformationConig from the `data_transformation` section.

        The stage's root directory is created before the entity is returned.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # Entity field names mirror the keys of the configuration section 1:1.
        field_names = (
            "root_dir",
            "raw_data",
            "fet_eng_data",
            "text_preprocess_data",
            "final_data",
            "tf_idf_vectorizer",
            "transformed_data",
        )
        return DataTransformationConig(
            **{name: getattr(section, name) for name in field_names}
        )

## 3. Update the Components

In [8]:
import pandas as pd
import joblib
from E_learning_recommender_system.logging import logging
from E_learning_recommender_system.utils.common import preprocess_text
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
class DataTransformation:
    """Turns the raw course-review CSV into the artifacts used downstream.

    Stages (each one is skipped when its output file already exists):

    1. `feature_engineering` - per-course average rating, review count and
       popularity bucket, joined onto the de-duplicated course columns.
    2. `text_preprocessing`  - builds one `combined_text` column from the
       preprocessed tags, description and course name.
    3. `word_embedding`      - fits a TfidfVectorizer on `combined_text` and
       stores both the vectorizer and the resulting matrix with joblib.
    4. `final_data`          - keeps only the required feature columns.

    All paths come from the config object passed to the constructor.
    """

    # Popularity buckets, ordered from fewest to most reviews.
    _POPULARITY_LABELS = ['Low', 'Medium', 'High', 'Very High']
    # Free-text columns merged into `combined_text`, in concatenation order.
    _TEXT_COLUMNS = ['tags', 'description', 'course_name']

    def __init__(self, config: "DataTransformationConig"):
        """Store the stage configuration (paths of every input and output)."""
        self.config = config

    def _popularity_from_counts(self, review_counts):
        """Bucket per-course review counts into the four popularity labels.

        Quartile binning (`pd.qcut`) is used whenever it works. It raises
        ValueError when many courses share the same review count, because the
        quartile edges then collide; in that case the buckets are derived from
        percentile ranks instead, so tied courses still share one label.
        """
        try:
            return pd.qcut(review_counts, q=4, labels=self._POPULARITY_LABELS)
        except ValueError:
            logging.warning(
                "Review-count quartiles are not unique; "
                "falling back to percentile-rank popularity buckets"
            )
            percentile = review_counts.rank(method='average', pct=True)
            return pd.cut(
                percentile,
                bins=[0.0, 0.25, 0.5, 0.75, 1.0],
                labels=self._POPULARITY_LABELS,
                include_lowest=True,
            )

    def feature_engineering(self):
        """Write the per-course feature table to `config.fet_eng_data`.

        Reads `config.raw_data`, aggregates `rating` per `course_id`
        (mean rounded to one decimal, count, popularity bucket) and left-joins
        the result onto the distinct course description columns.
        """
        if os.path.exists(self.config.fet_eng_data):
            logging.info(f"Feature-engineered data already exists at {self.config.fet_eng_data}")
            return

        logging.info(">>>>> Feture Engneering is started <<<<<<")

        df = pd.read_csv(self.config.raw_data)

        # One pass over the groups yields both the average rating and the
        # number of reviews for every course.
        course_stats = (
            df.groupby('course_id')['rating']
            .agg(['mean', 'count'])
            .reset_index()
            .rename(columns={'mean': 'Average rating', 'count': 'Number of reviews'})
        )
        course_stats['Average rating'] = course_stats['Average rating'].round(1)
        course_stats['Popularity'] = self._popularity_from_counts(
            course_stats['Number of reviews']
        )

        # Course-level columns repeat on every review row; keep one copy each.
        selected_columns = ['course_id', 'course_name', 'difficulty', 'description', 'learning', 'tags']
        df_selected = df[selected_columns].drop_duplicates()

        result_df = df_selected.merge(course_stats, on='course_id', how='left')
        result_df.to_csv(self.config.fet_eng_data, index=False)

        logging.info(f">>>>>> Feture Engneering is completed and saved in {self.config.fet_eng_data}")

    def text_preprocessing(self):
        """Write the feature table plus a `combined_text` column.

        Reads `config.fet_eng_data`, runs `preprocess_text` over the tags,
        description and course name, joins the three results with single
        spaces and saves the frame to `config.text_preprocess_data`.
        """
        if os.path.exists(self.config.text_preprocess_data):
            logging.info(f"Preprocessed text data already exists at {self.config.text_preprocess_data}")
            return

        logging.info(">>>>> Text Preprocessing is started <<<<<<")

        df_new = pd.read_csv(self.config.fet_eng_data)

        # Empty CSV cells come back as NaN (a float); replace them with '' so
        # preprocess_text always receives a string and the concatenation below
        # cannot turn a whole row into NaN.
        tags_text, description_text, course_name_text = (
            df_new[column].fillna('').astype(str).apply(preprocess_text)
            for column in self._TEXT_COLUMNS
        )
        df_new['combined_text'] = tags_text + ' ' + description_text + ' ' + course_name_text

        df_new.to_csv(self.config.text_preprocess_data, index=False)

        logging.info(f">>>>>> Text Preprocessing is Completed and saved in {self.config.text_preprocess_data}")

    def word_embedding(self):
        """Fit TF-IDF on `combined_text` and persist vectorizer and matrix.

        The stage is skipped only when BOTH joblib artifacts exist; if either
        one is missing the vectorizer is refitted and both files are rewritten
        so they always belong together.
        """
        artifacts = (self.config.tf_idf_vectorizer, self.config.transformed_data)
        if all(os.path.exists(path) for path in artifacts):
            logging.info(f"TF-IDF matrix already exists at {self.config.transformed_data}")
            return

        logging.info(">>>>>>Word Embending is started with tf-idf Vectorizer<<<<<<")

        df = pd.read_csv(self.config.text_preprocess_data)

        # TfidfVectorizer rejects NaN documents; an empty cell in the CSV
        # would otherwise abort the fit.
        documents = df['combined_text'].fillna('')

        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

        joblib.dump(tfidf_vectorizer, self.config.tf_idf_vectorizer)
        joblib.dump(tfidf_matrix, self.config.transformed_data)

        logging.info(f"TF-IDF vectorizer and matrix  as saved to {self.config.tf_idf_vectorizer} and {self.config.transformed_data}")

    def final_data(self):
        """Write the reduced feature table to `config.final_data`.

        Reads `config.fet_eng_data` and keeps only the columns listed in
        `req_features`.
        """
        if os.path.exists(self.config.final_data):
            logging.info(f"Final data-set already exists at {self.config.final_data}")
            return

        logging.info(">>>>>>Removing Un-necessary features<<<<<<")
        df = pd.read_csv(self.config.fet_eng_data)

        req_features = ['course_name','description','learning','difficulty','Average rating','Popularity']
        df[req_features].to_csv(self.config.final_data, index=False)

        logging.info(">>>>>>final data-set as saved<<<<<<")

## 4. Update the Pipeline

In [10]:
# Run the four transformation stages in dependency order: each stage reads the
# file written by an earlier one.
try:
    config = ConfigurationManger()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.feature_engineering()
    data_transformation.text_preprocessing()
    data_transformation.word_embedding()
    data_transformation.final_data()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # unchanged so the cell still fails visibly.
    logging.exception("Data transformation pipeline failed")
    raise