# ETL

## Importing necessary Libraries

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import ast
import os

## Loading the Dataset (Extract)

In [None]:
# Directory holding the raw input files; the ETL loop below reads every
# '.csv' file found directly inside it.
directory = '/content'

In [None]:
# Output directory that transformData writes the cleaned CSV files into.
new_directory = '/content/cleaned_data'
# exist_ok=True: do not fail when the directory already exists (e.g. on a re-run).
os.makedirs(new_directory, exist_ok=True)

## Transforming the Data

In [None]:
# Function to clean HTML tags from a text
def clean_html(text):
    """Strip HTML markup from ``text`` and return the visible text.

    Args:
        text: HTML (or plain-text) string. Non-string values, such as
            ``None`` or the float NaN pandas uses for a missing cell, are
            treated as empty.

    Returns:
        The text content with tags removed: each text fragment is stripped
        and the fragments are joined with a single space. Returns ``""``
        for non-string input.
    """
    # Missing cells are not markup; return empty text instead of handing a
    # float/None to the parser.
    if not isinstance(text, str):
        return ""

    # Parse with the stdlib-backed parser and keep only the text nodes.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ", strip=True)


# Split 'location' into 'city' and 'province'.
# Expected format is 'City, Province'; any location mentioning 'remote'
# (e.g. 'Remote in City, Province') is mapped to ('Remote', 'Remote').
def parse_location(location):
    """Derive a ``(city, province)`` pair from a location string.

    * Any location containing "remote" (case-insensitive) yields
      ``('Remote', 'Remote')``.
    * A string with exactly one comma is split into its stripped city and
      province parts.
    * Anything else is returned unchanged as the city, with ``'Unknown'``
      as the province.
    """
    if 'remote' in location.lower():
        return 'Remote', 'Remote'

    parts = location.split(',')
    if len(parts) != 2:
        # Zero or several commas: the shape is not 'City, Province'.
        return location, 'Unknown'

    city, province = (part.strip() for part in parts)
    return city, province

def convert_to_list(skills_str):
    """Parse a stringified Python list (e.g. ``"['SQL', 'Python']"``).

    Args:
        skills_str: Text holding a Python list literal. A value that is
            already a list is returned unchanged.

    Returns:
        The parsed list. A tuple literal is converted to a list. Anything
        that cannot be parsed, or that parses to something other than a
        list/tuple (a number, a bare string, a dict, NaN, ...), yields ``[]``
        so the result is always a list.
    """
    if isinstance(skills_str, list):
        return skills_str

    try:
        parsed = ast.literal_eval(skills_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # A valid literal that is not a sequence of skills is not a list either.
    return []

In [None]:
# Function to Transform Data:
def transformData(data_scientist_df, filename):
    """Clean one job-postings DataFrame in place and write it to CSV.

    Adds the columns ``description_clean``, ``city``, ``province`` and
    ``skills_list`` to ``data_scientist_df``, replaces missing ``location``
    values with ``'Unknown'``, and saves the result as ``cleaned_<filename>``
    inside the module-level ``new_directory``.

    Args:
        data_scientist_df: DataFrame with ``description``, ``location`` and
            ``skills`` columns. It is modified in place.
        filename: Name of the source file; used to build the output name.
    """
    df = data_scientist_df

    # Treat missing descriptions as empty text so the HTML cleaner only ever
    # receives strings.
    df['description_clean'] = df['description'].fillna('').apply(clean_html)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selected column: that chained form is deprecated in pandas 2.x
    # and does not update the DataFrame under Copy-on-Write.
    df['location'] = df['location'].fillna('Unknown')

    # Build both columns from plain lists. Unlike a row-wise apply that
    # returns a Series per row, this also works on an empty DataFrame.
    parsed_locations = [parse_location(loc) for loc in df['location']]
    df['city'] = [city for city, _ in parsed_locations]
    df['province'] = [province for _, province in parsed_locations]

    # Turn the stringified skills list into a real Python list.
    df['skills_list'] = df['skills'].apply(convert_to_list)

    cleaned_file_path = os.path.join(new_directory, f'cleaned_{filename}')
    df.to_csv(cleaned_file_path, index=False)

In [None]:
# Run the ETL over every CSV file found directly in `directory`.
# The names are sorted because os.listdir returns entries in arbitrary order
# and `data_scientist_df` still holds the last file read once the loop ends,
# so sorting makes that leftover value reproducible between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):  # only CSV inputs are handled
        continue

    file_path = os.path.join(directory, filename)

    # Extract: load the raw file.
    data_scientist_df = pd.read_csv(file_path)

    # Transform + load: clean the frame and write it to the output directory.
    transformData(data_scientist_df, filename)

## Load

In [None]:
# Define the path for the cleaned dataset (relative, so it resolves against
# the current working directory).
# NOTE(review): at this point `data_scientist_df` is whichever DataFrame the
# ETL loop above read last, while the file name says "Marketing_Manager" —
# confirm the two actually correspond.
cleaned_file_path = 'Cleaned_Marketing_Manager.csv'

# Save the cleaned dataframe to a CSV file, excluding the index
data_scientist_df.to_csv(cleaned_file_path, index=False)

# Provide the path to the cleaned dataset (displayed as the cell's output)
cleaned_file_path


'Cleaned_Marketing_Manager.csv'

In [None]:
# NOTE(review): the recorded output shows this read raising a ParserError.
# None of the cells visible above write a file with this exact name
# (transformData writes 'cleaned_<filename>' into /content/cleaned_data) —
# confirm which file this is meant to load.
df = pd.read_csv("/content/Cleaned_Data_Scientist.csv")

ParserError: ignored