In [None]:
import re
from pathlib import Path

import pandas as pd

In [None]:
# Load the raw Myntra dataset; the path is relative to the notebook's working directory.
myntra = pd.read_csv('myntra_dataset.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
myntra.head()

In [None]:
# (rows, columns) of the freshly loaded data, before any rows are dropped.
myntra.shape

In [None]:
# Keep only the first row for every distinct title, then show the new shape.
myntra = myntra.drop_duplicates(subset='title', keep='first')
myntra.shape

In [None]:
# All the features (column labels) of the dataset, kept in `features` and printed.
features = myntra.columns
print(features)

In [None]:
# Count the missing values in each column.
myntra.isna().sum()

In [None]:
# Number of rows per product_id; any count above 1 means the id is repeated.
myntra['product_id'].value_counts()

In [None]:
# Keep only the first row for every distinct product_id, then show the new shape.
myntra = myntra.drop_duplicates(subset='product_id', keep='first')
myntra.shape

In [None]:
# Peek at the second remaining row. Positional access is used because the
# de-duplication steps above may have removed the row labelled 1, in which
# case a label lookup would raise KeyError.
myntra['inventory'].iloc[1]

In [None]:
# First replace missing inventory values with "" so that len() in
# updateInventory is never called on a missing value.
myntra.fillna({"inventory": ""}, inplace=True)

# Inventory strings shorter than two characters are replaced with a single space
def updateInventory(x):
    """Return " " when `x` has fewer than two characters, otherwise `x` unchanged."""
    return x if len(x) >= 2 else " "

# Normalise too-short inventory strings in place, then display the column.
myntra['inventory'] = myntra['inventory'].apply(updateInventory)
myntra['inventory']

In [None]:
# Flatten the " | "-separated inventory string into lowercase words that can be used in model training.
# Only runs of non-word characters are collapsed; digits and underscores are kept.
def preprocessInventory(x):
    """Return `x` as lowercase words separated by single spaces.

    Args:
        x: inventory string whose entries are separated by " | ".

    Returns:
        The string with the " | " separators replaced by spaces, every run of
        non-word characters collapsed to one space, surrounding whitespace
        stripped, and all letters lowercased.
    """
    joined = " ".join(x.split(" | "))

    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    cleanString = re.sub(r'\W+', ' ', joined).strip()
    return cleanString.lower()

# Store the cleaned inventory text in a new column; the 'inventory' column itself is not modified here.
myntra['Pre_Inventory'] = myntra['inventory'].apply(preprocessInventory)


# Turn the "/"-separated type string into space-separated lowercase words for the model
def preprocessType(x):
    """Replace every "/" in `x` with a space, strip the ends and lowercase the result."""
    return x.replace("/", " ").strip().lower()

# Overwrite 'type' with its normalised form (the original "/"-separated text is not kept).
myntra['type'] = myntra['type'].apply(preprocessType)


# Keep only ASCII letters in the body text: every other run of characters becomes one space
def preprocessBody(x):
    """Return `x` lowercased, with each run of non-letter characters replaced by a space.

    The result is not stripped, so a leading or trailing space can remain.
    """
    return re.sub(r"[^A-Za-z]+", ' ', x).lower()

# Store the letters-only body text in a new column; 'body' itself is left as loaded.
myntra['preprocess_body'] = myntra['body'].apply(preprocessBody)


def preprocessCareInstruction(x):
    """Normalise one care-instruction value to lowercase, space-separated words.

    Args:
        x: a care-instruction string, a number, or a missing value (NaN/None).

    Returns:
        For strings: the text with hyphens removed, every run of non-word
        characters (including " | " separators and non-breaking spaces)
        collapsed to one space, stripped and lowercased.
        For real numbers: the same treatment applied to ``str(x)``.
        For missing or unsupported values: ``""``.
    """
    if isinstance(x, str):
        # Hyphens are deleted (not replaced by a space) so hyphenated words fuse.
        without_hyphens = x.replace('-', '')
        # Raw string: '\W' in a normal literal is an invalid escape sequence.
        # \W+ also swallows '\xa0' and the " | " separators.
        cleanString = re.sub(r'\W+', ' ', without_hyphens).strip()
        return cleanString.lower()

    # NaN is a float: without the isna() guard it would be turned into the
    # literal text "nan" instead of being treated as missing.
    if isinstance(x, (float, int)) and not pd.isna(x):
        return preprocessCareInstruction(str(x))

    return ""

# Overwrite 'care_instructions' with its normalised form.
myntra['care_instructions'] = myntra['care_instructions'].apply(preprocessCareInstruction)

In [None]:
# Inspect the normalised type of the row labelled 0.
myntra.loc[0, 'type']

In [None]:
# Inspect the cleaned body text of the row labelled 0.
myntra.loc[0, 'preprocess_body']

In [None]:
# Peek at the third remaining row. Positional access is used because the
# de-duplication steps above may have removed the row labelled 2, in which
# case a label lookup would raise KeyError.
myntra['care_instructions'].iloc[2]

In [None]:
# Replace missing values in these text columns with "" (single call, one entry per column)
myntra.fillna(
    {
        'care_instructions': "",
        'dominant_material': "",
        'dominant_color': "",
        'actual_color': "",
        'specifications': "",
    },
    inplace=True,
)

In [None]:
# A product can have several images, so convert the " | "-separated string into a list
def image_process(x):
    """Split `x` on " | " and return the resulting list of image entries."""
    return x.split(" | ")

# Turn every image string into a list, then inspect the list of the row labelled 0.
myntra['images'] = myntra['images'].apply(image_process)
myntra.loc[0, 'images']

In [None]:
# Remove every product with fewer than 4 images (img1..img4 are extracted later).
# .copy() makes the result an independent frame, so the column assignments
# that follow do not raise SettingWithCopyWarning.
myntra = myntra[myntra['images'].apply(len) >= 4].copy()

In [None]:
# Preview the first rows after the image-count filter.
myntra.head()

In [None]:
# Drop the unnecessary 'Unnamed: 25' column, remove every row that still
# contains a null value, and reset the index. The old row labels are kept
# in a new 'index' column because reset_index is called without drop=True.
myntra = (
    myntra
    .drop(columns='Unnamed: 25')
    .dropna()
    .reset_index()
)

In [None]:
# Set product_id = 'index' column. That column was created by the preceding
# reset_index call, so it presumably holds the row labels from before the
# reset rather than the new 0..n-1 positions — TODO confirm this is intended.
myntra['product_id'] = myntra['index']

In [None]:
# Title-based de-duplication again. Titles were already de-duplicated right after
# loading and are not modified in between, so this is presumably a no-op safety net.
myntra = myntra.drop_duplicates(subset=['title'])

In [None]:
# Number of products left after cleaning.
len(myntra)

In [None]:
# Expose the first four image entries as separate columns img1..img4.
for slot in range(4):
    # slot is bound as a default argument so each lambda reads its own position.
    myntra[f'img{slot + 1}'] = myntra['images'].apply(lambda entries, slot=slot: entries[slot])

In [None]:
# Column labels after adding img1..img4.
myntra.columns

In [None]:
# Build the text corpus for model training by joining the columns below,
# in this order, with single spaces.
corpus_columns = [
    'size', 'care_instructions', 'dominant_material', 'actual_color',
    'dominant_color', 'product_type', 'preprocess_body', 'type',
    'ideal_for', 'Pre_Inventory',
]
corpus = myntra[corpus_columns[0]]
for column in corpus_columns[1:]:
    corpus = corpus + " " + myntra[column]
myntra['corpusData'] = corpus

In [None]:
# Column labels after adding corpusData.
myntra.columns

In [None]:
# Preview the first rows only instead of rendering the whole frame.
myntra.head()

In [None]:
# Remove the columns listed below and renumber the rows from 0.
drop_columns = [
    'index', 'crawl_timestamp', 'uniq_id', 'variant_sku', 'link',
    'care_instructions', 'is_in_stock', 'images', 'size_fit', 'specifications',
]
myntra = myntra.drop(columns=drop_columns).reset_index(drop=True)

In [None]:
# Store the cleaned data in processed_data/clean_myntra.csv.
# The target directory is created first because to_csv fails when it does not exist.
# The row index is still written (to_csv default), so the file's first column is the index.
output_path = Path('processed_data') / 'clean_myntra.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
myntra.to_csv(output_path)