# EDA on original dataset

In [1]:
import pandas as pd

In [2]:
# Load the product metadata. on_bad_lines='skip' makes pandas silently skip any
# line with too many fields instead of raising a parser error.
df = pd.read_csv('styles.csv', on_bad_lines= 'skip')
df.head()
# Analysis
# 1) The original csv file is malformed: 22 rows (6044, 6569, 7399, 7939, 9026, 10264, 10427, 10905, 11373,
#    11945, 14112, 14532, 15076, 29906, 31625, 33020, 35748, 35962, 37770, 38105, 38275, 38404) contain
#    a comma inside productDisplayName, which creates 1 extra column, so those rows are skipped on load.
#    The original csv file has 44,447 rows, but there are only 44,441 images.
# 2) The file has more features than needed; only id, gender, articleType and usage are kept
#    (see the "Dropping irrelevant features" section below).

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [3]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info() # After skipping bad lines df has 44,424 entries (44,447 - 22 skipped - 1, presumably the header line)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44424 entries, 0 to 44423
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB


In [4]:
# Count how many products carry each gender label.
df['gender'].value_counts()
# Analysis:
#   1) The dataset covers Men, Women, Unisex, Boys and Girls. A male-only dataset
#      can be built by combining Men, Boys and Unisex.

gender
Men       22147
Women     18631
Unisex     2161
Boys        830
Girls       655
Name: count, dtype: int64

# Preprocessing of Original DS

## Dropping irrelevant features

In [5]:
# Keep only the features needed downstream (everything else in styles.csv is discarded).
# Selecting the columns to keep, instead of dropping the others in place, makes this
# cell safe to re-run: the original drop(..., inplace=True) raised a KeyError on a
# second execution because the dropped columns were already gone. It also removes the
# redundant axis=1 that was passed together with columns=.
# .copy() yields an independent frame, so later in-place edits (e.g. dropna) do not
# operate on a selection of the originally loaded frame.
df = df[['id', 'gender', 'articleType', 'usage']].copy()
df.head()

Unnamed: 0,id,gender,articleType,usage
0,15970,Men,Shirts,Casual
1,39386,Men,Jeans,Casual
2,59263,Women,Watches,Casual
3,21379,Men,Track Pants,Casual
4,53759,Men,Tshirts,Casual


## Null values

In [6]:
# Summarise missing data in df: per-column flags, the affected column names,
# how many columns are affected, and the number of missing entries per column.
has_null = df.isna().any()
null_columns = df.columns[has_null].tolist()
null_counts = df.isna().sum()

print(f'General check for null values \n{has_null}\n')
print(f'Names of columns that contain null values: \n{null_columns}\n')
print(f'Number of columns containing missing values: \n{len(null_columns)}\n')
print(f'Number of missing values in each column: \n{null_counts}')

General check for null values 
id             False
gender         False
articleType    False
usage           True
dtype: bool

Names of columns that contain null values: 
['usage']

Number of columns containing missing values: 
1

Number of missing values in each column: 
id               0
gender           0
articleType      0
usage          317
dtype: int64


In [7]:
# Print each column that has missing values next to its missing-value count.
missingVal = df.columns[df.isna().any()]
# Count the missing entries once for just those columns, then print name/count pairs
# with the name left-aligned in a 20-character field.
missing_counts = df[missingVal].isna().sum()
for col_name, n_missing in missing_counts.items():
    print(f'{col_name:<20}{n_missing}')

usage               317


In [8]:
# Remove every row that has at least one missing value.
# inplace=True applies the change to df itself instead of returning a new frame.
shape_before = df.shape
print(shape_before)
df.dropna(axis=0, how='any', inplace=True)
remaining_nulls = df.isna().sum()
print(f'Number of missing values in each column: \n{remaining_nulls}')
print(df.shape)  # 44424 - 317 = 44107

(44424, 4)
Number of missing values in each column: 
id             0
gender         0
articleType    0
usage          0
dtype: int64
(44107, 4)


In [9]:
# Save the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv('processedDS.csv', index=False)

In [10]:
# Reload the processed file and inspect its shape and first rows.
df = pd.read_csv('processedDS.csv')
print(df.shape)
df.head()

(44107, 4)


Unnamed: 0,id,gender,articleType,usage
0,15970,Men,Shirts,Casual
1,39386,Men,Jeans,Casual
2,59263,Women,Watches,Casual
3,21379,Men,Track Pants,Casual
4,53759,Men,Tshirts,Casual


In [11]:
# Re-check the reloaded frame for missing values, column by column.
print(f'Number of missing values in each column: \n{df.isna().sum()}')

Number of missing values in each column: 
id             0
gender         0
articleType    0
usage          0
dtype: int64
