In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Basics

In [None]:
# Step 1: Import pandas
import pandas as pd

# Step 2: Load the raw dataset (adjust the path if the file lives elsewhere)
file_path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset.csv"
df = pd.read_csv(file_path)

# Step 3: First look - dimensions (rows, columns), a sample of rows, column labels
dataset_shape = df.shape
print("Shape of dataset:", dataset_shape)
print("\nFirst 5 rows:\n", df.head())
print("\nColumn names:", df.columns)

# Step 4: Number of missing values in each column
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# Step 5: Descriptive statistics as produced by DataFrame.describe()
print("\nSummary Stats:\n", df.describe())

# Step 6: How many distinct values the key descriptive columns hold
for label, column in (
    ("\nUnique brands:", 'brand'),
    ("Unique colours:", 'colour'),
    ("Unique product names:", 'name'),
):
    print(label, df[column].nunique())


Shape of dataset: (14330, 11)

First 5 rows:
    Unnamed: 0        p_id                                               name  \
0           0  17048614.0  Khushal K Women Black Ethnic Motifs Printed Ku...   
1           1  16524740.0  InWeave Women Orange Solid Kurta with Palazzos...   
2           2  16331376.0  Anubhutee Women Navy Blue Ethnic Motifs Embroi...   
3           3  14709966.0  Nayo Women Red Floral Printed Kurta With Trous...   
4           4  11056154.0   AHIKA Women Black & Green Printed Straight Kurta   

    price     colour      brand  \
0  5099.0      Black  Khushal K   
1  5899.0     Orange    InWeave   
2  4899.0  Navy Blue  Anubhutee   
3  3699.0        Red       Nayo   
4  1350.0      Black      AHIKA   

                                                 img  ratingCount  avg_rating  \
0  http://assets.myntassets.com/assets/images/170...       4522.0    4.418399   
1  http://assets.myntassets.com/assets/images/165...       1081.0    4.119334   
2  http://assets.my

Text preprocessing: lowercasing, stopword removal, stemming and lemmatization

In [None]:
import pandas as pd
import re
import nltk

# Download required nltk resources:
#   punkt / punkt_tab -> tokenizer data used by nltk.word_tokenize
#   wordnet           -> data for WordNetLemmatizer
#   stopwords         -> the English stopword list
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# These imports sit after the downloads, as in the original cell order.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Load dataset (re-reads the raw CSV from the same path as the exploration cell)
file_path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset.csv"
df = pd.read_csv(file_path)

# Initialize NLP tools. clean_text() below reads these three names as globals.
# The stopwords are held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def clean_text(text, do_stem=True, do_lemma=True):
    """Normalise one text value into a space-joined string of tokens.

    Steps, in order: lowercase, delete every character that is not a-z or
    whitespace, tokenize with ``nltk.word_tokenize``, drop English
    stopwords, then optionally stem and/or lemmatize each remaining token.

    Reads the module-level ``stop_words``, ``stemmer`` and ``lemmatizer``.

    Args:
        text: String to clean. A null value (per ``pd.isnull``) yields "".
        do_stem: When true, pass every token through ``stemmer.stem``.
        do_lemma: When true, pass every token through
            ``lemmatizer.lemmatize`` (after stemming if both are enabled).

    Returns:
        The processed tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""

    # Lowercase before filtering, otherwise the a-z class would delete capitals.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    tokens = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        if do_stem:
            token = stemmer.stem(token)
        if do_lemma:
            token = lemmatizer.lemmatize(token)
        tokens.append(token)
    return " ".join(tokens)

# Apply cleaning to textual columns.
# Only non-missing cells are cast to str. A blanket .astype(str) would turn NaN
# into the literal text "nan", which slips past clean_text's null check and is
# then kept as a word. Missing cells now become "" instead.
text_columns = ['name', 'colour', 'brand', 'description']
for col in text_columns:
    df[col] = df[col].map(
        lambda value: clean_text(value) if pd.isnull(value) else clean_text(str(value))
    )

# Preview processed data
print("✅ Preprocessing complete!")
print(df.head())

# Save cleaned dataset back to Drive (later cells read this file)
output_path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df.to_csv(output_path, index=False)
print(f"📂 Cleaned dataset saved to: {output_path}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Preprocessing complete!
   Unnamed: 0        p_id                                               name  \
0           0  17048614.0  khushal k woman black ethnic motif print kurta...   
1           1  16524740.0  inweav woman orang solid kurta palazzo floral ...   
2           2  16331376.0  anubhute woman navi blue ethnic motif embroid ...   
3           3  14709966.0  nayo woman red floral print kurta trouser dupatta   
4           4  11056154.0       ahika woman black green print straight kurta   

    price     colour      brand  \
0  5099.0      black  khushal k   
1  5899.0      orang     inweav   
2  4899.0  navi blue   anubhute   
3  3699.0        red       nayo   
4  1350.0      black      ahika   

                                                 img  ratingCount  avg_rating  \
0  http://assets.myntassets.com/assets/images/170...       4522.0    4.418399   
1  http://assets.myntassets.com/assets/images/165...       1081.0    4.119334   
2  http://assets.myntassets.com/assets/

Which words appear the most? (What do women wear?)

In [None]:
import pandas as pd
from collections import Counter

# Load the cleaned dataset
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df = pd.read_csv(path)

# Denominator for the row ratios; counts every row, including rows with no name.
total_rows = len(df)

# Tally words from the 'name' column:
#   all_words  -> every token, so repeats within one name are counted
#   row_counts -> number of names that contain the word at least once
# Missing names are treated as empty text. Casting NaN with astype(str) alone
# would inject the literal word "nan" into the ranking.
all_words = []
row_counts = Counter()
for text in df['name'].fillna("").astype(str):
    words = text.split()
    all_words.extend(words)
    row_counts.update(set(words))  # set(): one increment per row

# Word frequency across all rows
word_freq = Counter(all_words)

# Create dataframe for ranking
word_rank = pd.DataFrame({
    'word': list(word_freq.keys()),
    'total_count': list(word_freq.values()),          # total appearances
    'row_count': [row_counts[w] for w in word_freq]   # number of rows it appears in
})

# Share of rows containing the word, as a fraction and as a percentage
word_rank['row_ratio'] = word_rank['row_count'] / total_rows
word_rank['row_percentage'] = word_rank['row_ratio'] * 100

# Sort by total frequency, most common first
word_rank = word_rank.sort_values(by="total_count", ascending=False).reset_index(drop=True)

# 1-based rank, derived from the sorted position
word_rank['rank'] = word_rank.index + 1

# Reorder columns for clarity
word_rank = word_rank[['rank', 'word', 'total_count', 'row_count', 'row_ratio', 'row_percentage']]

# Show top 100
print(word_rank.head(100))

# Save ranked words to CSV
output_path = "/content/drive/MyDrive/Women's Apparel Business/Word_Rankings.csv"
word_rank.to_csv(output_path, index=False)
print(f"📂 Word rankings with percentages saved to: {output_path}")


    rank    word  total_count  row_count  row_ratio  row_percentage
0      1   woman         9206       9157   0.639009       63.900907
1      2   print         3367       3339   0.233008       23.300768
2      3    blue         3211       3197   0.223098       22.309839
3      4   solid         2565       2564   0.178925       17.892533
4      5   black         2254       2252   0.157153       15.715283
..   ...     ...          ...        ...        ...             ...
95    96    teal          207        207   0.014445        1.444522
96    97  thread          207        203   0.014166        1.416609
97    98  tailor          205        205   0.014306        1.430565
98    99  mitera          204        204   0.014236        1.423587
99   100    look          204        204   0.014236        1.423587

[100 rows x 6 columns]
📂 Word rankings with percentages saved to: /content/drive/MyDrive/Women's Apparel Business/Word_Rankings.csv


Which words occur frequently together

In [None]:
import pandas as pd
from collections import Counter
from itertools import combinations

# Read the cleaned dataset written by the preprocessing cell
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df = pd.read_csv(path)

total_rows = len(df)

# Count, for every unordered word pair, the number of product names containing both.
pair_counts = Counter()

for product_name in df['name'].astype(str):
    # A set counts each word once per row; "woman" is left out of the analysis.
    distinct_words = set(product_name.split()) - {"woman"}
    # Sorting first gives each pair one canonical (alphabetical) orientation.
    pair_counts.update(combinations(sorted(distinct_words), 2))

# One row per pair, with the two words already in separate columns
pair_df = pd.DataFrame(
    [(first, second, n_rows) for (first, second), n_rows in pair_counts.items()],
    columns=['word1', 'word2', 'row_count'],
)

# Share of all rows in which the pair occurs
pair_df['row_ratio'] = pair_df['row_count'] / total_rows
pair_df['row_percentage'] = pair_df['row_ratio'] * 100

# Most frequent pairs first
pair_df = pair_df.sort_values(by='row_count', ascending=False).reset_index(drop=True)

# Show top 20 co-occurring word pairs
print(pair_df.head(20))

# Save results
output_path = "/content/drive/MyDrive/Women's Apparel Business/Word_CoOccurrences.csv"
pair_df.to_csv(output_path, index=False)
print(f"📂 Co-occurring word pairs with row percentages saved to: {output_path}")


      word1       word2  row_count  row_ratio  row_percentage
0    cotton        pure       1007   0.070272        7.027216
1      blue        navi        900   0.062805        6.280530
2     blous     lehenga        759   0.052966        5.296581
3      blue       print        757   0.052826        5.282624
4     print       white        748   0.052198        5.219819
5   dupatta     lehenga        745   0.051989        5.198883
6    ethnic       motif        710   0.049546        4.954641
7     blous     dupatta        702   0.048988        4.898814
8   dupatta     embroid        680   0.047453        4.745290
9       fit        jean        677   0.047244        4.724355
10    dress      materi        632   0.044103        4.410328
11     blue        jean        632   0.044103        4.410328
12   floral       print        598   0.041731        4.173064
13     blue         fit        587   0.040963        4.096301
14    print         top        575   0.040126        4.012561
15  lehe

Three words occurring together

In [None]:
import pandas as pd
from collections import Counter
from itertools import combinations

# Read the cleaned dataset written by the preprocessing cell
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df = pd.read_csv(path)

total_rows = len(df)

# Count, for every unordered word triple, the number of product names containing
# all three words. These are co-occurrence sets, not adjacent-word trigrams.
trigram_counts = Counter()

for product_name in df['name'].astype(str):
    # A set counts each word once per row; "woman" is left out of the analysis.
    distinct_words = set(product_name.split()) - {"woman"}
    # Sorting first gives each triple one canonical (alphabetical) ordering.
    trigram_counts.update(combinations(sorted(distinct_words), 3))

# One row per triple, with the three words already in separate columns
trigram_df = pd.DataFrame(
    [(*triple, n_rows) for triple, n_rows in trigram_counts.items()],
    columns=['word1', 'word2', 'word3', 'row_count'],
)

# Share of all rows in which the triple occurs
trigram_df['row_ratio'] = trigram_df['row_count'] / total_rows
trigram_df['row_percentage'] = trigram_df['row_ratio'] * 100

# Most frequent triples first
trigram_df = trigram_df.sort_values(by='row_count', ascending=False).reset_index(drop=True)

# Show top 20 trigrams
print(trigram_df.head(20))

# Save results
output_path = "/content/drive/MyDrive/Women's Apparel Business/Word_Trigrams.csv"
trigram_df.to_csv(output_path, index=False)
print(f"📂 Trigrams with row percentages saved to: {output_path}")


      word1       word2       word3  row_count  row_ratio  row_percentage
0     blous     dupatta     lehenga        700   0.048849        4.884857
1     dress      materi    unstitch        555   0.038730        3.872994
2     blous     lehenga    unstitch        554   0.038660        3.866015
3   dupatta     lehenga    unstitch        536   0.037404        3.740405
4     blous     dupatta    unstitch        521   0.036357        3.635729
5     blous     lehenga  semistitch        507   0.035380        3.538032
6   dupatta     lehenga  semistitch        491   0.034264        3.426378
7     blous     dupatta  semistitch        476   0.033217        3.321703
8   lehenga  semistitch    unstitch        469   0.032729        3.272854
9     blous  semistitch    unstitch        457   0.031891        3.189114
10     blue         fit        jean        434   0.030286        3.028611
11  dupatta  semistitch    unstitch        434   0.030286        3.028611
12   cotton       print        pure   

To identify clothes, print the 500 most frequently occurring words

In [None]:
import pandas as pd
from collections import Counter
import re

# Read the cleaned dataset
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df = pd.read_csv(path)

# Gather every word token from the non-missing product names
word_pattern = re.compile(r'\b\w+\b')
product_names = df['name'].dropna().astype(str)
all_words = [
    token
    for product_name in product_names
    for token in word_pattern.findall(product_name.lower())
]

# Frequency of each word over the whole column
word_counts = Counter(all_words)

# Keep only the words (not the counts) of the 500 most common entries
top_500_words = [word for word, _ in word_counts.most_common(500)]

# Print the whole list (at most 500 entries) for manual inspection
print(top_500_words[:500])

# Nothing is written to disk; the list stays in the `top_500_words` variable
print("\nTop 500 words collected into 'top_500_words' list.")


['woman', 'print', 'blue', 'solid', 'black', 'dupatta', 'white', 'green', 'top', 'pink', 'cotton', 'embroid', 'fit', 'pure', 'trouser', 'red', 'skirt', 'unstitch', 'jean', 'sare', 'floral', 'jacket', 'lehenga', 'kurta', 'ethnic', 'navi', 'yellow', 'blous', 'design', 'grey', 'jumpsuit', 'motif', 'crop', 'dress', 'highris', 'palazzo', 'regular', 'beig', 'semistitch', 'materi', 'goldton', 'flare', 'silk', 'maroon', 'short', 'sweatshirt', 'stripe', 'basic', 'woven', 'stretchabl', 'orang', 'straight', 'alin', 'brown', 'shrug', 'denim', 'embellish', 'mustard', 'purpl', 'work', 'roadster', 'girl', 'sassafra', 'wear', 'lifestyl', 'skinni', 'peachcolour', 'pleat', 'readi', 'sequin', 'slim', 'kurti', 'zari', 'tokyo', 'talki', 'fade', 'hood', 'set', 'longlin', 'mango', 'oliv', 'style', 'creation', 'clora', 'urban', 'pullov', 'neck', 'maxi', 'blend', 'light', 'golden', 'front', 'coord', 'colourblock', 'thread', 'teal', 'fashion', 'tailor', 'look', 'mitera', 'sleev', 'hm', 'clean', 'check', 'mini',

Reduce each product name to its apparel words only

In [None]:
import pandas as pd
import re

# Load dataset (the cleaned CSV written by the preprocessing cell)
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset Cleaned.csv"
df = pd.read_csv(path)

# Example list of apparel-related words (replace with your finalized list of
# top 500 / cleaned keywords). Entries are compared against tokens of the
# cleaned 'name' column, so they use the cleaned spellings ("blous", "culott").
# NOTE(review): "leggings" is not in cleaned form and may never match - verify.
# The duplicated entries ("jumpsuit", "poncho") are harmless: lookups use a set.
apparel_keywords = [
    "dupatta", "top", "trouser", "skirt", "jean", "sare", "jacket",
    "lehenga", "kurta", "blous", "jumpsuit", "dress", "palazzo", "kurti",
    "set", "maxi", "tunic", "culott", "choli", "shirt", "sweater",
    "blazer", "cardigan", "shawl", "wrap", "playsuit", "pencil", "bomber",
    "salwar", "sharara", "anarkali", "tshirt", "pant", "smock", "coat",
    "short", "shrug", "sweatshirt", "leggings", "gown", "poncho", "kaftan",
    "dhoti", "vest", "tank", "jogger", "hood", "nightwear", "pyjama",
    "jumpsuit", "blouson", "outfitt", "poncho",
]

# Set membership is what extract_apparel() tests against
apparel_set = set(apparel_keywords)


def extract_apparel(text):
    """Return the apparel keywords found in ``text``, in order of appearance.

    The text is lowercased and split into ``\\w+`` tokens. Tokens present in
    the module-level ``apparel_set`` are kept (repeats included) and joined
    with single spaces.

    Args:
        text: Product name string; a missing value (per ``pd.isna``) gives "".

    Returns:
        Space-joined apparel tokens, or "" when none match.
    """
    if pd.isna(text):
        return ""
    matches = (found.group() for found in re.finditer(r'\b\w+\b', text.lower()))
    return " ".join(token for token in matches if token in apparel_set)

# Apply function to create new column.
# Names without any keyword get "" here. Once written to CSV, that empty field
# is read back by pd.read_csv as NaN, so downstream cells see missing values.
df["apparel_name"] = df["name"].apply(extract_apparel)

# Save updated dataset (a new file; the cleaned CSV is left untouched)
output_path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df.to_csv(output_path, index=False)

print("✅ New column 'apparel_name' created and dataset saved at:", output_path)


✅ New column 'apparel_name' created and dataset saved at: /content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv


Now the Clothes and their frequency

In [None]:
import pandas as pd
from collections import Counter
from nltk.util import ngrams
import nltk

# Download tokenizer data (a no-op when already present). 'punkt_tab' is fetched
# too, as the preprocessing cell does, so nltk.word_tokenize also works when
# this cell is the first one to need it.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load dataset
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df = pd.read_csv(path)

# Ensure apparel_name column exists
if 'apparel_name' not in df.columns:
    raise ValueError("The 'apparel_name' column is missing. Please create it first.")

# Tokenize the apparel_name column.
# Rows with no apparel keyword were saved as empty strings, which read_csv
# loads back as NaN. Replace them with "" first: astype(str) alone turns each
# NaN into the token "nan", which then ranks as if it were a garment.
df['tokens'] = (
    df['apparel_name']
    .fillna("")
    .astype(str)
    .apply(lambda x: nltk.word_tokenize(x.lower()))
)

# Flatten all tokens
all_tokens = [token for tokens in df['tokens'] for token in tokens]


def _frequency_table(counts, label, n_rows):
    """Turn an n-gram Counter into a table sorted by descending frequency.

    Keys may be plain strings (unigrams) or tuples of tokens, which are joined
    with spaces. 'Row_Percentage' is Frequency / n_rows * 100; it counts
    occurrences, so an n-gram repeated within one row is counted each time.
    """
    table = pd.DataFrame(
        [
            (gram if isinstance(gram, str) else ' '.join(gram), freq)
            for gram, freq in counts.items()
        ],
        columns=[label, 'Frequency'],
    )
    table['Row_Percentage'] = table['Frequency'] / n_rows * 100
    return table.sort_values(by='Frequency', ascending=False)


# Number of rows shown per table; the printed headings are derived from it.
TOP_N = 50

# ---- UNIGRAMS ----
unigram_counts = Counter(all_tokens)
unigram_df = _frequency_table(unigram_counts, 'Unigram', len(df))

# ---- BIGRAMS ---- (adjacent token pairs within one row)
all_bigrams = [bigram for tokens in df['tokens'] for bigram in ngrams(tokens, 2)]
bigram_counts = Counter(all_bigrams)
bigram_df = _frequency_table(bigram_counts, 'Bigram', len(df))

# ---- TRIGRAMS ---- (adjacent token triples within one row)
all_trigrams = [trigram for tokens in df['tokens'] for trigram in ngrams(tokens, 3)]
trigram_counts = Counter(all_trigrams)
trigram_df = _frequency_table(trigram_counts, 'Trigram', len(df))

# Display top results
print(f"Top {TOP_N} Unigrams:")
print(unigram_df.head(TOP_N))
print(f"\nTop {TOP_N} Bigrams:")
print(bigram_df.head(TOP_N))
print(f"\nTop {TOP_N} Trigrams:")
print(trigram_df.head(TOP_N))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 20 Unigrams:
       Unigram  Frequency  Row_Percentage
2      dupatta       2113       14.745290
6          top       1694       11.821354
3      trouser       1213        8.464759
7        skirt       1145        7.990230
31        jean       1058        7.383112
45        sare       1041        7.264480
14      jacket        985        6.873692
35     lehenga        982        6.852756
0        kurta        926        6.461968
24       blous        885        6.175855
37    jumpsuit        743        5.184927
30       dress        696        4.856943
1      palazzo        676        4.717376
29       short        571        3.984648
34  sweatshirt        546        3.810188
16         nan        418        2.916957
43       shrug        398        2.777390
9        kurti        294        2.051640
28        hood        284        1.981856
5          set        281        1.960921
19        maxi        228        1.591068
44       tunic        186        1.297976
36      culott   

Colour Frequency and Total Rating Count

In [None]:
import pandas as pd

# Dataset that already carries the 'apparel_name' column
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"

# Load it and keep only rows that have both a colour and a ratingCount.
# Consequence: the colour frequencies below count rated products only.
df = pd.read_csv(path).dropna(subset=["colour", "ratingCount"])

# Force ratingCount to numeric; values that cannot be parsed count as 0 reviews
df["ratingCount"] = pd.to_numeric(df["ratingCount"], errors="coerce").fillna(0)

# Q1: number of products per colour
colour_frequency = (
    df["colour"]
    .value_counts()
    .rename_axis("colour")
    .reset_index(name="frequency")
)

# Q2: summed ratingCount per colour
colour_reviews = (
    df.groupby("colour")["ratingCount"]
    .sum()
    .reset_index(name="total_reviews")
)

# One table with both measures, ordered by frequency and then by reviews (descending)
colour_stats = colour_frequency.merge(colour_reviews, on="colour")
colour_stats = colour_stats.sort_values(by=["frequency", "total_reviews"], ascending=False)

# Print every colour, without the index column
print("Colour Frequency & Review Count (All Colours):")
print(colour_stats.to_string(index=False))


Colour Frequency & Review Count (All Colours):
        colour  frequency  total_reviews
         black        958       253745.0
          blue        824       135203.0
          pink        526        96327.0
         white        522       101298.0
         green        480        81098.0
     navi blue        446        85748.0
           red        328        53785.0
        maroon        253        47239.0
          grey        252        25043.0
          beig        188        21172.0
       mustard        180        41630.0
        yellow        173        45862.0
          oliv        145        30186.0
         peach        127        18962.0
         purpl        123        13528.0
         orang        111         9188.0
         brown         96        15560.0
      burgundi         90        26770.0
          teal         89        16836.0
         multi         62         8072.0
          rust         59         7745.0
         cream         57         6793.0
     sea g

Attributes breakdown: lowercasing the attribute keys and values

In [None]:
# Toy example of the dictionary normalisation that is applied to the product
# attributes in the next cell.
attributes = {"Name": "Deepankar", "AGE": 21, "CITY": "Delhi"}

# Variant 1: lowercase the keys only; values are left untouched.
attributes1 = dict((key.lower(), value) for key, value in attributes.items())
print(attributes1)

# Variant 2: lowercase the keys and the string values. Non-string values (here
# the integer age) pass through unchanged, because calling .lower() on them
# would raise AttributeError.
attributes2 = dict(
    (key.lower(), value.lower() if isinstance(value, str) else value)
    for key, value in attributes.items()
)
print(attributes2)


{'name': 'Deepankar', 'age': 21, 'city': 'Delhi'}
{'name': 'deepankar', 'age': 21, 'city': 'delhi'}


lowercase the attributes

In [None]:
import pandas as pd
import ast

# File path (this cell both reads from and later writes back to this file)
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"

# Load CSV
df = pd.read_csv(path)

# Function to lowercase dict keys & values
def lowercase_dict(attr_str):
    """Parse an attribute mapping and lowercase its keys and values.

    Args:
        attr_str: A dict, or a string holding a Python dict literal (parsed
            with ``ast.literal_eval``).

    Returns:
        A new dict in which every key and every value has been converted with
        ``str(...).lower()``, so non-string values become strings too. An
        empty dict is returned when the input is not (or does not parse to)
        a dict, or when any error occurs along the way.
    """
    try:
        parsed = attr_str
        if isinstance(attr_str, str):
            parsed = ast.literal_eval(attr_str)
        if not isinstance(parsed, dict):
            return {}
        lowered = {}
        for key, value in parsed.items():
            lowered[str(key).lower()] = str(value).lower()
        return lowered
    except Exception:
        # Best effort by design: a row that cannot be parsed yields no attributes.
        return {}

# Apply and create new column (p_attributes itself is left as it was)
df['attributes'] = df['p_attributes'].apply(lowercase_dict)

# Save back to same CSV: this overwrites the input file in place. The dicts are
# written as their text representation, so later cells parse them again with
# ast.literal_eval after loading.
df.to_csv(path, index=False)

print("✅ File updated successfully with new 'attributes' column and saved at:", path)


✅ File updated successfully with new 'attributes' column and saved at: /content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv


Ranking the attributes

In [None]:
import pandas as pd
import ast

# Load dataset
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df = pd.read_csv(path)

# Convert attributes column to lowercase safely.
# This lowercases the serialised dict text as a whole, not parsed values.
# astype(str) turns a missing cell into the text "nan", which parse_dict below
# fails to evaluate and therefore maps to {}.
df['attributes'] = df['attributes'].astype(str).str.lower()

# Function to safely parse dictionary strings
def parse_dict(attr_str):
    """Parse a string holding a Python dict literal, always returning a dict.

    Args:
        attr_str: Text such as ``"{'occasion': 'casual'}"``.

    Returns:
        The parsed dict. An empty dict is returned when the text cannot be
        evaluated as a literal, or when it evaluates to something other than
        a dict (for example "5" or "['a']"). Callers iterate over ``.keys()``,
        so a non-dict result would otherwise fail later with AttributeError.
    """
    try:
        parsed = ast.literal_eval(attr_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval reports malformed input with any of these exceptions.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Turn each row's attribute text into a dict
df['attributes_dict'] = df['attributes'].apply(parse_dict)

# Flatten the keys of every row's dict into one list
all_keys = [key for attrs in df['attributes_dict'] for key in attrs.keys()]

# How often each key occurs, most frequent first
attribute_freq = (
    pd.Series(all_keys)
    .value_counts()
    .rename_axis('attribute_key')
    .reset_index(name='count')
)

# Share of all rows (not of all keys) that carry the key
total_rows = len(df)
attribute_freq['percentage'] = (attribute_freq['count'] / total_rows) * 100

# Display the 50 most frequent attribute keys
print(attribute_freq.head(50))

# Persist the full ranking
output_path = "/content/drive/MyDrive/Women's Apparel Business/Attribute_Key_Frequency.csv"
attribute_freq.to_csv(output_path, index=False)

print(f"\nAttribute key frequency ranking saved to: {output_path}")


            attribute_key  count  percentage
0                occasion  13213   92.205164
1               wash care  13103   91.437544
2             sustainable  13010   90.788555
3    body or garment size  10959   76.475925
4                 pattern  10506   73.314724
5                  fabric  10426   72.756455
6   print or pattern type  10037   70.041870
7                 closure   8528   59.511514
8       number of pockets   8443   58.918353
9                  length   8287   57.829728
10             main trend   7901   55.136078
11          sleeve length   7807   54.480112
12                   type   7636   53.286811
13              character   6869   47.934403
14        surface styling   6322   44.117237
15                   neck   6321   44.110258
16          body shape id   6286   43.866015
17             weave type   5351   37.341242
18                add-ons   5173   36.099093
19                hemline   5048   35.226797
20          ornamentation   4734   33.035590
21        

VERY IMPORTANT CODE BELOW

For each key, find the value with the maximum occurrence

pattern

In [None]:
import pandas as pd
import ast

# Example: assume df has columns ['attributes', 'ratingCount', 'avg_rating']
# NOTE: this cell does not load `df`; it uses the one left by an earlier cell.

# If attributes are strings like "{'key': 'value', ...}", safely convert to dict
# Non-string cells (already-parsed dicts) pass through unchanged, so re-running
# is harmless. A malformed string makes ast.literal_eval raise; not caught here.
df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def analyze_key(df, key):
    """Summarise products by the value they hold for one attribute key.

    Side effect (unchanged from before): the per-row value is written to
    ``df['attr_value']`` on the caller's frame.

    Args:
        df: DataFrame with an 'attributes' column of dicts plus the numeric
            columns 'ratingCount' and 'avg_rating'.
        key: Attribute name to look up in each row's dict.

    Returns:
        DataFrame with columns attr_value, frequency (number of rows),
        total_ratingCount (sum) and avg_of_avgRating (mean), sorted by
        total_ratingCount in descending order. Rows without the key are left
        out, because groupby drops null group keys by default.
    """
    # A row whose 'attributes' cell is not a dict (e.g. NaN) is treated as
    # lacking the key, instead of raising AttributeError on .get().
    df['attr_value'] = df['attributes'].apply(
        lambda attrs: attrs.get(key) if isinstance(attrs, dict) else None
    )

    # Aggregate per attribute value, then rank by total_ratingCount
    result = (
        df.groupby('attr_value')
        .agg(
            frequency=('attr_value', 'count'),
            total_ratingCount=('ratingCount', 'sum'),
            avg_of_avgRating=('avg_rating', 'mean'),
        )
        .reset_index()
        .sort_values(by='total_ratingCount', ascending=False)
        .reset_index(drop=True)
    )

    return result

# Example usage: rank the values of the 'pattern' attribute by total rating count
result = analyze_key(df, 'pattern')
print(result)

       attr_value  frequency  total_ratingCount  avg_of_avgRating
0           solid       4568           377585.0          4.113941
1         printed       2970           325589.0          4.108126
2    woven design        560            28523.0          4.173461
3         striped        477            27382.0          4.103750
4     self design        491            26606.0          4.123608
5     embroidered        698            26187.0          4.128647
6         checked        198            15881.0          4.114544
7     yoke design         48            12212.0          4.145740
8     embellished        165            10518.0          3.977014
9   colourblocked        192             5825.0          4.214125
10         washed         28             3830.0          4.298065
11           dyed         60             2991.0          4.029270
12         ribbed         27             1666.0          4.308060
13     hem design         21              534.0          3.979945
14        

fabric

In [None]:
import pandas as pd
import ast

# Example: assume df has columns ['attributes', 'ratingCount', 'avg_rating']
# NOTE: this cell does not load `df`; it uses the one left by an earlier cell.

# If attributes are strings like "{'key': 'value', ...}", safely convert to dict
# Non-string cells (already-parsed dicts) pass through unchanged, so re-running
# is harmless. A malformed string makes ast.literal_eval raise; not caught here.
df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def analyze_key(df, key):
    """Summarise products by the value they hold for one attribute key.

    Side effect (unchanged from before): the per-row value is written to
    ``df['attr_value']`` on the caller's frame.

    Args:
        df: DataFrame with an 'attributes' column of dicts plus the numeric
            columns 'ratingCount' and 'avg_rating'.
        key: Attribute name to look up in each row's dict.

    Returns:
        DataFrame with columns attr_value, frequency (number of rows),
        total_ratingCount (sum) and avg_of_avgRating (mean), sorted by
        total_ratingCount in descending order. Rows without the key are left
        out, because groupby drops null group keys by default.
    """
    # A row whose 'attributes' cell is not a dict (e.g. NaN) is treated as
    # lacking the key, instead of raising AttributeError on .get().
    df['attr_value'] = df['attributes'].apply(
        lambda attrs: attrs.get(key) if isinstance(attrs, dict) else None
    )

    # Aggregate per attribute value, then rank by total_ratingCount
    result = (
        df.groupby('attr_value')
        .agg(
            frequency=('attr_value', 'count'),
            total_ratingCount=('ratingCount', 'sum'),
            avg_of_avgRating=('avg_rating', 'mean'),
        )
        .reset_index()
        .sort_values(by='total_ratingCount', ascending=False)
        .reset_index(drop=True)
    )

    return result

# Example usage: rank the values of the 'fabric' attribute by total rating count
result = analyze_key(df, 'fabric')
print(result)

             attr_value  frequency  total_ratingCount  avg_of_avgRating
0                cotton       3270           383357.0          4.110177
1             polyester       2666           189023.0          4.111610
2         viscose rayon       1356            85925.0          4.117371
3           pure cotton        634            51933.0          4.136144
4          cotton blend        509            33490.0          4.146642
5               acrylic        276            13041.0          4.265408
6          poly chiffon        138             4778.0          4.068204
7                fleece        105             3912.0          3.882179
8            silk blend        223             2976.0          4.180634
9              art silk         73             2334.0          4.212370
10                nylon        156             2300.0          4.113502
11                 liva         23             1850.0          4.289823
12        chanderi silk         11             1840.0          4

occasion

In [None]:
import pandas as pd
import ast

# Example: assume df has columns ['attributes', 'ratingCount', 'avg_rating']
# NOTE: this cell does not load `df`; it uses the one left by an earlier cell.

# If attributes are strings like "{'key': 'value', ...}", safely convert to dict
# Non-string cells (already-parsed dicts) pass through unchanged, so re-running
# is harmless. A malformed string makes ast.literal_eval raise; not caught here.
df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def analyze_key(df, key):
    """Summarise products by the value they hold for one attribute key.

    Side effect (unchanged from before): the per-row value is written to
    ``df['attr_value']`` on the caller's frame.

    Args:
        df: DataFrame with an 'attributes' column of dicts plus the numeric
            columns 'ratingCount' and 'avg_rating'.
        key: Attribute name to look up in each row's dict.

    Returns:
        DataFrame with columns attr_value, frequency (number of rows),
        total_ratingCount (sum) and avg_of_avgRating (mean), sorted by
        total_ratingCount in descending order. Rows without the key are left
        out, because groupby drops null group keys by default.
    """
    # A row whose 'attributes' cell is not a dict (e.g. NaN) is treated as
    # lacking the key, instead of raising AttributeError on .get().
    df['attr_value'] = df['attributes'].apply(
        lambda attrs: attrs.get(key) if isinstance(attrs, dict) else None
    )

    # Aggregate per attribute value, then rank by total_ratingCount
    result = (
        df.groupby('attr_value')
        .agg(
            frequency=('attr_value', 'count'),
            total_ratingCount=('ratingCount', 'sum'),
            avg_of_avgRating=('avg_rating', 'mean'),
        )
        .reset_index()
        .sort_values(by='total_ratingCount', ascending=False)
        .reset_index(drop=True)
    )

    return result

# Example usage: rank the values of the 'occasion' attribute by total rating count
result = analyze_key(df, 'occasion')
print(result)

     attr_value  frequency  total_ratingCount  avg_of_avgRating
0        casual       7533           620713.0          4.106727
1         daily       1637           284039.0          4.140141
2       festive       1131           106656.0          4.109429
3        ethnic       1072            78269.0          4.066478
4   traditional        312            32627.0          4.027659
5        fusion         82            23407.0          4.127720
6         party        842            21860.0          4.096987
7        formal        101             8893.0          4.062832
8       western        218             3854.0          4.105223
9        sports        192             3774.0          4.281164
10         work         54              974.0          4.208229
11    maternity         16              249.0          3.946031
12      outdoor         23              162.0          4.042164


print or pattern type

In [None]:
import pandas as pd
import ast

# Load dataset (reloaded from disk, so this cell does not rely on an earlier `df`)
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df = pd.read_csv(path)

# If attributes are strings like "{'key': 'value', ...}", safely convert to dict
# Non-string cells pass through unchanged. A malformed string makes
# ast.literal_eval raise; that is not caught here.
df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def analyze_key(df, key):
    """Summarise ratings for every distinct value of one attribute key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'attributes' column plus 'ratingCount' and
        'avg_rating' columns. Each 'attributes' cell is expected to be a
        dict; any other value (e.g. NaN for a missing cell) is treated as
        having no value for `key`.
    key : str
        Attribute name to look up in each row's 'attributes' dict.

    Returns
    -------
    pandas.DataFrame
        One row per distinct attribute value, with columns 'attr_value',
        'frequency' (number of rows), 'total_ratingCount' (sum of
        'ratingCount') and 'avg_of_avgRating' (mean of 'avg_rating'),
        sorted by 'total_ratingCount' descending with a fresh 0..n-1 index.
        Rows with no value for `key` are left out, because groupby drops
        null group keys by default.
    """
    # Non-dict cells have no .get(); map them to None instead of raising
    # AttributeError.
    attr_value = df['attributes'].apply(
        lambda attrs: attrs.get(key, None) if isinstance(attrs, dict) else None
    )

    # Group on a copy that carries the extracted values, so the caller's
    # frame does not gain a scratch 'attr_value' column as a side effect.
    result = df.assign(attr_value=attr_value).groupby('attr_value').agg(
        frequency=('attr_value', 'count'),
        total_ratingCount=('ratingCount', 'sum'),
        avg_of_avgRating=('avg_rating', 'mean')
    ).reset_index()

    # Rank by total_ratingCount, highest first
    result = result.sort_values(by='total_ratingCount', ascending=False).reset_index(drop=True)

    return result

# Rank the values of the 'print or pattern type' attribute by total rating count and print the table
result = analyze_key(df, key='print or pattern type')
print(result)

            attr_value  frequency  total_ratingCount  avg_of_avgRating
0                solid       3928           359650.0          4.117127
1        ethnic motifs       1334           212609.0          4.139446
2               floral       1691           185149.0          4.089721
3            geometric        524            60360.0          4.142064
4              striped        388            18857.0          4.120842
5          self design        195            15571.0          4.112123
6              checked        182            14068.0          4.137189
7         woven design        164            12303.0          4.083014
8               washed         93            12047.0          4.380734
9          embellished        126            11090.0          4.088271
10            abstract        249             9937.0          4.150147
11            bandhani        113             8030.0          4.047461
12          typography        137             7031.0          4.285569
13    

type

In [None]:
import pandas as pd
import ast

# Read the apparel dataset from the mounted Drive folder
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df = pd.read_csv(path)

# Parse stringified dicts such as "{'key': 'value', ...}" into real dicts;
# cells that are not strings are passed through unchanged.
df['attributes'] = df['attributes'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)

def analyze_key(df, key):
    """Summarise ratings for every distinct value of one attribute key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'attributes' column plus 'ratingCount' and
        'avg_rating' columns. Each 'attributes' cell is expected to be a
        dict; any other value (e.g. NaN for a missing cell) is treated as
        having no value for `key`.
    key : str
        Attribute name to look up in each row's 'attributes' dict.

    Returns
    -------
    pandas.DataFrame
        One row per distinct attribute value, with columns 'attr_value',
        'frequency' (number of rows), 'total_ratingCount' (sum of
        'ratingCount') and 'avg_of_avgRating' (mean of 'avg_rating'),
        sorted by 'total_ratingCount' descending with a fresh 0..n-1 index.
        Rows with no value for `key` are left out, because groupby drops
        null group keys by default.
    """
    # Non-dict cells have no .get(); map them to None instead of raising
    # AttributeError.
    attr_value = df['attributes'].apply(
        lambda attrs: attrs.get(key, None) if isinstance(attrs, dict) else None
    )

    # Group on a copy that carries the extracted values, so the caller's
    # frame does not gain a scratch 'attr_value' column as a side effect.
    result = df.assign(attr_value=attr_value).groupby('attr_value').agg(
        frequency=('attr_value', 'count'),
        total_ratingCount=('ratingCount', 'sum'),
        avg_of_avgRating=('avg_rating', 'mean')
    ).reset_index()

    # Rank by total_ratingCount, highest first
    result = result.sort_values(by='total_ratingCount', ascending=False).reset_index(drop=True)

    return result

# Lift pandas' display limits so the whole table is printed
# rather than an elided range like 1,2,3,4....96,97,98,99,100
for _display_option in (
    "display.max_rows",      # show all rows
    "display.max_columns",   # show all columns
    "display.width",         # don't wrap columns
    "display.max_colwidth",  # show long strings fully
):
    pd.set_option(_display_option, None)


# Rank the values of the 'type' attribute by total rating count and print the table
result = analyze_key(df, key='type')
print(result)

               attr_value  frequency  total_ratingCount  avg_of_avgRating
0                 regular        706           127957.0          4.149995
1                  a-line        474            44046.0          4.099310
2                pullover        705            38328.0          4.204729
3        regular trousers        466            34485.0          3.985318
4                      na        554            32468.0          3.994579
5          basic jumpsuit        570            31240.0          4.103201
6             shirt style         93            28004.0          4.133039
7              front-open        125            24701.0          4.074673
8                  flared        361            18856.0          4.097541
9            denim jacket        206            18496.0          4.325032
10      parallel trousers        207            16961.0          4.098146
11                 fitted        127            15083.0          4.164102
12                 peplum         64  

character

In [None]:
import pandas as pd
import ast

# Read the apparel dataset from the mounted Drive folder
path = "/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv"
df = pd.read_csv(path)

# Parse stringified dicts such as "{'key': 'value', ...}" into real dicts;
# cells that are not strings are passed through unchanged.
df['attributes'] = df['attributes'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)

def analyze_key(df, key):
    """Summarise ratings for every distinct value of one attribute key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'attributes' column plus 'ratingCount' and
        'avg_rating' columns. Each 'attributes' cell is expected to be a
        dict; any other value (e.g. NaN for a missing cell) is treated as
        having no value for `key`.
    key : str
        Attribute name to look up in each row's 'attributes' dict.

    Returns
    -------
    pandas.DataFrame
        One row per distinct attribute value, with columns 'attr_value',
        'frequency' (number of rows), 'total_ratingCount' (sum of
        'ratingCount') and 'avg_of_avgRating' (mean of 'avg_rating'),
        sorted by 'total_ratingCount' descending with a fresh 0..n-1 index.
        Rows with no value for `key` are left out, because groupby drops
        null group keys by default.
    """
    # Non-dict cells have no .get(); map them to None instead of raising
    # AttributeError.
    attr_value = df['attributes'].apply(
        lambda attrs: attrs.get(key, None) if isinstance(attrs, dict) else None
    )

    # Group on a copy that carries the extracted values, so the caller's
    # frame does not gain a scratch 'attr_value' column as a side effect.
    result = df.assign(attr_value=attr_value).groupby('attr_value').agg(
        frequency=('attr_value', 'count'),
        total_ratingCount=('ratingCount', 'sum'),
        avg_of_avgRating=('avg_rating', 'mean')
    ).reset_index()

    # Rank by total_ratingCount, highest first
    result = result.sort_values(by='total_ratingCount', ascending=False).reset_index(drop=True)

    return result

# Lift pandas' display limits so the whole table is printed
# rather than an elided range like 1,2,3,4....96,97,98,99,100
for _display_option in (
    "display.max_rows",      # show all rows
    "display.max_columns",   # show all columns
    "display.width",         # don't wrap columns
    "display.max_colwidth",  # show long strings fully
):
    pd.set_option(_display_option, None)


# Rank the values of the 'character' attribute by total rating count and print the table
result = analyze_key(df, key='character')
print(result)

          attr_value  frequency  total_ratingCount  avg_of_avgRating
0                 na       6815           532379.0          4.119709
1        donald duck          7               78.0          4.266984
2               nemo          1               68.0          4.235294
3       minnie mouse          3               66.0          4.453011
4             marvel          3               63.0          4.237399
5       mickey mouse          4               61.0          4.424110
6    powerpuff girls          5               57.0          3.879092
7            friends          1               25.0          4.360000
8             dexter          1               23.0          4.652174
9      kung fu panda          1               19.0          4.157895
10      wonder woman          2                8.0          4.375000
11      looney tunes          1                8.0          4.625000
12         lion king          1                6.0          4.333333
13              nasa          6   

Base Dataset Creation

In [None]:
import pandas as pd
import ast

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/Women's Apparel Business/Fashion Dataset With Apparel.csv")

# Convert stringified dictionaries into actual Python dicts.
# Non-string cells (e.g. NaN for a missing value) are passed through instead
# of being handed to literal_eval, which only accepts strings/AST nodes.
df["attributes"] = df["attributes"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Rows without a usable dict contribute no keys and get None in every
# expanded column.
attr_dicts = [d if isinstance(d, dict) else {} for d in df["attributes"]]

# Step 1: Collect all unique keys across all rows
all_keys = set()
for d in attr_dicts:
    all_keys.update(d.keys())

print(f"Found {len(all_keys)} unique attribute keys")

# Step 2: Build one column per key, all at once.
# - Sorting the keys gives the saved CSV a reproducible column order; iterating
#   the set directly does not guarantee the same order from run to run.
# - Building a single frame and concatenating it avoids inserting ~100 columns
#   one by one, which fragments the DataFrame and makes pandas warn repeatedly.
expanded = pd.DataFrame(
    {key: [d.get(key, None) for d in attr_dicts] for key in sorted(all_keys, key=str)},
    index=df.index,
)

# A key that matches an existing column replaces that column in place,
# exactly as a plain `df[key] = ...` assignment would.
clashing = [key for key in expanded.columns if key in df.columns]
for key in clashing:
    df[key] = expanded[key]
df = pd.concat([df, expanded.drop(columns=clashing)], axis=1)

# ✅ Keep the original 'attributes' column for cross-checking

# Step 3: Save expanded dataset
save_path = "/content/drive/MyDrive/Women's Apparel Business/expanded_dataset.csv"
df.to_csv(save_path, index=False)

print(f"✅ Expanded dataset saved at: {save_path}")


Found 106 unique attribute keys


  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))
  df[key] = df["attributes"].apply(lambda x: x.get(key, None))


✅ Expanded dataset saved at: /content/drive/MyDrive/Women's Apparel Business/expanded_dataset.csv
