## Makeup Foundation Analysis
Kristen Shen

In [8]:
import pandas as pd

In [11]:
from collections import Counter
import re

In [9]:
# Read the dataset from CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer="allNumbers.csv")
df.head(n=5)

Unnamed: 0,brand,product,name,specific,lightness,hex,lightToDark,numbers,id
0,Makeup Revolution,Conceal & Define Full Coverage Foundation,,F0,0.94902,#F2F2F2,True,0.0,1.0
1,HOURGLASS,Veil Fluid Makeup,Porcelain,No. 0,0.817647,#F6D3AB,True,0.0,2.0
2,TOM FORD,Traceless Soft Matte Foundation,Pearl,0.0,0.85098,#F0D8C2,True,0.0,3.0
3,Armani Beauty,Neo Nude Foundation,,0,0.911765,#F0E8E1,True,0.0,4.0
4,TOM FORD,Traceless Foundation Stick,Pearl,0.0,0.911765,#FDE5D4,True,0.0,5.0


### Data cleaning

In [12]:
# Pull out the product-name column; the text analysis below works on it
text = df.loc[:, "product"]
text

0               Conceal & Define Full Coverage Foundation
1                                       Veil Fluid Makeup
2                         Traceless Soft Matte Foundation
3                                     Neo Nude Foundation
4                              Traceless Foundation Stick
                              ...                        
3112                                I Am Magic Foundation
3113    Pretty in Bloom™ SPF 20 Flower-Infused Long-We...
3114                                I Am Magic Foundation
3115    Pretty in Bloom™ SPF 20 Flower-Infused Long-We...
3116            Clear Complexion Make Myself Clear Makeup
Name: product, Length: 3117, dtype: object

In [21]:
# Lower-case the product names so the comparison is case-insensitive.
# Missing names are dropped first: astype(str) would otherwise turn NaN into
# the literal string "nan", which the word-frequency step would then count
# as if it were a real word.
name_str = text.dropna().astype(str).str.lower()
name_str

0               conceal & define full coverage foundation
1                                       veil fluid makeup
2                         traceless soft matte foundation
3                                     neo nude foundation
4                              traceless foundation stick
                              ...                        
3112                                i am magic foundation
3113    pretty in bloom™ spf 20 flower-infused long-we...
3114                                i am magic foundation
3115    pretty in bloom™ spf 20 flower-infused long-we...
3116            clear complexion make myself clear makeup
Name: product, Length: 3117, dtype: object

In [34]:
# Strip every character that is neither a word character nor whitespace
# (punctuation and symbols are deleted outright, not replaced by a space),
# then collapse each run of whitespace to one space and trim both ends.
name_str = (
    name_str
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)


In [35]:
# Tokenise each cleaned name on whitespace.
# split() without an explicit separator never produces empty tokens, so a name
# that the cleaning step reduced to "" becomes [] rather than [""], and no
# empty-string "word" leaks into the frequency counts.
words = name_str.str.split()
words

0           [conceal, define, full, coverage, foundation]
1                                   [veil, fluid, makeup]
2                    [traceless, soft, matte, foundation]
3                                 [neo, nude, foundation]
4                          [traceless, foundation, stick]
                              ...                        
3112                           [i, am, magic, foundation]
3113    [pretty, in, bloom, spf, 20, flowerinfused, lo...
3114                           [i, am, magic, foundation]
3115    [pretty, in, bloom, spf, 20, flowerinfused, lo...
3116     [clear, complexion, make, myself, clear, makeup]
Name: product, Length: 3117, dtype: object

### word frequency

In [73]:
# Gather the tokens of every product name into one flat list
all_words = []
for token_list in words:
    all_words.extend(token_list)

# Tally how often each token occurs
word_counts = Counter(all_words)

# Uncomment to inspect the full ranking
# word_counts.most_common()

In [74]:
# Tabulate the tokens, most frequent first, as a two-column DataFrame
ranked_words = word_counts.most_common()
df_word_counts = pd.DataFrame(data=ranked_words, columns=["Word", "Count"])

# Save to CSV
# df_word_counts.to_csv("word_counts.csv", index=False)


### Brands with the most products sharing the same name

In [65]:
# check which brand has the most products with same names
# Counter(text)

In [78]:
# Tally how many rows carry each product name, then list the names from the
# most to the least frequent
product_tally = Counter(text)
df_product = (
    pd.DataFrame(product_tally.items(), columns=["product", "count"])
    .sort_values(by="count", ascending=False)
)
df_product

Unnamed: 0,product,count
69,10 Hour Wear Perfection Foundation,60
135,Ultra HD Invisible Cover Foundation,50
112,Pro Filt'r Hydrating Longwear Foundation,50
110,Pro Filt'r Soft Matte Longwear Foundation,50
109,Pro Filt'r Soft Matte Powder Foundation,50
...,...,...
96,BB Cream Tinted Moisturizer Broad Spectrum SPF20,1
94,Full Coverage Cream Foundation,1
92,Hello Happy Soft Blur Foundation,1
79,Bienfait Teinté Beauty Balm Sunscreen Broad Sp...,1


In [79]:
# Persist the per-product counts; the row index is left out of the file
df_product.to_csv(path_or_buf="product_count.csv", index=False)

In [109]:
# Attach each product's count to the rows of the original dataset.
# The in-memory df_product frame is used (not the CSV written earlier);
# how="right" keeps only the product keys present in df_product.
df_merged = pd.merge(df, df_product, how="right", on="product")
df_merged

Unnamed: 0,brand,product,name,specific,lightness,hex,lightToDark,numbers,id,count
0,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,Pearl,03,0.864706,#EAD5CF,True,3.0,66.0,60
1,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,Light Porcelain,04,0.841176,#EACDC3,True,4.0,66.0,60
2,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,Light Porcelain,05,0.739216,#DEB39B,True,5.0,66.0,60
3,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,Light Ivory,8,0.731373,#D9B19C,True,8.0,66.0,60
4,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,Light Ivory,10,0.684314,#DCAA81,True,10.0,66.0,60
...,...,...,...,...,...,...,...,...,...,...
2996,Erborian,BB Cream Tinted Moisturizer Broad Spectrum SPF20,Erborian BB Cream Tinted Moisturizer Broad Spe...,SPF20,0.970588,#F9F7F6,,20.0,,1
2997,Black Up,Full Coverage Cream Foundation,HC,12,0.450980,#935F53,,12.0,,1
2998,Benefit Cosmetics,Hello Happy Soft Blur Foundation,,Shade 11,0.609804,#C9866E,,11.0,,1
2999,Lancôme,Bienfait Teinté Beauty Balm Sunscreen Broad Sp...,Sand,5,0.645098,#D29F77,,5.0,,1


In [116]:
# Order the merged rows from the highest to the lowest product count
df_order = df_merged.sort_values("count", ascending=False)

In [121]:
# Count the rows of each product within each brand.
# The merged 'count' column is keyed on the product name alone, so a name that
# several brands share would credit every one of those brands with the combined
# total. Counting rows per (brand, product) pair keeps each brand's figure
# separate and needs no max()/sum() workaround.
df_grouped = (
    df_merged
    .groupby(['brand', 'product'], as_index=False)
    .size()
    .rename(columns={'size': 'count'})
)

# Sort by count in descending order
df_grouped = df_grouped.sort_values(by='count', ascending=False)
df_grouped


# Save to CSV (optional)
# df_grouped.to_csv("grouped_products_corrected_counts.csv", index=False)


Unnamed: 0,brand,product,count
106,SEPHORA COLLECTION,10 Hour Wear Perfection Foundation,60
79,MAKE UP FOR EVER,Ultra HD Invisible Cover Foundation,50
4,Anastasia Beverly Hills,Luminous Foundation,50
45,FENTY BEAUTY by Rihanna,Pro Filt'r Soft Matte Powder Foundation,50
44,FENTY BEAUTY by Rihanna,Pro Filt'r Soft Matte Longwear Foundation,50
...,...,...,...
88,Marc Jacobs Beauty,Shameless Youthful-Look 24H Foundation SPF 25,1
40,Erborian,BB Cream Tinted Moisturizer Broad Spectrum SPF20,1
136,bareMinerals,bareSkin® Pure Brightening Serum Foundation Br...,1
115,Shiseido,Sports BB SPF 50+,1


In [122]:
# Persist the per-brand, per-product counts; the row index is left out
df_grouped.to_csv(path_or_buf="grouped_products_counts.csv", index=False)

### brand with the most products in total

In [41]:
# Number of rows listed under each brand, most frequent brand first
brand_column = df["brand"]
brand_column.value_counts()

brand
bareMinerals               349
SEPHORA COLLECTION         224
MAKE UP FOR EVER           170
FENTY BEAUTY by Rihanna    150
Lancôme                    128
                          ... 
AMOREPACIFIC                 5
Physicians Formula           4
Marc Jacobs Beauty           3
Erborian                     1
Black Up                     1
Name: count, Length: 64, dtype: int64

In [125]:
# Turn the per-brand row counts into a two-column table (brand, count)
brand_totals = df["brand"].value_counts()
df_brandmost = pd.DataFrame(brand_totals.items(), columns=["brand", "count"])
df_brandmost

Unnamed: 0,brand,count
0,bareMinerals,349
1,SEPHORA COLLECTION,224
2,MAKE UP FOR EVER,170
3,FENTY BEAUTY by Rihanna,150
4,Lancôme,128
...,...,...
59,AMOREPACIFIC,5
60,Physicians Formula,4
61,Marc Jacobs Beauty,3
62,Erborian,1


In [126]:
# Persist the per-brand totals; the row index is left out of the file
df_brandmost.to_csv(path_or_buf="popular_brand.csv", index=False)

### Synonyms analysis

In [None]:
# Map each token variant to the canonical term it should be counted under.
# NOTE(review): the cleaning step deletes punctuation, so a hyphenated token
# such as "long-wear" reaches the word list as "longwear"; if this map is
# applied to those cleaned tokens the "long-wear" key will never match — confirm.
synonyms = dict(
    [
        ("spf", "sun_protection"),
        ("hydrating", "moisturizing"),
        ("moisture", "moisturizing"),
        ("matte", "shine_free"),
        ("longwear", "long_last"),
        ("long-wear", "long_last"),
        ("foundation", "base"),
        ("concealer", "cover"),
        ("powder", "base"),
        ("liquid", "fluid"),
    ]
)
