### Goals
- Convert CSS to SASS:
    - Common values go into variables (e.g. colours)
    - Common groups of styles (>= 2 styles) go into mixins
- NLP to identify similar classes and ids
- NLP to group selectors together and put them into separate files
- Minify

In [27]:
import os
import shutil
import re
import colorsys
import json
import numpy as np
import rcssmin
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
# original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css
# Load one minified stylesheet as the sample input for the cells below.
# The path is assembled with os.path.join so it also resolves on systems that
# do not use "\\" as the path separator. Undecodable bytes are replaced rather
# than raising.
with open(os.path.join("minified-dataset", "Facebook", "index_files", "1GsqYFnXaZQ.css"), "rb") as file:
    css_text_test = file.read().decode(errors="replace")

In [29]:
def _parseDeclarations(block):
    # Split a declaration block ("a:b;c:d") into [property, value] pairs.
    # Only the first ":" separates property from value, so values that contain
    # ":" themselves (e.g. URLs) stay intact; empty declarations are dropped.
    return [
        declaration.split(":", 1)
        for declaration in block.split(";")
        if declaration.strip()
    ]


def getSelectorStyleMap(styles):
    """Parse (minified) CSS text into a ``selector -> declarations`` mapping.

    Args:
        styles: CSS source as a single string.

    Returns:
        A dict keyed by the selector text exactly as written. Each parsed rule
        maps to ``{"style": [[property, value], ...], "atRule": str,
        "mediaQuery": str}`` where ``mediaQuery`` is the prelude of the
        enclosing ``@media`` block and ``atRule`` the prelude of any other
        enclosing block that contains nested blocks (e.g. ``@keyframes``);
        both are ``""`` when the rule is not nested.

        The text left over after the final ``}`` (normally ``""``) is stored
        as a key mapped to ``None``. Existing callers delete the ``""`` entry,
        so the placeholder is kept for backward compatibility.

    Limitations: braces or semicolons inside strings, comments or ``url()``
    values are not recognised, and a selector that occurs more than once keeps
    only its last declaration block.
    """
    # Keep the braces as tokens: even indexes hold text, odd indexes "{" / "}".
    tokens = re.split(r"([{}])", styles)

    selectorToStyle = {}
    mediaQuery = ""
    atRule = ""
    selector = None
    # One entry per currently open block: (kind, value to restore on close).
    openBlocks = []

    for i in range(1, len(tokens), 2):
        brace = tokens[i]
        precedingText = tokens[i - 1]

        if brace == "{":
            nextBrace = tokens[i + 2] if i + 2 < len(tokens) else None

            if nextBrace != "{":
                # The block closes before another one opens: a declaration block.
                selector = precedingText
                openBlocks.append(("rule", None))
            elif precedingText.lstrip().startswith("@media"):
                openBlocks.append(("media", mediaQuery))
                mediaQuery = precedingText
            else:
                openBlocks.append(("atRule", atRule))
                atRule = precedingText
            continue

        if not openBlocks:
            continue  # stray "}" without a matching "{"

        kind, previous = openBlocks.pop()
        if kind == "rule":
            selectorToStyle[selector] = {
                "style": _parseDeclarations(precedingText),
                "atRule": atRule,
                "mediaQuery": mediaQuery
            }
        elif kind == "media":
            mediaQuery = previous
        else:
            atRule = previous

    trailingText = tokens[-1]
    if openBlocks and openBlocks[-1][0] == "rule":
        # Input ended inside a declaration block: keep what was read so far.
        selectorToStyle[selector] = {
            "style": _parseDeclarations(trailingText),
            "atRule": atRule,
            "mediaQuery": mediaQuery
        }
    else:
        # Placeholder entry for the text after the last "}" (see docstring).
        selectorToStyle.setdefault(trailingText, None)

    return selectorToStyle

In [30]:
# Parse the sample stylesheet text into a selector -> style mapping.
testStyleMap = getSelectorStyleMap(css_text_test)
# Bare expression: the notebook renders the resulting mapping as the cell output.
testStyleMap

{'form': {'style': [['margin', '0'], ['padding', '0']],
  'atRule': '',
  'mediaQuery': ''},
 'label': {'style': [['cursor', 'pointer'],
   ['color', '#666'],
   ['font-weight', 'bold'],
   ['vertical-align', 'middle']],
  'atRule': '',
  'mediaQuery': ''},
 'label input': {'style': [['font-weight', 'normal']],
  'atRule': '',
  'mediaQuery': ''},
 'textarea,.inputtext,.inputpassword': {'style': [['border',
    '1px solid #bdc7d8'],
   ['margin', '0'],
   ['padding', '3px'],
   ['-webkit-appearance', 'none'],
   ['-webkit-border-radius', '0']],
  'atRule': '',
  'mediaQuery': ''},
 'textarea': {'style': [['max-width', '100%']],
  'atRule': '',
  'mediaQuery': ''},
 'select': {'style': [['border', '1px solid #bdc7d8'], ['padding', '2px']],
  'atRule': '',
  'mediaQuery': ''},
 '.inputtext,.inputpassword': {'style': [['padding-bottom', '4px']],
  'atRule': '',
  'mediaQuery': ''},
 '.inputtext:invalid,.inputpassword:invalid': {'style': [['-webkit-box-shadow',
    'none']],
  'atRule': ''

- convert all values to same format (e.g. color will be all hex) (done)
- don't convert shorthands, but additionally store their longhand form (e.g. padding: 10px 20px 20px 10px)

Combine styles:
- Combine them using the comma separator if they share >=2 styles

Variables:
- for all attribute values, if a value is repeated >=10 times, then replace it with a variable

Mixins:
- If two or more selectors share >=3 properties but with different values, then create a mixin that has parameters for the differing values

Issue: How do you automate naming variables and mixins?

In [31]:
def _rgbChannel(argument):
    # Convert one rgb() argument to a 0-255 scale number; "50%" means 50% of 255.
    argument = argument.strip()
    if argument.endswith("%"):
        return float(argument[:-1]) * 255 / 100
    return float(argument)


def colorToHexCode(value):
    """Convert an ``rgb(...)`` or ``hsl(...)`` colour value to ``#rrggbb``.

    Args:
        value: CSS value text such as ``"rgb(3, 252, 198)"`` or
            ``"hsl(167, 98%, 50%)"``.

    Returns:
        The six-digit lowercase hex colour, or ``value`` unchanged when it
        cannot be converted: ``rgba``/``hsla`` colours, anything that does not
        have exactly three comma-separated arguments (colour keywords,
        gradients), non-numeric arguments, or a non-string ``value``.
    """
    try:
        lowered = value.lower()

        # Colours with an alpha channel have no six-digit hex equivalent; this
        # check also keeps gradients built from them out of the parsing below.
        if "hsla" in lowered or "rgba" in lowered:
            return value

        # Separate the hsl or rgb arguments.
        arguments = value[value.find("(") + 1:value.rfind(")")].split(",")
        if len(arguments) != 3:
            return value

        if "hsl" in lowered:
            hue, saturation, lightness = (
                float(argument.replace("%", "")) for argument in arguments
            )
            # colorsys expects (hue, lightness, saturation), each scaled to 0..1.
            rgb = colorsys.hls_to_rgb(hue / 360, lightness / 100, saturation / 100)
            channels = [channel * 255.0 for channel in rgb]
        else:
            channels = [_rgbChannel(argument) for argument in arguments]

        # Clamp to one byte per channel so out-of-range input cannot produce
        # more (or other) hex digits than the two that belong to the channel.
        clamped = [min(255, max(0, int(round(channel)))) for channel in channels]
        return "#" + "".join(f"{channel:02x}" for channel in clamped)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Best effort: anything unparsable is passed through untouched.
        return value

In [32]:
# Manual check: run one rgb() and one hsl() colour through colorToHexCode and
# print the converted values.
print(colorToHexCode("rgb(3, 252, 198)"))
print(colorToHexCode("hsl(167, 98%, 50%)"))

#03fcc6
#03fcc6


In [33]:
mapShortHands = {} # store shorthands here (not implemented yet)

def normalizeStyle(attr, value):
    """Return ``(attr, value)`` with colour values passed through colorToHexCode.

    Only attributes whose name contains "color" are considered, and only when
    the value does not already contain "#"; every other pair is returned as is.
    """
    needsConversion = "color" in attr and "#" not in value
    if needsConversion:
        return attr, colorToHexCode(value)
    return attr, value

# Normalise every declaration of the sample style map in place.
for selector, entry in testStyleMap.items():
    # The "" key and any selector without a parsed declaration block map to
    # None, so there is nothing to normalise for them.
    if not selector or entry is None:
        continue

    for declaration in entry["style"]:
        # Leave malformed declarations (not exactly [attr, value]) untouched.
        if len(declaration) != 2:
            continue
        # Write the result back into the stored pair; rebinding loop variables
        # would silently discard the normalised value.
        declaration[:] = normalizeStyle(*declaration)

# Drop the placeholder entry; pop() keeps the cell re-runnable once it is gone.
testStyleMap.pop("", None)

# testStyleMap

In [34]:
# duplicatedStyles = {} # type 1 and 2 duplications

# for selector in testStyleMap:
#     if selector:
#         for attr, value in testStyleMap[selector]["style"]:
#             style = f"{attr}:{value}"
#             if style in duplicatedStyles:
                
# 
# # remove styles with only one selector

In [36]:
# Declaration lists of every rule that is not inside an at-rule or media query.
[
    entry["style"]
    for entry in testStyleMap.values()
    if entry["atRule"] == "" and entry["mediaQuery"] == ""
]

[[['margin', '0'], ['padding', '0']],
 [['cursor', 'pointer'],
  ['color', '#666'],
  ['font-weight', 'bold'],
  ['vertical-align', 'middle']],
 [['font-weight', 'normal']],
 [['border', '1px solid #bdc7d8'],
  ['margin', '0'],
  ['padding', '3px'],
  ['-webkit-appearance', 'none'],
  ['-webkit-border-radius', '0']],
 [['max-width', '100%']],
 [['border', '1px solid #bdc7d8'], ['padding', '2px']],
 [['padding-bottom', '4px']],
 [['-webkit-box-shadow', 'none']],
 [['padding', '0'], ['margin', '0 5px 0 0'], ['vertical-align', 'middle']],
 [['border', '0'], ['vertical-align', 'middle']],
 [['border-style', 'solid'],
  ['border-width', '1px'],
  ['border-color', '#d9dfea #0e1f5b #0e1f5b #d9dfea'],
  ['background-color', '#3b5998'],
  ['color', '#fff'],
  ['padding', '2px 15px 3px 15px'],
  ['text-align', 'center']],
 [['background-color', '#999'],
  ['border-bottom', '1px solid #000'],
  ['border-right', '1px solid #666'],
  ['color', '#fff']],
 [['background', '#f0f0f0'],
  ['border-color

In [90]:
def getFileByExtension(extension_name):
    """Mirror ``original-dataset`` into ``minified-dataset`` and minify matches.

    Any existing ``minified-dataset`` directory is removed first. The directory
    tree of ``original-dataset`` is then recreated, and every file whose name
    ends with ``extension_name`` is read, minified with ``rcssmin.cssmin`` and
    written to the same relative location. Note that the CSS minifier is
    applied whatever ``extension_name`` is.

    Args:
        extension_name: File name suffix to select, e.g. ``".css"``. An empty
            string selects nothing.

    Returns:
        A list of ``(original_path, minified_path)`` tuples, one per file
        written.
    """
    sourceRoot = "original-dataset"
    targetRoot = "minified-dataset"

    if os.path.isdir(targetRoot):
        shutil.rmtree(targetRoot, ignore_errors=True)

    # exist_ok: rmtree ignores errors, so the directory may still be present.
    os.makedirs(targetRoot, exist_ok=True)

    valid_files = []

    for filePath, dirs, files in os.walk(sourceRoot):
        # relpath strips only the leading root (a nested directory with the
        # same name is left alone); normpath removes the "." produced for the
        # root directory itself.
        targetDir = os.path.normpath(
            os.path.join(targetRoot, os.path.relpath(filePath, sourceRoot))
        )

        for directory in dirs:
            os.makedirs(os.path.join(targetDir, directory), exist_ok=True)

        for file in files:
            if not (extension_name and file.endswith(extension_name)):
                continue

            sourcePath = os.path.join(filePath, file)
            targetPath = os.path.join(targetDir, file)
            valid_files.append((sourcePath, targetPath))

            with open(sourcePath, "rb") as f:
                file_text = f.read().decode(errors="replace")

            # Explicit UTF-8: the decoded text may contain characters (such as
            # the U+FFFD replacement character) that the platform default
            # encoding cannot represent.
            with open(targetPath, "w", encoding="utf-8") as f:
                f.write(rcssmin.cssmin(file_text))

    return valid_files

In [53]:
# Rebuild the minified dataset for ".css" files and keep the returned
# (original path, minified path) pairs.
css_files = getFileByExtension(".css")
# print(css_files)
print(len(css_files))
# Bare expression: the notebook displays the first pair as the cell output.
css_files[0]

552


('original-dataset\\9292.nl\\index_files\\print.min.css',
 'minified-dataset\\9292.nl\\index_files\\print.min.css')

In [54]:
# One parsed selector -> style map per minified stylesheet, in css_files order.
style_sheets = []
total_css_files = len(css_files)

for i, (css_file_path, css_minified_path) in enumerate(css_files):
    # Read raw bytes and decode leniently so a bad byte cannot abort the run.
    with open(css_minified_path, "rb") as file:
        minified_css = file.read().decode(errors="replace")

    style_sheets.append(getSelectorStyleMap(minified_css))

    # "\r" rewinds the cursor so the progress counter overwrites itself.
    print(f"File: {i+1} / {total_css_files}", end="\r")

# style_sheets

File: 552 / 552