In [2]:
import os
import time
import colorsys
import warnings
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [3]:
def saveAsJson(filePath, data):
    with open(filePath, 'w') as fp:
        json.dump(data, fp, indent=2)

def getDefaultStyleProps(style):
    return {
        "style": style,
        "atRule": "",
        "mediaQuery": ""
    }

def getSelectorStyleMap(styles):
    styles = np.array(styles.split("{"))
    styles = list(map(lambda selectorStyle: selectorStyle.split("}"), styles))
    styles = [j for sub in styles for j in sub] # empty string means end of an at rule
    
    selectorToStyle = {}
    mediaQuery = ""
    atRule = ""
    isSelector = True

    for i, cssComponent in enumerate(styles):
        if "@media" in cssComponent:
            mediaQuery = cssComponent
            continue
        elif "@" in atRule:
            atRule = cssComponent
            continue

        if isSelector:
            selectorToStyle[cssComponent] = None
            isSelector = False
        else:
            selectorToStyle[styles[i-1]] = {
                "style": list(map(lambda x: x.split(":"), cssComponent.split(";"))),
                "atRule": atRule,
                "mediaQuery": mediaQuery
            }
            isSelector = True

    del selectorToStyle[""]

    return selectorToStyle

def colorToHexCode(value):
    try:
        # error is caused with gradients
        if "hsla" in value or "rgba" in value:
            return value

        # seperate hsl or rgb args
        value = value.replace("%", "")
        colors = [int(x) for x in value[value.find("(")+len("("):value.rfind(")")].split(",")]

        # if hsl convert to rgb
        if "hsl" in value:
            h, s, l = colors
            h = h/360
            s = s/100
            l = l/100
            colors = colorsys.hls_to_rgb(h, l, s)
            colors = [int(round(x*255.0)) for x in colors]

        # convert rgb to hexcode
        colors = list(map(lambda color: str(hex(color).split('x')[-1][-2:]).zfill(2), colors))
        colors = "".join(colors)

        return f"#{colors}"
    except:
        return value

def seperateSizeValues(values):
    values = values.replace(" ", "")
    values = values.replace("auto", "1auto")

    seperatedValues = []
    value = ""
    isUnit = False

    for char in values:
        if isUnit and char.isdigit():
            isUnit = False
            seperatedValues.append(value)
            value = ""

        value = f"{value}{char}"

        if not char.isdigit():
            isUnit = True

    seperatedValues.append(value)

    return [x if x != "1auto" else "auto" for x in seperatedValues]

def sizeShortHandToLongHand(attr, seperatedValues):
    try:
        fourPosArgs = [["top"], ["right"], ["bottom"], ["left"]]
        threePosArgs = [["top"], ["left", "right"], ["bottom"]]
        twoPosArgs = [["top", "bottom"], ["left", "right"]]

        posArgs = [twoPosArgs, threePosArgs, fourPosArgs][len(seperatedValues)-2]

        longHand = []

        for i, pos in enumerate(posArgs):
            for posName in pos:
                longHand.append([f"{attr}-{posName}", seperatedValues[i]])

        return longHand
    except:
        return None

def normalizeStyle(attr, value):
    isMultiSelectorChange = False

    if "color" in attr:
        if not "#" in value:
           value = colorToHexCode(value)
    elif attr == "margin" or attr == "padding":
        isMultiSelectorChange = True
        seperateValues = seperateSizeValues(value)

        if seperateValues:
            isMultiSelectorChange = False
            attr, value = None, sizeShortHandToLongHand(attr, seperateValues)

    return attr, value, isMultiSelectorChange

def getStyleText(styleMap):
    selectorStyles = [styleMap[key]["style"] for key in styleMap if not styleMap[key]["atRule"] and not styleMap[key]["mediaQuery"]]
    styles = []

    for selectorStyle in selectorStyles:
        style = ""
        for attr, value in selectorStyle:
            style += f"{attr}: {value}; "
        style = style.strip()

        styles.append(style)

    return styles

def insertElementsByIndices(list_a, list_b, pos):
    assert(len(list_b) == len(pos))
    acc = 0
    for i in range(len(list_b)):
        list_a.insert(pos[i]+acc, list_b[i])
        acc += 1

    return list_a

def getFinalStyleText(selector, styles):
    if type(styles) is list:
        return f"{selector}{'{'}{''.join([f'{x[0]}:{x[1]};' for x in styles])}{'}'}"
    else:
        return f"{selector}{'{'}{styles}{'}'}"

In [4]:
def generateRefactoredCss(filePath):
    backslash = "\\"
    outputFilePath = f"refactored-files{backslash}{filePath.replace(backslash, '-')}"

    with open(filePath, "rb") as file:
        css_text_test = file.read().decode(errors="replace")

    styleMap = getSelectorStyleMap(css_text_test)

    for selector in styleMap:
        if selector:
            for i, x in enumerate(styleMap[selector]["style"]):
                try:
                    attr, value = x
                    a, v, isMultiSelectorChange = normalizeStyle(attr, value)

                    if isMultiSelectorChange:
                        del attr, value
                        styleMap[selector]["style"].extend(v)
                    else:
                        attr, value = a, v
                except:
                    pass


    if "" in styleMap:
        del styleMap[""]

    # does not include styles in media queries
    testStyles = getStyleText(styleMap)

    runKmeans = True

    selector_styles_df = None

    if runKmeans:
        vectorizer = TfidfVectorizer(stop_words={'english'})
        X = vectorizer.fit_transform(testStyles)
        
        Sum_of_squared_distances = []
        K = range(2, len(testStyles))
        sil = []

        timeout = time.time() + 60*5

        with warnings.catch_warnings():
            warnings.filterwarnings('error')
            try:
                for k in K:
                    if time.time() > timeout:
                        break
                    km = KMeans(n_clusters=k, max_iter=10, n_init=10) # 150
                    km = km.fit(X)
                    Sum_of_squared_distances.append(km.inertia_)
                    sil.append(silhouette_score(X, km.labels_, metric="euclidean"))
                    print(f"Iteration: {k}/{len(testStyles)}", end="\r")
            except Warning:
                print("Training stopped early due to convergence warning")
                pass

        # plt.plot(K, Sum_of_squared_distances, 'bx-')
        # plt.xlabel('k')
        # plt.ylabel('Sum_of_squared_distances')
        # plt.title('Elbow Method For Optimal k')
        # plt.show()

        # rerunning this without the above cells will change the output values
        selector = list(styleMap.keys())

        true_k = sil.index(max(sil))+2
        model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
        model.fit(X)
        labels = model.labels_

        selector_styles_df = pd.DataFrame(list(zip(selector,labels)),columns=['selector','cluster'])
        selector_styles_df = selector_styles_df.sort_values(by=['cluster'])
    else:
        selector_styles_df = pd.read_json("selectors-clustered.json")
        selector_styles_df.columns = ['selector','cluster']

    if runKmeans:
        saveAsJson(f"{outputFilePath[:-4]}.json", selector_styles_df.values.tolist())

    clusteredSelectors = selector_styles_df.values.tolist()
    clusterSelectorMap = {}

    for selector, clusterIndex in clusteredSelectors:
        if str(clusterIndex) in clusterSelectorMap:
            clusterSelectorMap[str(clusterIndex)].append(selector)
        else:
            clusterSelectorMap[str(clusterIndex)] = [selector]

    unsortedCombinedStyleMap = {}

    for clusterIndex in clusterSelectorMap:
        sharedStyles = {}
        sharedValues = {}

        for selector in clusterSelectorMap[clusterIndex]:
            for attr, value in styleMap[selector]["style"]:
                style = f"{attr.strip()}: {value.strip()};"

                if style in sharedStyles:
                    sharedStyles[style].append(selector)
                else:
                    sharedStyles[style] = [selector]
        
        for style in sharedStyles:
            combinedSelector = ", ".join(sharedStyles[style])
            unsortedCombinedStyleMap[combinedSelector] = style

    combinedSelectorOrder = []

    for combinedSelector in unsortedCombinedStyleMap:
        selectors = combinedSelector.split(", ")
        mapKeys = list(styleMap.keys())
        order = [mapKeys.index(selector) for selector in selectors]
        
        combinedSelectorOrder.append((min(order), combinedSelector))

    combinedSelectorOrder.sort(key=lambda x: x[0])
    combinedSelectorOrder

    selectorOrder = [x[0] for x in combinedSelectorOrder]
    selectorStylesInOrder = [[x[1], getDefaultStyleProps(unsortedCombinedStyleMap[x[1]])] for x in combinedSelectorOrder]
    selectorStylesInOrder

    for selector, styleProps in selectorStylesInOrder:
        selectors = selector.split(", ")
        styles = [x for x in styleProps["style"].split(";") if x != ""]
        
        for i, style in enumerate(styles):
            styles[i] = style.replace(" ", "").split(":")

        for selector in selectors:
            if selector in styleMap:
                for i, originalStyle in enumerate(styleMap[selector]["style"]):
                    a1, v1 = originalStyle
                    for a2, v2 in styles:
                        if a1 == a2 and v1 == v2:
                            del styleMap[selector]["style"][i]

    styles = [[selector, styleMap[selector]] for selector in styleMap]
    styles = insertElementsByIndices(styles, selectorStylesInOrder, selectorOrder)

    refactoredCssText = ""

    for selector, styleProps in styles:
        if not styleProps["atRule"] == "" or len(styleProps["style"]) == 0:
            continue

        if styleProps["mediaQuery"] == "":
            refactoredCssText += getFinalStyleText(selector, styleProps["style"])
        else:
            refactoredCssText += f"{styleProps['mediaQuery']}{'{'}{getFinalStyleText(selector, styleProps['style'])}{'}'}"

    with open(outputFilePath, "w") as f:
        f.write(refactoredCssText)

In [12]:
generateRefactoredCss("minified-dataset\\Facebook\\index_files\\1GsqYFnXaZQ.css")



In [5]:
errorCount = 0

for i, (filePath, dirs, files) in enumerate(os.walk("minified-dataset")):
    for file in files:
        try:
            generateRefactoredCss(os.path.join(filePath, file))
            print(f"Number of files refactored: {i}", end="\r")
        except:
            errorCount += 1

Number of files refactored: 191

In [6]:
errorCount

458