In [None]:
import os
import shutil
import re
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import rcssmin

In [None]:
def getFileByExtension(extension_name):
    if os.path.isdir("minified-dataset"):
        shutil.rmtree("minified-dataset", ignore_errors=True)

    os.mkdir("minified-dataset")
    
    valid_files = []

    for filePath, dirs, files in os.walk("original-dataset"):
        minifiedStartingPath = filePath.replace("original-dataset", "")

        for directory in dirs:
            os.mkdir(f"minified-dataset\\{minifiedStartingPath}\\{directory}")

        for file in files:
            if file[-len(extension_name):] == extension_name:
                valid_files.append((os.path.join(filePath, file), 
                    os.path.join(f"minified-dataset{minifiedStartingPath}", file)))

                with open(os.path.join(filePath, file), "rb") as f:
                    file_text = f.read().decode(errors="replace")

                with open(f"minified-dataset\\{minifiedStartingPath}\\{file}", "w") as f:
                    f.write(rcssmin.cssmin(file_text))
                

        # print(filePath)
        # print(dirs)
        # print(files)

    return valid_files


In [None]:
os.getcwd()

In [None]:
css_files = getFileByExtension(".css")
# print(css_files)
print(len(css_files))
css_files[0]

In [None]:
with open("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", "rb") as file:
    test_file = file.read().decode(errors="replace")

### EDA Questions:

- File sizes
- Number of lines
- Number of selectors (tags, classes, ids)
- Number of media queries
- Number of keyframes
- Number of styles
- Number of styles per selector
- Number of valid styles


In [None]:
def dictToJSONFile(filePath, dictionary):
    with open(filePath, 'w') as fp:
        json.dump(dictionary, fp, indent=2)

Causes of errors:
- Invalid CSS (e.g. original-dataset\Alibaba\css\fe397ac3e7d34d455bd1114d623b1e62aa8ee51c.css)
    - filter: (filter attribute has no value and semicolon)
- Using base64 image backgrounds (e.g. original-dataset\Amazon.ca\css\d96313390c86b5833d30fbe6a5ed905fd3a0ea05.css)
    - the semi colon in the base64 image string causes the url to be seperated: background-image:url(data:image/gif;base64,R0lGODlhBwAKAMIEAObm5uvr6/Dw8P39/f///////////////yH5BAEKAAcALAAAAAAHAAoAAAMWSDPUGoE5AaIj1M4qMW+ZFDYD1ClnAgA7);

In [None]:
re.escape("/*")

In [None]:
with open("./html_tags.json", "r") as f:
  html_tags = json.load(f)

def getFileSize(filePath):
    return os.path.getsize(filePath)

def getNumLines(css_text):
    return css_text.count("\n")+1

def removeComments(css_text):
    escaped_backlash = re.escape("\n")
    css_text = re.sub(f'{re.escape("/*")}.*?{re.escape("*/")}', '', css_text)
    css_text = re.sub(f'{re.escape("//")}.*?{escaped_backlash}', '', css_text)

    return css_text

def getSelectorsBySymbol(symbol, css_text):
    css_text = removeComments(css_text)

    selectors = set([])

    isSelector = False
    selector_name = ""

    for char in css_text:
        if isSelector:
            isSelector = not char in set([" ", ".", "#", ",", "{", "\n", ":", "[", "/", "\\", 
                '"', ">", "+", "~"])

            if isSelector:
                selector_name = f"{selector_name}{char}"

            if char.isdigit():
                isSelector = False
                selector_name = ""
            
            # print(char, selector_name)
        else:
            selectors.add(selector_name.strip())
            selector_name = ""
            isSelector = char == symbol

    if "" in selectors:
        selectors.remove("")

    return list(selectors)

def getSelectors(css_text):
    num_tags = sum(css_text.count(tag) for tag in html_tags)

    css_classes = getSelectorsBySymbol(".", css_text)
    css_ids = getSelectorsBySymbol("#", css_text)

    return css_classes, css_ids, {
        "num_tags": num_tags,
        "num_classes": len(css_classes),
        "num_ids": len(css_ids),
        "num_rules": css_text.count("@"),
        "num_media_queries": css_text.count("@media"),
        "num_keyframes": css_text.count("@keyframes")
    }

In [None]:
def removeEmptyString(theList):
        return [x for x in theList if x != ""]

def splitBySemiColon(styles):
    isInsideParenthesis = False
    splitStyles = []
    currentStyle = ""
    hasColon = False

    for i, char in enumerate(styles):
        if not isInsideParenthesis and (char == ";" or i == len(styles)-1):
            if  i == len(styles)-1:
                currentStyle = f"{currentStyle}{char}"
            
            # print(i, currentStyle, len(splitStyles), char == ";", i == len(styles)-1)

            if hasColon:
                splitStyles.append(currentStyle)
            else:
                try:
                    splitStyles[len(splitStyles)-1] += currentStyle
                except:
                    return []

            currentStyle = ""
            hasColon = False
            
            continue

        if char == ":":
            hasColon = True

        if char == "(":
            isInsideParenthesis = True
        elif char == ")":
            isInsideParenthesis = False

        currentStyle = f"{currentStyle}{char}"

    return splitStyles

def splitByColon(styles):
    return list(map(lambda style : removeEmptyString(style.split(":", 1)), styles))

def getStyles(css_text):
    css_text = removeComments(css_text)
    styles = re.findall(r'\{\s*(.*?)\s*\}', css_text, flags=re.DOTALL) # list of substrings between { and }
    # print("s", styles)

    stylesSplitSemicolon = list(map(splitBySemiColon, styles))
    # print(stylesSplitSemicolon)
    stylesSplitColon = list(map(splitByColon, stylesSplitSemicolon))

    return stylesSplitColon

In [None]:
test = "html{color:rgb(0,0,0);background:rgb(255,255,255) none repeat scroll 0% 0%}"
getStyles(test)

In [None]:
class StyleSyntaxException(Exception):
    def __init__(self, prev, current):
        self.prev = prev
        self.current = current

def getNumStyles(stylesheet):
    style_attributes = {}
    style_values = {}

    def addStyleToDict(dictionary, style):
        if style in dictionary:
            # some files had an error where there would be a random semi-colon in the attribute value
            # e.g. color: red; !important; this adds it to the previous line
            dictionary[style] += 1
        else:
            dictionary[style] = 1

    for styles in stylesheet:
        try:
            y = None

            for x in styles:
                attr, value = x
                addStyleToDict(style_attributes, attr)
                addStyleToDict(style_values, value)
                y = x
        except:
            # print("prev", y)
            # print("err", x)
            raise StyleSyntaxException(y, x)

    return style_attributes, style_values

In [None]:
# with open('test.txt', 'w') as f:
#     f.write(rcssmin.cssmin(test_file))

In [None]:
css_texts = []
css_metadata = []
css_selectors = []
css_styles = []
file_errors = []
css_files_length = len(css_files)

for i, (css_file_path, css_minified_path) in enumerate(css_files):
    try:
        # print(css_minified_path)

        with open(css_file_path, "rb") as file:
            css_text = file.read().decode(errors="replace")

        with open(css_minified_path, "rb") as file:
            minified_css = file.read().decode(errors="replace")
        
        css_texts.append((css_text, minified_css))

        css_classes, css_ids, numSelectors = getSelectors(minified_css)

        styles = getStyles(minified_css.replace("\n", " "))

        css_selectors.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "classes": css_classes,
            "ids": css_ids
        })

        css_styles.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "styles": styles
        })

        metadata = {
            "id": i,
            "cssFile": css_file_path,
            "fileSize": getFileSize(css_file_path),
            "numLines": getNumLines(css_text),
            "numStyles": getNumStyles(styles)
        }

        metadata.update(numSelectors)

        css_metadata.append(metadata)
    except StyleSyntaxException as err:
        print(css_file_path, err.prev, err.current)
        file_errors.append(css_file_path)
        # print("err:", err)

    print(f"File: {i+1} / {css_files_length} | Number of Errors: {len(file_errors)}", end="\r")

print("\n", file_errors[:3])


In [None]:
# print(css_metadata)
# print(css_texts[2])

In [None]:
css_files[4][0]

In [None]:
x = "height: 50px"
x.split(";")

In [None]:
# getStyles(css_texts[2])
getStyles(css_texts[4][1])

In [None]:
totalProperties = {}
totalValues = {}

numPropRepititionsPerFile = np.array([])
numValueRepititionsPerFile = np.array([])

def combineStyles(combinedDict, oldDict):
  for key in oldDict:
    if key in combinedDict:
      combinedDict[key] += oldDict[key]
    else:
      combinedDict[key] = oldDict[key]

for metadata in css_metadata:
  properties, values = metadata["numStyles"]

  numPropRepititionsPerFile = np.append(numPropRepititionsPerFile, sum(list(properties.values())) - len(properties))
  numValueRepititionsPerFile = np.append(numValueRepititionsPerFile, sum(list(values.values())) - len(values))

  combineStyles(totalProperties, properties)
  combineStyles(totalValues, values)

totalProperties = {
  "property": list(totalProperties.keys()),
  "frequency": list(totalProperties.values())
}

totalValues = {
  "values": list(totalValues.keys()),
  "frequency": list(totalValues.values())
}


In [91]:
print(numPropRepititionsPerFile.mean())
print(numValueRepititionsPerFile.mean())

1211.768115942029
949.8876811594203


In [92]:
print(np.median(numPropRepititionsPerFile))
print(np.median(numValueRepititionsPerFile))

400.5
278.5


In [None]:
totalPropertiesDf = pd.DataFrame.from_dict(totalProperties)
totalValuesDf = pd.DataFrame.from_dict(totalValues)

In [None]:
ProfileReport(totalPropertiesDf, title="Style Properties Report", explorative=True).to_file("./profile-reports/style-properties-report.html")
ProfileReport(totalValuesDf, title="Style values Report", explorative=True).to_file("./profile-reports/style-values-report.html")

In [None]:
dictToJSONFile("./css-metadata.json", css_metadata)
dictToJSONFile("./css-selectors.json", css_selectors)
dictToJSONFile("./css-styles.json", css_styles)

In [None]:
css_metadata_df = pd.DataFrame.from_dict(css_metadata)
# css_metadata_df

In [None]:
ProfileReport(css_metadata_df, title="CSS Metadata Report", explorative=True).to_file("./profile-reports/css-metadata-report.html")

In [None]:
css_selectors_df = pd.DataFrame.from_dict(css_selectors)
# css_selectors_df

In [None]:
ProfileReport(css_selectors_df, title="CSS Selectors Report", explorative=True).to_file("./profile-reports/css-selectors-report.html")