In [120]:
import os
import shutil
import re
import json
import pandas as pd
from pandas_profiling import ProfileReport
import rcssmin

In [121]:
def getFileByExtension(extension_name):
    if os.path.isdir("minified-dataset"):
        shutil.rmtree("minified-dataset", ignore_errors=True)

    os.mkdir("minified-dataset")
    
    valid_files = []

    for filePath, dirs, files in os.walk("original-dataset"):
        minifiedStartingPath = filePath.replace("original-dataset", "")

        for directory in dirs:
            os.mkdir(f"minified-dataset\\{minifiedStartingPath}\\{directory}")

        for file in files:
            if file[-len(extension_name):] == extension_name:
                valid_files.append((os.path.join(filePath, file), 
                    os.path.join(f"minified-dataset{minifiedStartingPath}", file)))

                with open(os.path.join(filePath, file), "rb") as f:
                    file_text = f.read().decode(errors="replace")

                with open(f"minified-dataset\\{minifiedStartingPath}\\{file}", "w") as f:
                    f.write(rcssmin.cssmin(file_text))
                

        # print(filePath)
        # print(dirs)
        # print(files)

    return valid_files


In [122]:
os.getcwd()

'c:\\Users\\mattg_3roa89k\\Documents\\Uni\\CSS1\\AT3\\css-refactor-tool'

In [123]:
css_files = getFileByExtension(".css")
# print(css_files)
print(len(css_files))
css_files[0]

552


('original-dataset\\9292.nl\\index_files\\print.min.css',
 'minified-dataset\\9292.nl\\index_files\\print.min.css')

In [124]:
with open("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", "rb") as file:
    test_file = file.read().decode(errors="replace")

### EDA Questions:

- File sizes
- Number of lines
- Number of selectors (tags, classes, ids)
- Number of media queries
- Number of keyframes
- Number of styles
- Number of styles per selector
- Number of valid styles


In [125]:
def dictToJSONFile(filePath, dictionary):
    with open(filePath, 'w') as fp:
        json.dump(dictionary, fp, indent=2)

Causes of errors:
- Invalid CSS (e.g. original-dataset\Alibaba\css\fe397ac3e7d34d455bd1114d623b1e62aa8ee51c.css)
    - filter: (filter attribute has no value and semicolon)
- Using base64 image backgrounds (e.g. original-dataset\Amazon.ca\css\d96313390c86b5833d30fbe6a5ed905fd3a0ea05.css)
    - the semi colon in the base64 image string causes the url to be seperated: background-image:url(data:image/gif;base64,R0lGODlhBwAKAMIEAObm5uvr6/Dw8P39/f///////////////yH5BAEKAAcALAAAAAAHAAoAAAMWSDPUGoE5AaIj1M4qMW+ZFDYD1ClnAgA7);

In [126]:
re.escape("/*")

'/\\*'

In [127]:
with open("./html_tags.json", "r") as f:
  html_tags = json.load(f)

def getFileSize(filePath):
    return os.path.getsize(filePath)

def getNumLines(css_text):
    return css_text.count("\n")+1

def removeComments(css_text):
    escaped_backlash = re.escape("\n")
    css_text = re.sub(f'{re.escape("/*")}.*?{re.escape("*/")}', '', css_text)
    css_text = re.sub(f'{re.escape("//")}.*?{escaped_backlash}', '', css_text)

    return css_text

def getSelectorsBySymbol(symbol, css_text):
    css_text = removeComments(css_text)

    selectors = set([])

    isSelector = False
    selector_name = ""

    for char in css_text:
        if isSelector:
            isSelector = not char in set([".", "#", ",", "{", "\n", ":", "[", "/", "\\", '"'])

            if isSelector:
                selector_name = f"{selector_name}{char}"

            if char.isdigit():
                isSelector = False
                selector_name = ""
            
            # print(char, selector_name)
        else:
            selectors.add(selector_name.strip())
            selector_name = ""
            isSelector = char == symbol

    if "" in selectors:
        selectors.remove("")

    return list(selectors)

def getSelectors(css_text):
    num_tags = sum(css_text.count(tag) for tag in html_tags)

    css_classes = getSelectorsBySymbol(".", css_text)
    css_ids = getSelectorsBySymbol("#", css_text)

    return css_classes, css_ids, {
        "num_tags": num_tags,
        "num_classes": len(css_classes),
        "num_ids": len(css_ids),
        "num_media_queries": css_text.count("@media"),
        "num_keyframes": css_text.count("@keyframes")
    }

In [128]:
def getStyles(css_text):
    css_text = removeComments(css_text)
    styles = re.findall(r'\{\s*(.*?)\s*\}', css_text, flags=re.DOTALL) # list of substrings between { and }
    # print("s", styles)

    def removeEmptyString(theList):
        return [x for x in theList if x != ""]

    def splitBySemiColon(styles):
        isInsideParenthesis = False
        splitStyles = []
        currentStyle = ""
        hasColon = False

        for char in styles:
            if not isInsideParenthesis and char == ";":
                if hasColon:
                    splitStyles.append(currentStyle)
                else:
                    splitStyles[len(splitStyles)-1] += currentStyle

                currentStyle = ""
                hasColon = False
               
                continue

            if char == ":":
                hasColon = True

            if char == "(":
                isInsideParenthesis = True
            elif char == ")":
                isInsideParenthesis = False

            currentStyle = f"{currentStyle}{char}"

        return splitStyles

    def splitStyles(styles):
        return list(
            map(
                lambda style : list(map(lambda key_value: key_value.strip(), 
                removeEmptyString(style.strip().split(":", 1)))), 
                removeEmptyString(splitBySemiColon(styles))))

    styles = list(map(splitStyles, styles))

    return styles

In [129]:
class StyleSyntaxException(Exception):
    def __init__(self, prev, current):
        self.prev = prev
        self.current = current

def getNumStyles(stylesheet):
    style_attributes = {}
    style_values = {}

    def addStyleToDict(dictionary, style):
        if style in dictionary:
            # some files had an error where there would be a random semi-colon in the attribute value
            # e.g. color: red; !important; this adds it to the previous line
            dictionary[style] += 1
        else:
            dictionary[style] = 1

    for styles in stylesheet:
        try:
            y = None

            for x in styles:
                attr, value = x
                addStyleToDict(style_attributes, attr)
                addStyleToDict(style_values, value)
                y = x
        except:
            # print("prev", y)
            # print("err", x)
            raise StyleSyntaxException(y, x)

    return style_attributes, style_values

In [130]:
# with open('test.txt', 'w') as f:
#     f.write(rcssmin.cssmin(test_file))

In [131]:
def test(a):
    a = f"{a}"
    return a

test(12)

'12'

In [132]:
css_texts = []
css_metadata = []
css_selectors = []
css_styles = []
file_errors = []
css_files_length = len(css_files)

for i, (css_file_path, css_minified_path) in enumerate(css_files):
    try:
        with open(css_file_path, "rb") as file:
            css_text = file.read().decode(errors="replace")

        with open(css_minified_path, "rb") as file:
            minified_css = file.read().decode(errors="replace")
        
        css_texts.append((css_text, minified_css))

        css_classes, css_ids, numSelectors = getSelectors(minified_css)

        styles = getStyles(minified_css.replace("\n", " "))

        css_selectors.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "classes": css_classes,
            "ids": css_ids
        })

        css_styles.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "styles": styles
        })

        metadata = {
            "id": i,
            "cssFile": css_file_path,
            "fileSize": getFileSize(css_file_path),
            "numLines": getNumLines(css_text),
            "numStyles": getNumStyles(styles)
        }

        metadata.update(numSelectors)

        css_metadata.append(metadata)
    except StyleSyntaxException as err:
        print(css_file_path, err.prev, err.current)
        file_errors.append(css_file_path)
        # print("err:", err)

    print(f"File: {i+1} / {css_files_length} | Number of Errors: {len(file_errors)}", end="\r")

print("\n", file_errors[:3])


File: 552 / 552 | Number of Errors: 0
 []


In [133]:
# print(css_metadata)
# print(css_texts[2])

In [137]:
# getStyles(css_texts[2])
getStyles(css_texts[4][1])

[[['color', 'rgb(0,0,0)']],
 [['margin', '0px']],
 [['border-collapse', 'collapse']],
 [],
 [['font-style', 'normal']],
 [],
 [],
 [['font-size', '100%']],
 [],
 [['border', '0px none']],
 [],
 [],
 [['font-family', 'inherit'], ['font-size', 'inherit']],
 [],
 [],
 [['font-family', '"FedraSansBookRegular"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [['font-family', '"FedraSansMedium"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [],
 [],
 [['font', '81.25%/1.125 Arial,Helvetica,sans-serif'],
  ['color', 'rgb(34,34,34)']],
 [],
 [['font-family', 'FedraSansBookRegular'],
  ['font-size', '2.46154em'],
  ['font-weight', 'normal'],
  ['line-height', '1em'],
  ['color', 'rgb(34,34,34)']],
 [],
 [['font-size', '2.76923em']],
 [['font-family', '"FedraSansBookRegular"'],
  ['font-size', '2.07692em'],
  ['color', 'rgb(34,34,34)']],
 [['font-size', '1.69231em'], ['color', 'rgb(13,47,56)']],
 [['font', 'bold 1.23077em/1.125 Arial,Helvetica,sans-serif']],
 [],
 [],

In [135]:
# getStyles(css_texts[5])

In [101]:
dictToJSONFile("./css-metadata.json", css_metadata)
dictToJSONFile("./css-selectors.json", css_selectors)
dictToJSONFile("./css-styles.json", css_styles)

In [None]:
css_metadata_df = pd.DataFrame.from_dict(css_metadata)
css_metadata_df

In [None]:
ProfileReport(css_metadata_df, title="CSS Metadata Report", explorative=True).to_file("./profile-reports/css-metadata-report.html")

In [None]:
css_selectors_df = pd.DataFrame.from_dict(css_selectors)
css_selectors_df

In [None]:
ProfileReport(css_selectors_df, title="CSS Selectors Report", explorative=True).to_file("./profile-reports/css-selectors-report.html")