In [261]:
import os
import shutil
import re
import json
import pandas as pd
from pandas_profiling import ProfileReport
import rcssmin

In [262]:
def getFileByExtension(extension_name):
    if os.path.isdir("minified-dataset"):
        shutil.rmtree("minified-dataset", ignore_errors=True)

    os.mkdir("minified-dataset")
    
    valid_files = []

    for filePath, dirs, files in os.walk("original-dataset"):
        minifiedStartingPath = filePath.replace("original-dataset", "")

        for directory in dirs:
            os.mkdir(f"minified-dataset\\{minifiedStartingPath}\\{directory}")

        for file in files:
            if file[-len(extension_name):] == extension_name:
                valid_files.append((os.path.join(filePath, file), 
                    os.path.join(f"minified-dataset{minifiedStartingPath}", file)))

                with open(os.path.join(filePath, file), "rb") as f:
                    file_text = f.read().decode(errors="replace")

                with open(f"minified-dataset\\{minifiedStartingPath}\\{file}", "w") as f:
                    f.write(rcssmin.cssmin(file_text))
                

        # print(filePath)
        # print(dirs)
        # print(files)

    return valid_files


In [263]:
os.getcwd()

'c:\\Users\\mattg_3roa89k\\Documents\\Uni\\CSS1\\AT3\\css-refactor-tool'

In [264]:
css_files = getFileByExtension(".css")
# print(css_files)
print(len(css_files))
css_files[0]

552


('original-dataset\\9292.nl\\index_files\\print.min.css',
 'minified-dataset\\9292.nl\\index_files\\print.min.css')

In [265]:
with open("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", "rb") as file:
    test_file = file.read().decode(errors="replace")

### EDA Questions:

- File sizes
- Number of lines
- Number of selectors (tags, classes, ids)
- Number of media queries
- Number of keyframes
- Number of styles
- Number of styles per selector
- Number of valid styles


In [266]:
def dictToJSONFile(filePath, dictionary):
    with open(filePath, 'w') as fp:
        json.dump(dictionary, fp, indent=2)

Causes of errors:
- Invalid CSS (e.g. original-dataset\Alibaba\css\fe397ac3e7d34d455bd1114d623b1e62aa8ee51c.css)
    - filter: (filter attribute has no value and semicolon)
- Using base64 image backgrounds (e.g. original-dataset\Amazon.ca\css\d96313390c86b5833d30fbe6a5ed905fd3a0ea05.css)
    - the semi colon in the base64 image string causes the url to be seperated: background-image:url(data:image/gif;base64,R0lGODlhBwAKAMIEAObm5uvr6/Dw8P39/f///////////////yH5BAEKAAcALAAAAAAHAAoAAAMWSDPUGoE5AaIj1M4qMW+ZFDYD1ClnAgA7);

In [267]:
re.escape("/*")

'/\\*'

In [268]:
with open("./html_tags.json", "r") as f:
  html_tags = json.load(f)

def getFileSize(filePath):
    return os.path.getsize(filePath)

def getNumLines(css_text):
    return css_text.count("\n")+1

def removeComments(css_text):
    escaped_backlash = re.escape("\n")
    css_text = re.sub(f'{re.escape("/*")}.*?{re.escape("*/")}', '', css_text)
    css_text = re.sub(f'{re.escape("//")}.*?{escaped_backlash}', '', css_text)

    return css_text

def getSelectorsBySymbol(symbol, css_text):
    css_text = removeComments(css_text)

    selectors = set([])

    isSelector = False
    selector_name = ""

    for char in css_text:
        if isSelector:
            isSelector = not char in set([".", "#", ",", "{", "\n", ":", "[", "/", "\\", '"'])

            if isSelector:
                selector_name = f"{selector_name}{char}"

            if char.isdigit():
                isSelector = False
                selector_name = ""
            
            # print(char, selector_name)
        else:
            selectors.add(selector_name.strip())
            selector_name = ""
            isSelector = char == symbol

    if "" in selectors:
        selectors.remove("")

    return list(selectors)

def getSelectors(css_text):
    num_tags = sum(css_text.count(tag) for tag in html_tags)

    css_classes = getSelectorsBySymbol(".", css_text)
    css_ids = getSelectorsBySymbol("#", css_text)

    return css_classes, css_ids, {
        "num_tags": num_tags,
        "num_classes": len(css_classes),
        "num_ids": len(css_ids),
        "num_media_queries": css_text.count("@media"),
        "num_keyframes": css_text.count("@keyframes")
    }

In [305]:
def removeEmptyString(theList):
        return [x for x in theList if x != ""]

def splitBySemiColon(styles):
    isInsideParenthesis = False
    splitStyles = []
    currentStyle = ""
    hasColon = False

    for i, char in enumerate(styles):
        if not isInsideParenthesis and (char == ";" or i == len(styles)-1):
            if  i == len(styles)-1:
                currentStyle = f"{currentStyle}{char}"
            
            # print(i, currentStyle, len(splitStyles), char == ";", i == len(styles)-1)

            if hasColon:
                splitStyles.append(currentStyle)
            else:
                try:
                    splitStyles[len(splitStyles)-1] += currentStyle
                except:
                    return []

            currentStyle = ""
            hasColon = False
            
            continue

        if char == ":":
            hasColon = True

        if char == "(":
            isInsideParenthesis = True
        elif char == ")":
            isInsideParenthesis = False

        currentStyle = f"{currentStyle}{char}"

    return splitStyles

def splitByColon(styles):
    return list(map(lambda style : removeEmptyString(style.split(":", 1)), styles))

def getStyles(css_text):
    css_text = removeComments(css_text)
    styles = re.findall(r'\{\s*(.*?)\s*\}', css_text, flags=re.DOTALL) # list of substrings between { and }
    # print("s", styles)

    stylesSplitSemicolon = list(map(splitBySemiColon, styles))
    # print(stylesSplitSemicolon)
    stylesSplitColon = list(map(splitByColon, stylesSplitSemicolon))

    return stylesSplitColon

In [270]:
test = "html{color:rgb(0,0,0);background:rgb(255,255,255) none repeat scroll 0% 0%}"
getStyles(test)

[[['color', 'rgb(0,0,0)'],
  ['background', 'rgb(255,255,255) none repeat scroll 0% 0']]]

In [271]:
class StyleSyntaxException(Exception):
    def __init__(self, prev, current):
        self.prev = prev
        self.current = current

def getNumStyles(stylesheet):
    style_attributes = {}
    style_values = {}

    def addStyleToDict(dictionary, style):
        if style in dictionary:
            # some files had an error where there would be a random semi-colon in the attribute value
            # e.g. color: red; !important; this adds it to the previous line
            dictionary[style] += 1
        else:
            dictionary[style] = 1

    for styles in stylesheet:
        try:
            y = None

            for x in styles:
                attr, value = x
                addStyleToDict(style_attributes, attr)
                addStyleToDict(style_values, value)
                y = x
        except:
            # print("prev", y)
            # print("err", x)
            raise StyleSyntaxException(y, x)

    return style_attributes, style_values

In [272]:
# with open('test.txt', 'w') as f:
#     f.write(rcssmin.cssmin(test_file))

In [306]:
css_texts = []
css_metadata = []
css_selectors = []
css_styles = []
file_errors = []
css_files_length = len(css_files)

for i, (css_file_path, css_minified_path) in enumerate(css_files):
    try:
        # print(css_minified_path)

        with open(css_file_path, "rb") as file:
            css_text = file.read().decode(errors="replace")

        with open(css_minified_path, "rb") as file:
            minified_css = file.read().decode(errors="replace")
        
        css_texts.append((css_text, minified_css))

        css_classes, css_ids, numSelectors = getSelectors(minified_css)

        styles = getStyles(minified_css.replace("\n", " "))

        css_selectors.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "classes": css_classes,
            "ids": css_ids
        })

        css_styles.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "styles": styles
        })

        metadata = {
            "id": i,
            "cssFile": css_file_path,
            "fileSize": getFileSize(css_file_path),
            "numLines": getNumLines(css_text),
            "numStyles": getNumStyles(styles)
        }

        metadata.update(numSelectors)

        css_metadata.append(metadata)
    except StyleSyntaxException as err:
        print(css_file_path, err.prev, err.current)
        file_errors.append(css_file_path)
        # print("err:", err)

    print(f"File: {i+1} / {css_files_length} | Number of Errors: {len(file_errors)}", end="\r")

print("\n", file_errors[:3])


File: 552 / 552 | Number of Errors: 0
 []


In [None]:
# print(css_metadata)
# print(css_texts[2])

In [None]:
css_files[4][0]

'original-dataset\\9292.nl\\results\\run2\\print.min.css'

In [None]:
x = "height: 50px"
x.split(";")

['height: 50px']

In [307]:
# getStyles(css_texts[2])
getStyles(css_texts[4][1])

[[['color', 'rgb(0,0,0)'],
  ['background', 'rgb(255,255,255) none repeat scroll 0% 0%']],
 [['margin', '0px'], ['padding', '0px']],
 [['border-collapse', 'collapse'], ['border-spacing', '0px']],
 [['border', '0px none']],
 [['font-style', 'normal'], ['font-weight', 'normal']],
 [['list-style', 'outside none none']],
 [['text-align', 'left']],
 [['font-size', '100%'], ['font-weight', 'normal']],
 [['content', '""']],
 [['border', '0px none'], ['font-variant', 'normal']],
 [['vertical-align', 'text-top']],
 [['vertical-align', 'text-bottom']],
 [['font-family', 'inherit'],
  ['font-size', 'inherit'],
  ['font-weight', 'inherit']],
 [['font-size', '100%']],
 [],
 [['font-family', '"FedraSansBookRegular"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [['font-family', '"FedraSansMedium"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [['height', '100%']],
 [['min-width', '994px']],
 [['font', '81.25%/1.125 Arial,Helvetica,sans-serif'],
  ['color', 'rgb(34,34,34

In [None]:
# getStyles(css_texts[5])

In [None]:
dictToJSONFile("./css-metadata.json", css_metadata)
dictToJSONFile("./css-selectors.json", css_selectors)
dictToJSONFile("./css-styles.json", css_styles)

In [None]:
css_metadata_df = pd.DataFrame.from_dict(css_metadata)
css_metadata_df

Unnamed: 0,id,cssFile,fileSize,numLines,numStyles,num_tags,num_classes,num_ids,num_media_queries,num_keyframes
0,0,original-dataset\9292.nl\index_files\print.min...,12203,1,"({'color': 8, 'margin': 11, 'border-collapse':...",2803,88,23,0,0
1,1,original-dataset\9292.nl\index_files\styles.mi...,172314,9,"({'position': 106, 'clip': 3, 'margin': 39, 'p...",39138,1019,75,0,0
2,2,original-dataset\9292.nl\results\run1\print.mi...,13791,159,"({'color': 7, 'margin': 11, 'border-collapse':...",2854,87,19,0,0
3,3,original-dataset\9292.nl\results\run1\styles.m...,194633,1949,"({'position': 106, 'margin': 40, 'padding': 93...",39688,999,32,0,0
4,4,original-dataset\9292.nl\results\run2\print.mi...,13791,159,"({'color': 7, 'margin': 11, 'border-collapse':...",2854,87,19,0,0
...,...,...,...,...,...,...,...,...,...,...
547,547,original-dataset\Wordpress.org\results\run1\b0...,35651,183,"({'font-family': 3, 'font-style': 2, 'font-wei...",4073,169,1,0,0
548,548,original-dataset\Wordpress.org\results\run2\30...,105,3,"({}, {})",23,1,2,0,0
549,549,original-dataset\Wordpress.org\results\run2\54...,31796,309,"({'margin': 23, 'font-style': 3, 'font-size': ...",6219,126,62,7,0
550,550,original-dataset\Wordpress.org\results\run2\9f...,1681,37,"({'font-family': 6, 'font-style': 6, 'font-wei...",257,2,0,0,0


In [None]:
ProfileReport(css_metadata_df, title="CSS Metadata Report", explorative=True).to_file("./profile-reports/css-metadata-report.html")

Summarize dataset: 100%|██████████| 87/87 [00:06<00:00, 13.40it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 90.91it/s]


In [None]:
css_selectors_df = pd.DataFrame.from_dict(css_selectors)
css_selectors_df

Unnamed: 0,id,cssFile,minifiedFile,classes,ids
0,0,original-dataset\9292.nl\index_files\print.min...,minified-dataset\9292.nl\index_files\print.min...,"[four-cols figure, hidden, divided, bold, main...","[ui-datepicker-div, FFF}body, transport-on-req..."
1,1,original-dataset\9292.nl\index_files\styles.mi...,minified-dataset\9292.nl\index_files\styles.mi...,"[ns-operator-icon, vert-rect, secondary-button...","[advice-section p, fff url(bg-main-right-col, ..."
2,2,original-dataset\9292.nl\results\run1\print.mi...,minified-dataset\9292.nl\results\run1\print.mi...,"[four-cols figure, hidden, divided, bold, main...","[ui-datepicker-div, transport-on-request ul li..."
3,3,original-dataset\9292.nl\results\run1\styles.m...,minified-dataset\9292.nl\results\run1\styles.m...,"[ns-operator-icon, vert-rect, secondary-button...","[advice-section p, cookie-consent-popup p, reg..."
4,4,original-dataset\9292.nl\results\run2\print.mi...,minified-dataset\9292.nl\results\run2\print.mi...,"[four-cols figure, hidden, divided, bold, main...","[ui-datepicker-div, transport-on-request ul li..."
...,...,...,...,...,...
547,547,original-dataset\Wordpress.org\results\run1\b0...,minified-dataset\Wordpress.org\results\run1\b0...,"[dashicons-format-status, dashicons-welcome-vi...",[dashicons]
548,548,original-dataset\Wordpress.org\results\run2\30...,minified-dataset\Wordpress.org\results\run2\30...,[current],"[wporg-header ul li, wporg-header ul li a]"
549,549,original-dataset\Wordpress.org\results\run2\54...,minified-dataset\Wordpress.org\results\run2\54...,"[community, download-button-disabled, publishe...","[download-mobile span, home-welcome img, showc..."
550,550,original-dataset\Wordpress.org\results\run2\9f...,minified-dataset\Wordpress.org\results\run2\9f...,"[googleusercontent, ttf]",[]


In [None]:
ProfileReport(css_selectors_df, title="CSS Selectors Report", explorative=True).to_file("./profile-reports/css-selectors-report.html")

Summarize dataset: 100%|██████████| 19/19 [00:00<00:00, 65.07it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.62it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 333.28it/s]
