In [1]:
import os
import shutil
import re
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import rcssmin

In [2]:
def getFileByExtension(extension_name):
    if os.path.isdir("minified-dataset"):
        shutil.rmtree("minified-dataset", ignore_errors=True)

    os.mkdir("minified-dataset")
    
    valid_files = []

    for filePath, dirs, files in os.walk("original-dataset"):
        minifiedStartingPath = filePath.replace("original-dataset", "")

        for directory in dirs:
            os.mkdir(f"minified-dataset\\{minifiedStartingPath}\\{directory}")

        for file in files:
            if file[-len(extension_name):] == extension_name:
                valid_files.append((os.path.join(filePath, file), 
                    os.path.join(f"minified-dataset{minifiedStartingPath}", file)))

                with open(os.path.join(filePath, file), "rb") as f:
                    file_text = f.read().decode(errors="replace")

                with open(f"minified-dataset\\{minifiedStartingPath}\\{file}", "w") as f:
                    f.write(rcssmin.cssmin(file_text))
                

        # print(filePath)
        # print(dirs)
        # print(files)

    return valid_files


In [3]:
os.getcwd()

'c:\\Users\\mattg_3roa89k\\Documents\\Uni\\CSS1\\AT3\\css-refactor-tool'

In [4]:
css_files = getFileByExtension(".css")
# print(css_files)
print(len(css_files))
css_files[0]

552


('original-dataset\\9292.nl\\index_files\\print.min.css',
 'minified-dataset\\9292.nl\\index_files\\print.min.css')

In [5]:
with open("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", "rb") as file:
    test_file = file.read().decode(errors="replace")

### EDA Questions:

- File sizes
- Number of lines
- Number of selectors (tags, classes, ids)
- Number of media queries
- Number of keyframes
- Number of styles
- Number of styles per selector
- Number of valid styles


In [6]:
def dictToJSONFile(filePath, dictionary):
    with open(filePath, 'w') as fp:
        json.dump(dictionary, fp, indent=2)

Causes of errors:
- Invalid CSS (e.g. original-dataset\Alibaba\css\fe397ac3e7d34d455bd1114d623b1e62aa8ee51c.css)
    - filter: (filter attribute has no value and semicolon)
- Using base64 image backgrounds (e.g. original-dataset\Amazon.ca\css\d96313390c86b5833d30fbe6a5ed905fd3a0ea05.css)
    - the semi colon in the base64 image string causes the url to be seperated: background-image:url(data:image/gif;base64,R0lGODlhBwAKAMIEAObm5uvr6/Dw8P39/f///////////////yH5BAEKAAcALAAAAAAHAAoAAAMWSDPUGoE5AaIj1M4qMW+ZFDYD1ClnAgA7);

In [7]:
re.escape("/*")

'/\\*'

In [8]:
with open("./html_tags.json", "r") as f:
  html_tags = json.load(f)

def getFileSize(filePath):
    return os.path.getsize(filePath)

def getNumLines(css_text):
    return css_text.count("\n")+1

def removeComments(css_text):
    escaped_backlash = re.escape("\n")
    css_text = re.sub(f'{re.escape("/*")}.*?{re.escape("*/")}', '', css_text)
    css_text = re.sub(f'{re.escape("//")}.*?{escaped_backlash}', '', css_text)

    return css_text

def getSelectorsBySymbol(symbol, css_text):
    css_text = removeComments(css_text)

    selectors = set([])

    isSelector = False
    selector_name = ""

    for char in css_text:
        if isSelector:
            isSelector = not char in set([" ", ".", "#", ",", "{", "\n", ":", "[", "/", "\\", 
                '"', ">", "+", "~"])

            if isSelector:
                selector_name = f"{selector_name}{char}"

            if char.isdigit():
                isSelector = False
                selector_name = ""
            
            # print(char, selector_name)
        else:
            selectors.add(selector_name.strip())
            selector_name = ""
            isSelector = char == symbol

    if "" in selectors:
        selectors.remove("")

    return list(selectors)

def getSelectors(css_text):
    num_tags = sum(css_text.count(tag) for tag in html_tags)

    css_classes = getSelectorsBySymbol(".", css_text)
    css_ids = getSelectorsBySymbol("#", css_text)

    return css_classes, css_ids, {
        "num_tags": num_tags,
        "num_classes": len(css_classes),
        "num_ids": len(css_ids),
        "num_rules": css_text.count("@"),
        "num_media_queries": css_text.count("@media"),
        "num_keyframes": css_text.count("@keyframes")
    }

In [9]:
def removeEmptyString(theList):
        return [x for x in theList if x != ""]

def splitBySemiColon(styles):
    isInsideParenthesis = False
    splitStyles = []
    currentStyle = ""
    hasColon = False

    for i, char in enumerate(styles):
        if not isInsideParenthesis and (char == ";" or i == len(styles)-1):
            if  i == len(styles)-1:
                currentStyle = f"{currentStyle}{char}"
            
            # print(i, currentStyle, len(splitStyles), char == ";", i == len(styles)-1)

            if hasColon:
                splitStyles.append(currentStyle)
            else:
                try:
                    splitStyles[len(splitStyles)-1] += currentStyle
                except:
                    return []

            currentStyle = ""
            hasColon = False
            
            continue

        if char == ":":
            hasColon = True

        if char == "(":
            isInsideParenthesis = True
        elif char == ")":
            isInsideParenthesis = False

        currentStyle = f"{currentStyle}{char}"

    return splitStyles

def splitByColon(styles):
    return list(map(lambda style : removeEmptyString(style.split(":", 1)), styles))

def getStyles(css_text):
    css_text = removeComments(css_text)
    styles = re.findall(r'\{\s*(.*?)\s*\}', css_text, flags=re.DOTALL) # list of substrings between { and }
    # print("s", styles)

    stylesSplitSemicolon = list(map(splitBySemiColon, styles))
    # print(stylesSplitSemicolon)
    stylesSplitColon = list(map(splitByColon, stylesSplitSemicolon))

    return stylesSplitColon

In [10]:
test = "html{color:rgb(0,0,0);background:rgb(255,255,255) none repeat scroll 0% 0%}"
getStyles(test)

[[['color', 'rgb(0,0,0)'],
  ['background', 'rgb(255,255,255) none repeat scroll 0% 0%']]]

In [11]:
class StyleSyntaxException(Exception):
    def __init__(self, prev, current):
        self.prev = prev
        self.current = current

def getNumStyles(stylesheet):
    style_attributes = {}
    style_values = {}

    def addStyleToDict(dictionary, style):
        if style in dictionary:
            # some files had an error where there would be a random semi-colon in the attribute value
            # e.g. color: red; !important; this adds it to the previous line
            dictionary[style] += 1
        else:
            dictionary[style] = 1

    for styles in stylesheet:
        try:
            y = None

            for x in styles:
                attr, value = x
                addStyleToDict(style_attributes, attr)
                addStyleToDict(style_values, value)
                y = x
        except:
            # print("prev", y)
            # print("err", x)
            raise StyleSyntaxException(y, x)

    return style_attributes, style_values

In [12]:
# with open('test.txt', 'w') as f:
#     f.write(rcssmin.cssmin(test_file))

In [13]:
css_texts = []
css_metadata = []
css_selectors = []
css_styles = []
file_errors = []
css_files_length = len(css_files)

for i, (css_file_path, css_minified_path) in enumerate(css_files):
    try:
        # print(css_minified_path)

        with open(css_file_path, "rb") as file:
            css_text = file.read().decode(errors="replace")

        with open(css_minified_path, "rb") as file:
            minified_css = file.read().decode(errors="replace")
        
        css_texts.append((css_text, minified_css))

        css_classes, css_ids, numSelectors = getSelectors(minified_css)

        styles = getStyles(minified_css.replace("\n", " "))

        css_selectors.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "classes": css_classes,
            "ids": css_ids
        })

        css_styles.append({
            "id": i,
            "cssFile": css_file_path,
            "minifiedFile": css_minified_path,
            "styles": styles
        })

        metadata = {
            "id": i,
            "cssFile": css_file_path,
            "fileSize": getFileSize(css_file_path),
            "numLines": getNumLines(css_text),
            "numStyles": getNumStyles(styles)
        }

        metadata.update(numSelectors)

        css_metadata.append(metadata)
    except StyleSyntaxException as err:
        print(css_file_path, err.prev, err.current)
        file_errors.append(css_file_path)
        # print("err:", err)

    print(f"File: {i+1} / {css_files_length} | Number of Errors: {len(file_errors)}", end="\r")

print("\n", file_errors[:3])


File: 552 / 552 | Number of Errors: 0
 []


In [14]:
# print(css_metadata)
# print(css_texts[2])

In [15]:
css_files[4][0]

'original-dataset\\9292.nl\\results\\run2\\print.min.css'

In [16]:
x = "height: 50px"
x.split(";")

['height: 50px']

In [17]:
# getStyles(css_texts[2])
getStyles(css_texts[4][1])

[[['color', 'rgb(0,0,0)'],
  ['background', 'rgb(255,255,255) none repeat scroll 0% 0%']],
 [['margin', '0px'], ['padding', '0px']],
 [['border-collapse', 'collapse'], ['border-spacing', '0px']],
 [['border', '0px none']],
 [['font-style', 'normal'], ['font-weight', 'normal']],
 [['list-style', 'outside none none']],
 [['text-align', 'left']],
 [['font-size', '100%'], ['font-weight', 'normal']],
 [['content', '""']],
 [['border', '0px none'], ['font-variant', 'normal']],
 [['vertical-align', 'text-top']],
 [['vertical-align', 'text-bottom']],
 [['font-family', 'inherit'],
  ['font-size', 'inherit'],
  ['font-weight', 'inherit']],
 [['font-size', '100%']],
 [],
 [['font-family', '"FedraSansBookRegular"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [['font-family', '"FedraSansMedium"'],
  ['font-style', 'normal'],
  ['font-weight', 'normal']],
 [['height', '100%']],
 [['min-width', '994px']],
 [['font', '81.25%/1.125 Arial,Helvetica,sans-serif'],
  ['color', 'rgb(34,34,34

In [37]:
totalProperties = {}
totalValues = {}
totalStyles = {}

numPropRepititionsPerFile = np.array([])
numValueRepititionsPerFile = np.array([])

def addToDictCount(combinedDict, oldDict):
  for key in oldDict:
    if key in combinedDict:
      combinedDict[key] += oldDict[key]
    else:
      combinedDict[key] = oldDict[key]

for metadata in css_metadata:
  properties, values = metadata["numStyles"]

  numPropRepititionsPerFile = np.append(numPropRepititionsPerFile, sum(list(properties.values())) - len(properties))
  numValueRepititionsPerFile = np.append(numValueRepititionsPerFile, sum(list(values.values())) - len(values))

  addToDictCount(totalProperties, properties)
  addToDictCount(totalValues, values)

for styles in css_styles:
  styles = styles["styles"]
  
  for selectorStyles in styles:
    for style in selectorStyles:
      style = f"{style[0]}:{style[1]}"

      if style in totalStyles:
        totalStyles[style] += 1
      else:
        totalStyles[style] = 1

totalProperties = {
  "property": list(totalProperties.keys()),
  "frequency": list(totalProperties.values())
}

totalValues = {
  "values": list(totalValues.keys()),
  "frequency": list(totalValues.values())
}

totalStyles = {
  "styles": list(totalStyles.keys()),
  "frequency": list(totalStyles.values())
}



In [35]:
print(round(numPropRepititionsPerFile.mean(), 2))
print(round(numValueRepititionsPerFile.mean(), 2))
print()
print(round(np.median(numPropRepititionsPerFile), 2))
print(round(np.median(numValueRepititionsPerFile), 2))
print()
print(round(np.quantile(numPropRepititionsPerFile, 0.95), 2))
print(round(np.quantile(numValueRepititionsPerFile, 0.95), 2))
print()
print(round(np.std(numPropRepititionsPerFile), 2))
print(round(np.std(numValueRepititionsPerFile), 2))

1211.77
949.89

400.5
278.5

5019.7
4098.3

1755.85
1449.38


In [38]:
totalStyles

{'styles': ['color:#000',
  'background:#FFF',
  'margin:0',
  'padding:0',
  'border-collapse:collapse',
  'border-spacing:0',
  'border:0',
  'font-style:normal',
  'font-weight:normal',
  'list-style:none',
  'text-align:left',
  'font-size:100%',
  "content:''",
  'font-variant:normal',
  'vertical-align:text-top',
  'vertical-align:text-bottom',
  'font-family:inherit',
  'font-size:inherit',
  'font-weight:inherit',
  "font-family:'FedraSansBookRegular'",
  'src:url("/static/fonts/fesabo__-webfont.eot")',
  'src:url("/static/fonts/fesabo__-webfont.eot?#iefix") format("embedded-opentype"),url("/static/fonts/fesabo__-webfont.woff") format("woff"),url("/static/fonts/fesabo__-webfont.ttf") format("truetype"),url("/static/fonts/fesabo__-webfont.svg#FedraSansBookRegular") format("svg")',
  "font-family:'FedraSansMedium'",
  'src:url("/static/fonts/fesamd__.eot")',
  'src:url("/static/fonts/fesamd__.eot?#iefix") format("embedded-opentype"),url("/static/fonts/fesamd__.woff") format("woff

In [40]:
numStyleRepititions = np.array(totalStyles["frequency"])
print(round(numStyleRepititions.mean(), 2))
print(round(np.median(numStyleRepititions), 2))
print(round(np.quantile(numStyleRepititions, 0.95), 2))
print(round(np.std(numStyleRepititions), 2))

15.86
2.0
30.0
198.7


In [47]:
styleAndFreq = np.vstack((totalStyles["styles"], totalStyles["frequency"])).T
styleAndFreq

array([['color:#000', '520'],
       ['background:#FFF', '43'],
       ['margin:0', '2460'],
       ...,
       ['content:"%uF471"', '2'],
       ['content:"%uF470"', '2'],
       ['content:"%uF328"', '2']], dtype='<U26009')

In [58]:
pd.DataFrame(sorted(styleAndFreq.tolist(), key=lambda x: int(x[1]), reverse=True)[:10], columns=["Style", "Frequency"])

Unnamed: 0,Style,Frequency
0,display:block,15871
1,position:absolute,13847
2,float:left,13678
3,position:relative,12453
4,display:none,10267
5,overflow:hidden,9113
6,display:inline-block,7892
7,font-weight:bold,7769
8,margin:0px,7615
9,padding:0px,7142


In [42]:
totalPropertiesDf = pd.DataFrame.from_dict(totalProperties)
totalValuesDf = pd.DataFrame.from_dict(totalValues)
totalStylesDf = pd.DataFrame.from_dict(totalStyles)

In [44]:
ProfileReport(totalPropertiesDf, title="Style Properties Report", explorative=True).to_file("./profile-reports/style-properties-report.html")
ProfileReport(totalValuesDf, title="Style values Report", explorative=True).to_file("./profile-reports/style-values-report.html")
ProfileReport(totalStylesDf, title="Freq Styles Report", explorative=True).to_file("./profile-reports/styles-freq-report.html")

Summarize dataset: 100%|██████████| 16/16 [00:00<00:00, 67.13it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.27it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 13.62it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 500.75it/s]
Summarize dataset: 100%|██████████| 16/16 [00:02<00:00,  5.47it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 12.98it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 500.22it/s]
Summarize dataset: 100%|██████████| 16/16 [00:04<00:00,  3.58it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 12.97it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 500.27it/s]


In [25]:
dictToJSONFile("./css-metadata.json", css_metadata)
dictToJSONFile("./css-selectors.json", css_selectors)
dictToJSONFile("./css-styles.json", css_styles)

In [26]:
css_metadata_df = pd.DataFrame.from_dict(css_metadata)
# css_metadata_df

In [27]:
ProfileReport(css_metadata_df, title="CSS Metadata Report", explorative=True).to_file("./profile-reports/css-metadata-report.html")

Summarize dataset: 100%|██████████| 105/105 [00:07<00:00, 13.41it/s, Completed]                                  
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 71.43it/s]


In [28]:
css_selectors_df = pd.DataFrame.from_dict(css_selectors)
# css_selectors_df

In [29]:
ProfileReport(css_selectors_df, title="CSS Selectors Report", explorative=True).to_file("./profile-reports/css-selectors-report.html")

Summarize dataset: 100%|██████████| 19/19 [00:00<00:00, 63.77it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 10.10it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 992.26it/s]
