In [138]:
import os
import json
import re
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [139]:
with open("./html_tags.json", "r") as f:
  html_tags = json.load(f)

def getFileSize(filePath):
    return os.path.getsize(filePath)

def getNumLines(css_text):
    return css_text.count("\n")+1

def removeComments(css_text):
    escaped_backlash = re.escape("\n")
    css_text = re.sub(f'{re.escape("/*")}.*?{re.escape("*/")}', '', css_text)
    css_text = re.sub(f'{re.escape("//")}.*?{escaped_backlash}', '', css_text)

    return css_text

def getSelectorsBySymbol(symbol, css_text):
    css_text = removeComments(css_text)

    selectors = set([])

    isSelector = False
    selector_name = ""

    for char in css_text:
        if isSelector:
            isSelector = not char in set([" ", ".", "#", ",", "{", "\n", ":", "[", "/", "\\", 
                '"', ">", "+", "~"])

            if isSelector:
                selector_name = f"{selector_name}{char}"

            if char.isdigit():
                isSelector = False
                selector_name = ""
            
            # print(char, selector_name)
        else:
            selectors.add(selector_name.strip())
            selector_name = ""
            isSelector = char == symbol

    if "" in selectors:
        selectors.remove("")

    return list(selectors)

def getSelectors(css_text):
    num_tags = sum(css_text.count(tag) for tag in html_tags)

    css_classes = getSelectorsBySymbol(".", css_text)
    css_ids = getSelectorsBySymbol("#", css_text)

    return css_classes, css_ids, {
        "num_tags": num_tags,
        "num_classes": len(css_classes),
        "num_ids": len(css_ids),
        "num_rules": css_text.count("@"),
        "num_media_queries": css_text.count("@media"),
        "num_keyframes": css_text.count("@keyframes")
    }

In [140]:
def removeEmptyString(theList):
        return [x for x in theList if x != ""]

def splitBySemiColon(styles):
    isInsideParenthesis = False
    splitStyles = []
    currentStyle = ""
    hasColon = False

    for i, char in enumerate(styles):
        if not isInsideParenthesis and (char == ";" or i == len(styles)-1):
            if  i == len(styles)-1:
                currentStyle = f"{currentStyle}{char}"
            
            # print(i, currentStyle, len(splitStyles), char == ";", i == len(styles)-1)

            if hasColon:
                splitStyles.append(currentStyle)
            else:
                try:
                    splitStyles[len(splitStyles)-1] += currentStyle
                except:
                    return []

            currentStyle = ""
            hasColon = False
            
            continue

        if char == ":":
            hasColon = True

        if char == "(":
            isInsideParenthesis = True
        elif char == ")":
            isInsideParenthesis = False

        currentStyle = f"{currentStyle}{char}"

    return splitStyles

# def splitByColon(styles):
#     return list(map(lambda style : removeEmptyString(style.split(":", 1)), styles))

def getStyles(css_text):
    css_text = removeComments(css_text)
    styles = re.findall(r'\{\s*(.*?)\s*\}', css_text, flags=re.DOTALL) # list of substrings between { and }
    # print("s", styles)

    stylesSplitSemicolon = list(map(splitBySemiColon, styles))
    # print(stylesSplitSemicolon)
    # stylesSplitColon = list(map(splitByColon, stylesSplitSemicolon))

    return stylesSplitSemicolon

In [141]:
test = "html{color:rgb(0,0,0);background:rgb(255,255,255) none repeat scroll 0% 0%}"
getStyles(test)

[['color:rgb(0,0,0)', 'background:rgb(255,255,255) none repeat scroll 0% 0%']]

In [142]:
with open("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", "rb") as file:
    test_file = file.read().decode(errors="replace")

In [143]:
def getFileStats(filePath, text):
    num_selectors = getSelectors(text)[2]
    return [getFileSize(filePath), num_selectors["num_tags"], num_selectors["num_classes"], num_selectors["num_ids"]]

In [144]:
getFileStats("original-dataset\\GlobalTVBC\\css\\23579edb1e12f906d2b83f522d1c27011dc1b1d8.css", test_file)

[25850, 5483, 69, 33]

In [145]:
originalStats = []
refactorStats = []

refactoredStyles = []
originalStyles = []

numErrors = 0

for refactored_file in os.listdir("refactored-files"):
    try:
        if refactored_file.endswith(".css"):
            original_file = refactored_file.replace("-", "\\")
            original_file = original_file.replace("\\", "-", 1)
            refactored_file = f"refactored-files\\{refactored_file}"
            # print(refactored_file)
            # print(original_file)

            with open(refactored_file, "rb") as file:
                refactored_text = file.read().decode(errors="replace")

            with open(original_file, "rb") as file:
                original_text = file.read().decode(errors="replace")

            refactoredStyles.extend(getStyles(refactored_text))
            originalStyles.extend(getStyles(original_text))

            originalStats.append(getFileStats(original_file, original_text))
            refactorStats.append(getFileStats(refactored_file, refactored_text))
    except:
        numErrors += 1

refactoredStyles = [j for sub in refactoredStyles for j in sub]
originalStyles = [j for sub in originalStyles for j in sub]

In [146]:
print(numErrors)

4


In [147]:
def getDescriptiveStats(df):
    stats = []

    for column in ["File Size", "Number of Tags", "Number of Classes", "Number of Ids"]:
        stats.append([round(df[column].mean(), 2), round(df[column].median(), 2)])

    return stats

In [148]:
originalStatsDf = pd.DataFrame(originalStats, columns=["File Size", "Number of Tags", "Number of Classes", "Number of Ids"])
originalStatsDf

Unnamed: 0,File Size,Number of Tags,Number of Classes,Number of Ids
0,172072,39138,822,63
1,46176,10325,185,114
2,1078,196,5,0
3,1078,196,5,0
4,1078,196,5,0
...,...,...,...,...
85,9306,2050,59,6
86,3562,710,20,2
87,13464,2836,43,29
88,12874,2728,43,19


In [149]:
refactoredStatsDf = pd.DataFrame(refactorStats, columns=["File Size", "Number of Tags", "Number of Classes", "Number of Ids"])
refactoredStatsDf

Unnamed: 0,File Size,Number of Tags,Number of Classes,Number of Ids
0,264055,60396,823,41
1,66715,14795,185,107
2,1200,213,5,0
3,1200,213,5,0
4,1200,213,5,0
...,...,...,...,...
85,12100,2700,59,6
86,4747,1043,20,2
87,16375,3375,43,27
88,16311,3432,43,19


In [150]:
getDescriptiveStats(originalStatsDf)

[[9417.4, 5053.5], [1982.62, 1044.5], [58.22, 25.0], [9.78, 2.0]]

In [151]:
getDescriptiveStats(refactoredStatsDf)

[[13234.9, 6441.0], [2734.79, 1278.0], [58.26, 25.5], [9.2, 2.0]]

In [153]:
ProfileReport(pd.DataFrame(refactoredStyles, columns=["Styles"]), title="Refactored Styles Report", 
    explorative=True).to_file("./profile-reports/refactored-styles.html")

ProfileReport(pd.DataFrame(originalStyles, columns=["Styles"]), title="Original Styles Report", 
    explorative=True).to_file("./profile-reports/original-styles.html")

NameError: name 'ProfileReport' is not defined