In [2]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
%matplotlib inline

#lib
from lib.clean_scrap_functions import split_url, clean_ext_publisher, clean_repo_publisher
# import lib.github_validator as gh_validator

#Env
from dotenv import load_dotenv

# os
import os

# time
import time

# warnings
import warnings
# NOTE(review): with no category given, this silences every warning raised by
# later cells (deprecations, pandas SettingWithCopyWarning, ...).
warnings.filterwarnings('ignore')

In [3]:
# Load variables via python-dotenv, then look up the API key in the environment.
# NOTE(review): the variable name repeats "_API_KEY" - confirm it matches the
# name actually defined for this project; if it does not, api_key is None.
load_dotenv()
api_key = os.environ.get("GITHUB_API_KEY_API_KEY")

In [4]:
import yaml

# Load the project configuration that the later cells read file paths from.
# Only a missing file gets the friendly message, and the error is re-raised so
# the notebook stops here instead of failing later with a NameError on
# `config`. Any other failure (malformed YAML, permissions, ...) propagates
# with its own traceback rather than being mislabelled as "not found".
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

### 1. Loading

In [None]:
# Read the three raw CSV inputs; their paths are stored under data -> raw in
# the loaded configuration.
raw_paths = config["data"]["raw"]
df_scraped = pd.read_csv(raw_paths["file_scraped"])
df_verified = pd.read_csv(raw_paths["file_verified"])
df_vulnerable = pd.read_csv(raw_paths["file_vulnerable"])
df_scraped.head()

In [None]:
# Report the (rows, columns) shape of every raw frame.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for name, frame in raw_files.items():
    print(f"Dimension of file '{name}': {frame.shape}")


In [None]:
# Per-column count of missing values in every raw frame.
for name, frame in raw_files.items():
    null_counts = frame.isna().sum()
    print(f"Null values of '{name}': {null_counts}")
    print()

In [None]:
# Count repeated values column by column for every raw frame.
for name, frame in raw_files.items():
    print(f"{name}")
    for column in frame.columns:
        n_repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{name}': {n_repeated}")
    print()

In [None]:
# Count distinct values column by column for every raw frame.
for name, frame in raw_files.items():
    print(f"{name}")
    for column in frame.columns:
        n_distinct = frame[column].nunique()
        print(f"Total unique values in '{column}' of '{name}': {n_distinct}")
    print()

### 2. Cleaning

#### Clean column names and remove columns

In [10]:
# Normalise the headers of each frame in place: trim surrounding whitespace,
# lower-case, and replace spaces with underscores.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for frame in raw_files.values():
    trimmed = frame.columns.str.strip()
    frame.columns = trimmed.str.lower().str.replace(" ", "_")

In [11]:
# NOTE: this cell is not re-runnable on its own output - the dropped columns
# no longer exist afterwards, so a second run raises KeyError.

# Scraped data: drop id/description (author's note: avoid overfitting for model
# training) and prefix every remaining column with "ext_".
df_scraped = (
    df_scraped
    .drop(columns=["id", "description"])
    .add_prefix("ext_")
)

# Verified data: install_count is dropped (author's note: it is not updated in
# this file); columns are renamed to the shared naming scheme.
df_verified = (
    df_verified
    .drop(columns=["install_count"])
    .rename(columns={
        "extension_name": "ext_name",
        "publisher": "repo_publisher",
        "source_code": "repository",
    })
)

# Vulnerable data: drop the repository name and the per-severity vulnerability
# name columns, then rename to the shared naming scheme.
df_vulnerable = (
    df_vulnerable
    .drop(columns=[
        "repository_name",
        "critical_vulnerability_names",
        "high_vulnerability_names",
        "medium_vulnerability_names",
        "low_vulnerability_names",
    ])
    .rename(columns={
        "extension_name": "ext_name",
        "repository_link": "repository",
    })
)


In [None]:
# Display the column index of each frame to verify the drop/rename step.
df_scraped.columns, df_verified.columns, df_vulnerable.columns

In [13]:
# Column sets used by the cleaning cells, keyed by dataset name.

# Columns passed as `subset` when dropping duplicate rows.
duplicated_subsets = dict(
    scraped=["ext_name", "ext_publisher", "ext_version", "ext_last_updated"],
    verified=["ext_name", "repo_publisher", "repository"],
    vulnerable=["ext_name", "repository"],
)

# Columns passed as `subset` when dropping rows with missing values.
null_subsets = dict(
    scraped=["ext_publisher"],
    verified=[],
    vulnerable=["ext_name"],
)

#### Drop duplicates after scraping

In [None]:
# Count repeated values per column in each frame after the rename step.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for name, frame in raw_files.items():
    print(f"{name}")
    for column in frame.columns:
        n_repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{name}': {n_repeated}")
    print()

In [None]:
# Preview only: the de-duplicated result is displayed, not assigned.
df_scraped.drop_duplicates(subset=duplicated_subsets["scraped"])

In [None]:
# Spot-check a single extension name in the scraped data.
df_scraped[df_scraped["ext_name"] == "C/C++ Extension Pack"]

In [None]:
# Spot-check the same extension name in the verified data.
df_verified[df_verified["ext_name"] == "C/C++ Extension Pack"]

In [None]:
# Spot-check the same extension name in the vulnerable data.
df_vulnerable[df_vulnerable["ext_name"] == "C/C++ Extension Pack"]

In [19]:
# Remove duplicate rows from each frame, comparing only the key columns
# registered for that dataset in duplicated_subsets.
scraped_keys = duplicated_subsets["scraped"]
verified_keys = duplicated_subsets["verified"]
vulnerable_keys = duplicated_subsets["vulnerable"]

df_scraped = df_scraped.drop_duplicates(subset=scraped_keys)
df_verified = df_verified.drop_duplicates(subset=verified_keys)
df_vulnerable = df_vulnerable.drop_duplicates(subset=vulnerable_keys)

In [None]:
# After de-duplication: per-column repeat counts, followed by the number of
# rows still duplicated on each dataset's key subset.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for name, frame in raw_files.items():
    print(f"{name}")
    for column in frame.columns:
        n_repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{name}': {n_repeated}")
    print()
    key_columns = duplicated_subsets[name]
    n_key_dupes = frame.duplicated(subset=key_columns).sum()
    print(f"Duplicated in '{name}' with subset '{key_columns}': {n_key_dupes} ")
    print()

#### Check null values and drop null

In [None]:
# Report the shape of each frame after duplicates were removed.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for name, frame in raw_files.items():
    print(f"Dimension of file '{name}': {frame.shape}")

In [None]:
# Per-column count of missing values in each de-duplicated frame.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for name, frame in raw_files.items():
    null_counts = frame.isna().sum()
    print(f"Null values of '{name}': {null_counts}")
    print()

In [23]:
# Drop scraped rows in which every column listed in null_subsets["scraped"]
# is missing.
scraped_required = null_subsets["scraped"]
df_scraped = df_scraped.dropna(how="all", subset=scraped_required)

Null values in `repository` are deliberately not dropped: a missing repository can be derived from `ext_name` and `repo_publisher`. This will be handled after merging (TODO).

In [24]:
# df_verified[~((df_verified["repository"].str.contains("https://git|https://www.git|github.com",na=False,regex=True)) | (df_verified["repository"].isna()))]

In [25]:
# Drop vulnerable rows in which every column listed in
# null_subsets["vulnerable"] is missing.
vulnerable_required = null_subsets["vulnerable"]
df_vulnerable = df_vulnerable.dropna(how="all", subset=vulnerable_required)

In [None]:
# Summarise dtypes and non-null counts after dropping null rows.
df_vulnerable.info()

Map values between `ext_publisher` and `repo_publisher`

In [None]:
# Frequency of each publisher value in the scraped data.
df_scraped["ext_publisher"].value_counts()

In [None]:
# Frequency of each publisher value in the verified data.
df_verified["repo_publisher"].value_counts()

In [29]:
# Build a shared "merge_publisher" column on both frames by running each
# publisher column through its project cleaning helper.
# NOTE(review): presumably the helpers normalise publisher names so the two
# sources can be joined - confirm against lib.clean_scrap_functions.
scraped_merge_key = df_scraped["ext_publisher"].apply(clean_ext_publisher)
verified_merge_key = df_verified["repo_publisher"].apply(clean_repo_publisher)
df_scraped["merge_publisher"] = scraped_merge_key
df_verified["merge_publisher"] = verified_merge_key

In [None]:
# Number of distinct publisher values in the scraped data.
df_scraped["ext_publisher"].nunique()

#### Combine raw df after cleaning 

In [None]:
# Preview the cleaned scraped frame before merging.
df_scraped.head()

In [None]:
# Preview the cleaned verified frame before merging.
df_verified.head()

In [None]:
# Preview the cleaned vulnerable frame before merging.
df_vulnerable.head()

In [34]:
# df_scraped = df_scraped[df_scraped["ext_rating"]>0]
# Left join: keep every verified row and attach vulnerability columns where
# both the extension name and the repository match.
df_ver_vul = df_verified.merge(
    df_vulnerable,
    how="left",
    on=["ext_name", "repository"],
)


In [None]:
# Shape of the verified + vulnerable join result.
df_ver_vul.shape

In [36]:
# df_verified[df_verified["ext_name"] == "C/C++ Extension Pack"]
# df_vulnerable[df_vulnerable["ext_name"] == "C/C++ Extension Pack"]
# df_ver_vul[df_ver_vul["ext_name"] == "C/C++ Extension Pack"]

In [37]:
# Inner join: keep only extensions present in both the scraped data and the
# verified/vulnerable data, matched on name and the cleaned publisher key.
merge_keys = ["ext_name", "merge_publisher"]
df_clean = df_scraped.merge(df_ver_vul, on=merge_keys, how="inner")

In [None]:
# Display the merged frame.
df_clean

In [39]:
# df_scraped[df_scraped ["ext_name"] == "C/C++ Extension Pack"]
# df_ver_vul[df_ver_vul ["ext_name"] == "C/C++ Extension Pack"]
# df_clean[df_clean ["ext_name"] == "C/C++ Extension Pack"]

#### Clean Git Url

In [40]:
# Keep rows whose repository matches the Git URL pattern, plus rows that have
# no repository at all (na=False makes the pattern test False for those, so
# they are re-admitted by the isna() mask).
git_url_pattern = "https://git|https://www.git|github.com"
has_git_url = df_clean["repository"].str.contains(git_url_pattern, na=False, regex=True)
repository_missing = df_clean["repository"].isna()

# Take an explicit copy of the filtered rows: the column assignment below then
# writes to an independent frame instead of a slice of the previous df_clean,
# which is what triggers pandas' SettingWithCopyWarning.
df_clean = df_clean[has_git_url | repository_missing].copy()

# NOTE(review): split_url also receives the missing (NaN) repositories kept
# above - confirm the helper tolerates them.
df_clean["repository"] = df_clean["repository"].apply(split_url)

# Persist the cleaned result to the path configured under data -> clean.
df_clean.to_csv(config["data"]["clean"]["file_scrap_cleaned"], index=False)

In [None]:
# Show the rows whose repository value matches the Git URL pattern.
df_clean[((df_clean["repository"].str.contains("https://git|https://www.git|github.com",na=False,regex=True)))]

In [None]:
# Show the rows that have no repository value.
df_clean[df_clean["repository"].isna()]

In [None]:
# Load the CSV configured as data -> raw -> file_ext_repo and show its shape.
df_raw = pd.read_csv(config["data"]["raw"]["file_ext_repo"])
df_raw.shape

scrap_cleaned -> repos_full