In [1]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
%matplotlib inline

#lib
from lib.clean_scrap_functions import split_url, clean_ext_publisher, clean_repo_publisher
import lib.github_validator as gh_validator

#Env
from dotenv import load_dotenv

# os
import os

# time
import time

# warnings
import warnings
warnings.filterwarnings('ignore')

  - '(\w+://)(.+@)*([\w\d\.]+)(:[\d]+){0,1}/*(.*)'


In [2]:
# Load variables from a local .env file, then read the GitHub token from the environment.
# NOTE(review): the variable name repeats the "_API_KEY" suffix — confirm it matches the .env entry.
load_dotenv()
api_key = os.environ.get("GITHUB_API_KEY_API_KEY")

In [3]:
import yaml

# Load the project configuration; later cells read every data path from `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported as "not found"; YAML syntax errors now surface
    # with their own traceback instead of being mislabelled by a bare `except`.
    print("Yaml configuration file not found!")
    # Stop here: continuing would leave `config` undefined and fail later with a NameError.
    raise

### 1. Loading

In [4]:
# Read the three raw CSV files whose paths are listed under data -> raw in the config.
raw_paths = config["data"]["raw"]
df_scraped, df_verified, df_vulnerable = (
    pd.read_csv(raw_paths[path_key])
    for path_key in ("file_scraped", "file_verified", "file_vulnerable")
)
df_scraped.head()

Unnamed: 0,name,id,publisher,version,description,categories,tags,install_count,rating,last_updated
0,Python,f1f59ae4-9318-4f3c-a9b5-81b2eaa5f8a5,ms-python,2025.11.2025072901,Python language support with extension access ...,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z
1,Pylance,364d2426-116a-433a-a5d8-a5098dc3afbd,ms-python,2025.7.100,"A performant, feature-rich language server for...",Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z
2,Jupyter,6c2f1801-1e7f-45b2-9b5c-7782f1e076e8,ms-toolsai,2025.7.2025073101,"Jupyter notebook support, interactive programm...",Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z
3,C/C++,690b692e-e8a9-493f-b802-8089d50ac1b2,ms-vscode,1.26.3,"C/C++ IntelliSense, debugging, and code browsing.",Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z
4,Python Debugger,4bd5d2c9-9d65-401a-b0b2-7498d9f17615,ms-python,2025.11.2025072901,Python Debugger extension using debugpy.,Debuggers,debugger;debuggers;debugpy;python,84033338,4.615385,2025-07-29T10:38:32.423Z


In [5]:
# Print the (rows, columns) shape of each raw frame.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for label in raw_files:
    frame = raw_files[label]
    print(f"Dimension of file '{label}': {frame.shape}")


Dimension of file 'scraped': (98300, 10)
Dimension of file 'verified': (51000, 5)
Dimension of file 'vulnerable': (33052, 12)


In [6]:
# Print the number of missing values per column for each raw frame.
for label, frame in raw_files.items():
    null_counts = frame.isna().sum()
    print(f"Null values of '{label}': {null_counts}")
    print()

Null values of 'scraped': name                 0
id                   0
publisher            1
version              0
description      11993
categories           0
tags             31388
install_count        0
rating               0
last_updated         0
dtype: int64

Null values of 'verified': Extension Name        0
Publisher             0
Verified              0
Install Count         0
Source Code       11025
dtype: int64

Null values of 'vulnerable': Extension Name                      5
Repository Link                     5
Repository Name                     0
Total Vulnerabilities               0
Critical                            0
High                                0
Medium                              0
Low                                 0
Critical Vulnerability Names    33052
High Vulnerability Names        32466
Medium Vulnerability Names      31388
Low Vulnerability Names         31155
dtype: int64



In [7]:
# For every column of every raw frame, count the values that repeat an earlier value.
for label, frame in raw_files.items():
    print(f"{label}")
    for column in frame.columns:
        repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{label}': {repeated}")
    print()

scraped
Duplicated in column 'name' of 'scraped': 18068
Duplicated in column 'id' of 'scraped': 16273
Duplicated in column 'publisher' of 'scraped': 39541
Duplicated in column 'version' of 'scraped': 95275
Duplicated in column 'description' of 'scraped': 29244
Duplicated in column 'categories' of 'scraped': 97509
Duplicated in column 'tags' of 'scraped': 66376
Duplicated in column 'install_count' of 'scraped': 83061
Duplicated in column 'rating' of 'scraped': 97486
Duplicated in column 'last_updated' of 'scraped': 16275

verified
Duplicated in column 'Extension Name' of 'verified': 1591
Duplicated in column 'Publisher' of 'verified': 12540
Duplicated in column 'Verified' of 'verified': 50998
Duplicated in column 'Install Count' of 'verified': 38880
Duplicated in column 'Source Code' of 'verified': 12916

vulnerable
Duplicated in column 'Extension Name' of 'vulnerable': 1333
Duplicated in column 'Repository Link' of 'vulnerable': 917
Duplicated in column 'Repository Name' of 'vulnerable

In [8]:
# For every column of every raw frame, count the distinct non-null values.
for label, frame in raw_files.items():
    print(f"{label}")
    for column in frame.columns:
        distinct = frame[column].nunique()
        print(f"Total unique values in '{column}' of '{label}': {distinct}")
    print()

scraped
Total unique values in 'name' of 'scraped': 80232
Total unique values in 'id' of 'scraped': 82027
Total unique values in 'publisher' of 'scraped': 58758
Total unique values in 'version' of 'scraped': 3025
Total unique values in 'description' of 'scraped': 69055
Total unique values in 'categories' of 'scraped': 791
Total unique values in 'tags' of 'scraped': 31923
Total unique values in 'install_count' of 'scraped': 15239
Total unique values in 'rating' of 'scraped': 814
Total unique values in 'last_updated' of 'scraped': 82025

verified
Total unique values in 'Extension Name' of 'verified': 49409
Total unique values in 'Publisher' of 'verified': 38460
Total unique values in 'Verified' of 'verified': 2
Total unique values in 'Install Count' of 'verified': 12120
Total unique values in 'Source Code' of 'verified': 38083

vulnerable
Total unique values in 'Extension Name' of 'vulnerable': 31718
Total unique values in 'Repository Link' of 'vulnerable': 32134
Total unique values in '

### 2. Cleaning

#### Clean column names and remove columns

In [9]:
# Normalise the headers of all three frames in place:
# trim surrounding whitespace, lower-case, and turn spaces into underscores.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for frame in raw_files.values():
    trimmed = frame.columns.str.strip()
    lowered = trimmed.str.lower()
    frame.columns = lowered.str.replace(" ", "_")

In [10]:
# scraped: drop id/description (avoid overfitting for model training),
# then prefix every remaining column with "ext_".
df_scraped = (
    df_scraped
    .drop(columns=["id", "description"])
    .add_prefix("ext_")
)

# verified: install_count of extension in this df is not updated, so drop it;
# rename the remaining columns so they line up with the other frames.
verified_renames = {
    "extension_name": "ext_name",
    "publisher": "repo_publisher",
    "source_code": "repository",
}
df_verified = (
    df_verified
    .drop(columns=["install_count"])
    .rename(columns=verified_renames)
)

# vulnerable: keep only the counts per severity, not the vulnerability name lists.
vulnerable_unused = [
    "repository_name",
    "critical_vulnerability_names",
    "high_vulnerability_names",
    "medium_vulnerability_names",
    "low_vulnerability_names",
]
vulnerable_renames = {"extension_name": "ext_name", "repository_link": "repository"}
df_vulnerable = (
    df_vulnerable
    .drop(columns=vulnerable_unused)
    .rename(columns=vulnerable_renames)
)


In [11]:
# Display the column index of each frame to confirm the drops and renames applied above.
df_scraped.columns, df_verified.columns, df_vulnerable.columns

(Index(['ext_name', 'ext_publisher', 'ext_version', 'ext_categories',
        'ext_tags', 'ext_install_count', 'ext_rating', 'ext_last_updated'],
       dtype='object'),
 Index(['ext_name', 'repo_publisher', 'verified', 'repository'], dtype='object'),
 Index(['ext_name', 'repository', 'total_vulnerabilities', 'critical', 'high',
        'medium', 'low'],
       dtype='object'))

In [12]:
# Lookup tables for the cleaning cells, keyed by frame label.

# Columns passed as `subset` when dropping duplicate rows from each frame.
duplicated_subsets = dict(
    scraped=["ext_name", "ext_publisher", "ext_version", "ext_last_updated"],
    verified=["ext_name", "repo_publisher", "repository"],
    vulnerable=["ext_name", "repository"],
)

# Columns passed as `subset` when dropping rows with missing values from each frame.
null_subsets = dict(
    scraped=["ext_publisher"],
    verified=[],
    vulnerable=["ext_name"],
)

#### Drop duplicates after scraping

In [13]:
# Baseline before de-duplication: per-column count of repeated values,
# now reported under the renamed columns.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for label, frame in raw_files.items():
    print(f"{label}")
    for column in frame.columns:
        repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{label}': {repeated}")
    print()

scraped
Duplicated in column 'ext_name' of 'scraped': 18068
Duplicated in column 'ext_publisher' of 'scraped': 39541
Duplicated in column 'ext_version' of 'scraped': 95275
Duplicated in column 'ext_categories' of 'scraped': 97509
Duplicated in column 'ext_tags' of 'scraped': 66376
Duplicated in column 'ext_install_count' of 'scraped': 83061
Duplicated in column 'ext_rating' of 'scraped': 97486
Duplicated in column 'ext_last_updated' of 'scraped': 16275

verified
Duplicated in column 'ext_name' of 'verified': 1591
Duplicated in column 'repo_publisher' of 'verified': 12540
Duplicated in column 'verified' of 'verified': 50998
Duplicated in column 'repository' of 'verified': 12916

vulnerable
Duplicated in column 'ext_name' of 'vulnerable': 1333
Duplicated in column 'repository' of 'vulnerable': 917
Duplicated in column 'total_vulnerabilities' of 'vulnerable': 32948
Duplicated in column 'critical' of 'vulnerable': 33051
Duplicated in column 'high' of 'vulnerable': 33017
Duplicated in colum

In [14]:
# Preview only: the de-duplicated frame is displayed but not assigned,
# so df_scraped itself is unchanged by this cell.
df_scraped.drop_duplicates(subset=duplicated_subsets["scraped"])

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated
0,Python,ms-python,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z
1,Pylance,ms-python,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z
2,Jupyter,ms-toolsai,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z
3,C/C++,ms-vscode,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z
4,Python Debugger,ms-python,2025.11.2025072901,Debuggers,debugger;debuggers;debugpy;python,84033338,4.615385,2025-07-29T10:38:32.423Z
...,...,...,...,...,...,...,...,...
98292,𝐖𝐀𝐓𝐂𝐇 𝐓𝐡𝐞 𝐁𝐚𝐭𝐦𝐚𝐧 𝐌𝐨𝐯𝐢𝐞 𝐆𝐨𝐨𝐠𝐥𝐞𝐃𝐫𝐢𝐯𝐞-𝐇𝐃,TheB-a-t-m-a-nSubEnglish,0.0.1,Other,__web_extension,0,0.000000,2022-04-11T17:10:47.08Z
98293,City Car Driving 125 Audio Dll High Quality,fuserpugzo,0.0.1,Other,,0,0.000000,2022-03-15T19:02:37.72Z
98295,Jurassic World: Il Dominio (2022) Streaming It...,add,0.9.9,Themes,__web_extension;color-theme;theme,0,0.000000,2022-04-07T15:09:15.58Z
98296,PortableAutodeskAutoCAD2010 fillory,nahavox,0.0.1,Other,,0,0.000000,2022-03-23T08:15:17.353Z


In [15]:
# Spot check: list every scraped row sharing this one extension name.
df_scraped[df_scraped["ext_name"] == "C/C++ Extension Pack"]

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated
18,C/C++ Extension Pack,ms-vscode,1.3.1,Programming Languages;Extension Packs,build;C;C++;cmake;color-theme;IntelliSense;Mic...,44366019,4.613636,2025-02-24T18:58:04.24Z
1345,C/C++ Extension Pack,LeoJhonSong,2.1.0,Extension Packs,__web_extension;c++,197606,5.0,2022-08-31T07:02:25.517Z
2466,C/C++ Extension Pack,franneck94,0.10.0,Extension Packs,__web_extension;c;c++;cpp;extension pack,78286,5.0,2022-11-29T06:54:18.99Z
3338,C/C++ Extension Pack,Kr4is,1.3.2,Extension Packs,__web_extension;AI;autocomplete;build;C;C++;cl...,48662,4.0,2024-10-21T09:54:27.78Z
5117,C/C++ Extension Pack,mischelebuha,0.0.3,Extension Packs,__web_extension,24118,0.0,2020-04-24T10:21:04.5Z
9581,C/C++ Extension Pack,hakula,1.0.3,Extension Packs,__web_extension;c++,8073,0.0,2025-06-21T20:19:17.887Z


In [16]:
# Same spot check on the verified frame, to compare publishers and repositories per row.
df_verified[df_verified["ext_name"] == "C/C++ Extension Pack"]

Unnamed: 0,ext_name,repo_publisher,verified,repository
28,C/C++ Extension Pack,Microsoft,True,https://github.com/microsoft/vscode-cpptools.git
1183,C/C++ Extension Pack,LeoJhonSong,False,https://github.com/LeoJhonSong/Cpp-Extension-P...
3204,C/C++ Extension Pack,franneck94,False,https://github.com/franneck94/vscode-c-cpp-ext...
3919,C/C++ Extension Pack,Kr4is,False,https://github.com/Kr4is/cpptools-extension-pa...
4205,C/C++ Extension Pack,mischelebuha,False,https://github.com/mischelebuha/c-cpp-extensio...
13215,C/C++ Extension Pack,Hakula Chen,False,https://github.com/hakula139/cpp-extension-pac...


In [17]:
# Same spot check on the vulnerable frame (identified by ext_name + repository only).
df_vulnerable[df_vulnerable["ext_name"] == "C/C++ Extension Pack"]

Unnamed: 0,ext_name,repository,total_vulnerabilities,critical,high,medium,low
6699,C/C++ Extension Pack,https://github.com/Kr4is/cpptools-extension-pa...,0,0,0,0,0
8676,C/C++ Extension Pack,https://github.com/mischelebuha/c-cpp-extensio...,0,0,0,0,0
13759,C/C++ Extension Pack,https://github.com/LeoJhonSong/Cpp-Extension-P...,0,0,0,0,0
16766,C/C++ Extension Pack,https://github.com/franneck94/vscode-c-cpp-ext...,0,0,0,0,0
19711,C/C++ Extension Pack,https://github.com/LeoJhonSong/Cpp-Extension-P...,0,0,0,0,0


In [18]:
# Drop duplicate rows from each frame, using the key columns listed in duplicated_subsets.
scraped_keys = duplicated_subsets["scraped"]
verified_keys = duplicated_subsets["verified"]
vulnerable_keys = duplicated_subsets["vulnerable"]

df_scraped = df_scraped.drop_duplicates(subset=scraped_keys)
df_verified = df_verified.drop_duplicates(subset=verified_keys)
df_vulnerable = df_vulnerable.drop_duplicates(subset=vulnerable_keys)

In [19]:
# After de-duplication: per-column repeat counts, followed by the number of rows
# still duplicated on each frame's key subset.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for label, frame in raw_files.items():
    print(f"{label}")
    for column in frame.columns:
        repeated = frame[column].duplicated().sum()
        print(f"Duplicated in column '{column}' of '{label}': {repeated}")
    print()
    key_columns = duplicated_subsets[label]
    remaining = frame.duplicated(subset=key_columns).sum()
    print(f"Duplicated in '{label}' with subset '{key_columns}': {remaining} ")
    print()

scraped
Duplicated in column 'ext_name' of 'scraped': 1795
Duplicated in column 'ext_publisher' of 'scraped': 23268
Duplicated in column 'ext_version' of 'scraped': 79002
Duplicated in column 'ext_categories' of 'scraped': 81236
Duplicated in column 'ext_tags' of 'scraped': 50103
Duplicated in column 'ext_install_count' of 'scraped': 66800
Duplicated in column 'ext_rating' of 'scraped': 81213
Duplicated in column 'ext_last_updated' of 'scraped': 2

Duplicated in 'scraped' with subset '['ext_name', 'ext_publisher', 'ext_version', 'ext_last_updated']': 0 

verified
Duplicated in column 'ext_name' of 'verified': 1515
Duplicated in column 'repo_publisher' of 'verified': 12464
Duplicated in column 'verified' of 'verified': 50922
Duplicated in column 'repository' of 'verified': 12840

Duplicated in 'verified' with subset '['ext_name', 'repo_publisher', 'repository']': 0 

vulnerable
Duplicated in column 'ext_name' of 'vulnerable': 416
Duplicated in column 'repository' of 'vulnerable': 0
Dupl

#### Check null values and drop null

In [20]:
# Print the shape of each frame after de-duplication.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for label in raw_files:
    frame = raw_files[label]
    print(f"Dimension of file '{label}': {frame.shape}")

Dimension of file 'scraped': (82027, 8)
Dimension of file 'verified': (50924, 4)
Dimension of file 'vulnerable': (32135, 7)


In [21]:
# Print missing values per column for each de-duplicated frame.
raw_files = {"scraped": df_scraped, "verified": df_verified, "vulnerable": df_vulnerable}
for label, frame in raw_files.items():
    null_counts = frame.isna().sum()
    print(f"Null values of '{label}': {null_counts}")
    print()

Null values of 'scraped': ext_name                 0
ext_publisher            1
ext_version              0
ext_categories           0
ext_tags             24539
ext_install_count        0
ext_rating               0
ext_last_updated         0
dtype: int64

Null values of 'verified': ext_name              0
repo_publisher        0
verified              0
repository        10992
dtype: int64

Null values of 'vulnerable': ext_name                 1
repository               1
total_vulnerabilities    0
critical                 0
high                     0
medium                   0
low                      0
dtype: int64



In [22]:
# Drop scraped rows where all of the listed columns are missing.
scraped_required = null_subsets["scraped"]
df_scraped = df_scraped.dropna(how="all", subset=scraped_required)

Do not drop null values in `repository`: they can be derived from `ext_name` and `repo_publisher`. This will be handled after merging (TODO).

In [23]:
# df_verified[~((df_verified["repository"].str.contains("https://git|https://www.git|github.com",na=False,regex=True)) | (df_verified["repository"].isna()))]

In [24]:
# Drop vulnerable rows where all of the listed columns are missing.
vulnerable_required = null_subsets["vulnerable"]
df_vulnerable = df_vulnerable.dropna(how="all", subset=vulnerable_required)

In [25]:
# Show dtypes and non-null counts to confirm the effect of the dropna on df_vulnerable.
df_vulnerable.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32134 entries, 0 to 33051
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ext_name               32134 non-null  object
 1   repository             32134 non-null  object
 2   total_vulnerabilities  32134 non-null  int64 
 3   critical               32134 non-null  int64 
 4   high                   32134 non-null  int64 
 5   medium                 32134 non-null  int64 
 6   low                    32134 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 2.0+ MB


Map values in ext_publisher vs repo_publisher

In [26]:
# Frequency of each publisher value in the scraped frame.
df_scraped["ext_publisher"].value_counts()

ext_publisher
gerane            285
ms-vscode         114
buildwithlayer     90
ctf0               60
simonhe            54
                 ... 
Aliah16             1
trevor              1
InFeRnA             1
TestPublisher       1
titi                1
Name: count, Length: 58758, dtype: int64

In [27]:
# Frequency of each publisher value in the verified frame, for comparison with ext_publisher.
df_verified["repo_publisher"].value_counts()

repo_publisher
gerane            286
Microsoft         247
ctf0               60
selfrefactor       59
Sandip Chitale     53
                 ... 
Shaun Kulesa        1
yl                  1
chai2010            1
ITCLOUD             1
eitt                1
Name: count, Length: 38460, dtype: int64

In [28]:
# Add a shared "merge_publisher" column to both frames for use as a join key.
# presumably the helpers map both publisher spellings onto one normalised form —
# confirm in lib.clean_scrap_functions.
scraped_merge_key = df_scraped["ext_publisher"].apply(clean_ext_publisher)
df_scraped["merge_publisher"] = scraped_merge_key

verified_merge_key = df_verified["repo_publisher"].apply(clean_repo_publisher)
df_verified["merge_publisher"] = verified_merge_key

In [29]:
# Number of distinct raw publisher values in the scraped frame.
df_scraped["ext_publisher"].nunique()

58758

#### Combine raw df after cleaning 

In [30]:
# Preview the scraped frame, which now includes the merge_publisher column.
df_scraped.head()

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated,merge_publisher
0,Python,ms-python,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z,microsoft
1,Pylance,ms-python,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z,microsoft
2,Jupyter,ms-toolsai,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z,microsoft
3,C/C++,ms-vscode,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z,microsoft
4,Python Debugger,ms-python,2025.11.2025072901,Debuggers,debugger;debuggers;debugpy;python,84033338,4.615385,2025-07-29T10:38:32.423Z,microsoft


In [31]:
# Preview the verified frame, which now includes the merge_publisher column.
df_verified.head()

Unnamed: 0,ext_name,repo_publisher,verified,repository,merge_publisher
0,Python,Microsoft,True,https://github.com/Microsoft/vscode-python.git,microsoft
1,Jupyter,Microsoft,True,https://github.com/Microsoft/vscode-jupyter.git,microsoft
2,Pylance,Microsoft,True,https://github.com/microsoft/pylance-release.git,microsoft
3,C/C++,Microsoft,True,https://github.com/Microsoft/vscode-cpptools.git,microsoft
4,Jupyter Keymap,Microsoft,True,https://github.com/Microsoft/vscode-jupyter-ke...,microsoft


In [32]:
# Preview the vulnerable frame before it is joined onto the verified frame.
df_vulnerable.head()

Unnamed: 0,ext_name,repository,total_vulnerabilities,critical,high,medium,low
0,Perl cpanfile,https://github.com/bayashi/perlcpanfile.git,0,0,0,0,0
1,Lingua Franca,https://github.com/lf-lang/vscode-lingua-franc...,1,0,0,0,1
2,AppEngine Theme,https://github.com/chazeprasad/appengine-theme,0,0,0,0,0
3,Firestore Rules,https://github.com/ChFlick/firecode,0,0,0,0,0
4,Comment Labels,https://github.com/jamespgilbert/comment-label...,0,0,0,0,0


In [33]:
# Attach the vulnerability counts to each verified row.
# Left join: verified rows with no match keep NaN in the vulnerability columns.
df_ver_vul = df_verified.merge(
    df_vulnerable,
    how="left",
    on=["ext_name", "repository"],
)


In [34]:
# Shape after the left join; compare the row count with df_verified to spot fan-out.
df_ver_vul.shape

(50924, 10)

In [35]:
# df_verified[df_verified["ext_name"] == "C/C++ Extension Pack"]
# df_vulnerable[df_vulnerable["ext_name"] == "C/C++ Extension Pack"]
# df_ver_vul[df_ver_vul["ext_name"] == "C/C++ Extension Pack"]

In [36]:
# Keep only extensions found in both the scraped data and the verified/vulnerable data,
# matched on extension name and the merge_publisher key.
df_clean = df_scraped.merge(
    df_ver_vul,
    how="inner",
    on=["ext_name", "merge_publisher"],
)

In [37]:
# Display the merged frame.
df_clean

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated,merge_publisher,repo_publisher,verified,repository,total_vulnerabilities,critical,high,medium,low
0,Python,ms-python,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-python.git,0.0,0.0,0.0,0.0,0.0
1,Pylance,ms-python,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z,microsoft,Microsoft,True,https://github.com/microsoft/pylance-release.git,0.0,0.0,0.0,0.0,0.0
2,Jupyter,ms-toolsai,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-jupyter.git,0.0,0.0,0.0,0.0,0.0
3,C/C++,ms-vscode,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-cpptools.git,11.0,0.0,0.0,0.0,11.0
4,Jupyter Keymap,ms-toolsai,1.1.2,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.000000,2023-06-05T17:53:31.993Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-jupyter-ke...,17.0,0.0,0.0,9.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33162,otto,OTTO,0.0.4,Programming Languages;Snippets,ai;autocomplete;documentation;javascript;ml;py...,2,0.000000,2020-07-27T19:15:40.797Z,otto,OTTO,False,https://github.com/github.com/otto-ai/otto-vscode,,,,,
33163,Onboardbase,Onboardbase,1.0.2,Other,onboardbase;secrets,1,0.000000,2022-10-29T11:09:49.04Z,onboardbase,Onboardbase,True,https://github.com/Onboardbase/onboardbase-vsc...,1.0,0.0,1.0,0.0,0.0
33164,MeOcean Themes,JaianeOliveira,0.0.3,Themes,__web_extension;color-theme;theme,1,0.000000,2023-01-17T03:29:03.003Z,jaianeoliveira,Jaiane Oliveira,False,https://github.com/JaianeOliveira/meocean-them...,0.0,0.0,0.0,0.0,0.0
33165,ROS,ms-riot,0.4.2,Debuggers,__ext_launch;__ext_rviz;__ext_srdf;__ext_test;...,0,0.000000,2019-05-16T23:49:41.96Z,microsoft,Microsoft,True,https://github.com/ms-iot/vscode-ros.git,4.0,0.0,0.0,0.0,4.0


In [38]:
# df_scraped[df_scraped ["ext_name"] == "C/C++ Extension Pack"]
# df_ver_vul[df_ver_vul ["ext_name"] == "C/C++ Extension Pack"]
# df_clean[df_clean ["ext_name"] == "C/C++ Extension Pack"]

#### Clean Git Url

In [39]:
# Keep rows whose repository looks like a Git URL, plus rows with no repository at all.
repo_urls = df_clean["repository"]
looks_like_git = repo_urls.str.contains(
    "https://git|https://www.git|github.com", na=False, regex=True
)
keep_mask = looks_like_git | repo_urls.isna()

# .copy() gives an independent frame, so the column assignment below is not a write
# into a filtered slice (which raises SettingWithCopyWarning, hidden by the global filter).
df_clean = df_clean.loc[keep_mask].copy()

# Normalise each URL with the project helper.
# presumably this also strips the trailing ".git" (see the displayed output) —
# confirm in lib.clean_scrap_functions.
df_clean["repository"] = df_clean["repository"].apply(split_url)

# Save the cleaned frame to the path configured under data -> clean.
df_clean.to_csv(config["data"]["clean"]["file_cleaned"], index=False)

In [40]:
# Display the cleaned rows whose repository matches the Git URL pattern.
df_clean[((df_clean["repository"].str.contains("https://git|https://www.git|github.com",na=False,regex=True)))]

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated,merge_publisher,repo_publisher,verified,repository,total_vulnerabilities,critical,high,medium,low
0,Python,ms-python,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-python,0.0,0.0,0.0,0.0,0.0
1,Pylance,ms-python,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z,microsoft,Microsoft,True,https://github.com/microsoft/pylance-release,0.0,0.0,0.0,0.0,0.0
2,Jupyter,ms-toolsai,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-jupyter,0.0,0.0,0.0,0.0,0.0
3,C/C++,ms-vscode,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-cpptools,11.0,0.0,0.0,0.0,11.0
4,Jupyter Keymap,ms-toolsai,1.1.2,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.000000,2023-06-05T17:53:31.993Z,microsoft,Microsoft,True,https://github.com/Microsoft/vscode-jupyter-ke...,17.0,0.0,0.0,9.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33162,otto,OTTO,0.0.4,Programming Languages;Snippets,ai;autocomplete;documentation;javascript;ml;py...,2,0.000000,2020-07-27T19:15:40.797Z,otto,OTTO,False,https://github.com/github.com/otto-ai/otto-vscode,,,,,
33163,Onboardbase,Onboardbase,1.0.2,Other,onboardbase;secrets,1,0.000000,2022-10-29T11:09:49.04Z,onboardbase,Onboardbase,True,https://github.com/Onboardbase/onboardbase-vscode,1.0,0.0,1.0,0.0,0.0
33164,MeOcean Themes,JaianeOliveira,0.0.3,Themes,__web_extension;color-theme;theme,1,0.000000,2023-01-17T03:29:03.003Z,jaianeoliveira,Jaiane Oliveira,False,https://github.com/JaianeOliveira/meocean-themes,0.0,0.0,0.0,0.0,0.0
33165,ROS,ms-riot,0.4.2,Debuggers,__ext_launch;__ext_rviz;__ext_srdf;__ext_test;...,0,0.000000,2019-05-16T23:49:41.96Z,microsoft,Microsoft,True,https://github.com/ms-iot/vscode-ros,4.0,0.0,0.0,0.0,4.0


In [41]:
# Display the cleaned rows that have no repository value.
df_clean[df_clean["repository"].isna()]

Unnamed: 0,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated,merge_publisher,repo_publisher,verified,repository,total_vulnerabilities,critical,high,medium,low
13,GitHub Copilot,GitHub,1.351.1711,Programming Languages;Machine Learning;AI;Chat,__web_extension;ai;autocomplete;c#;c++;code-re...,45402843,4.232344,2025-08-01T01:51:50.983Z,github,GitHub,True,,,,,,
45,C# Dev Kit,ms-dotnettools,1.41.5,Programming Languages;Linters;Debuggers;Testing,asp.net;c#;csharp;devkit;dotnet;json;keybindings,10588793,2.803279,2025-07-24T18:16:32.787Z,microsoft,Microsoft,True,,,,,,
65,GitHub Codespaces,GitHub,1.17.3,Other,__web_extension;continueOn;json;jsonc;remote-menu,6216269,4.800000,2024-09-11T15:02:06.743Z,github,GitHub,True,,,,,,
88,Auto Import,steoates,1.5.4,Other,auto import;imports;multi-root ready;require;t...,4919930,4.194915,2021-04-15T15:34:51.077Z,steoates,steoates,False,,,,,,
105,IntelliCode for C# Dev Kit,ms-dotnettools,2.2.3,Programming Languages,c#;csharp,3977553,2.200000,2024-11-13T06:10:01.273Z,microsoft,Microsoft,True,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33147,dianyuluo-vscode-pack,dianyuluo,0.0.7,Other,,5,0.000000,2023-09-01T09:35:43.87Z,dianyuluo,dianyuluo,False,,,,,,
33149,MurageExtension,MurageKibicho-Trial,0.0.1,Other,snippet,5,0.000000,2023-07-10T09:39:06.72Z,muragekibicho-trial,Murage Kibicho-Trial,False,,,,,,
33151,Salih's Hello World,LeroySalih,0.0.1,Other,,5,0.000000,2023-08-15T17:46:40.007Z,leroysalih,Leroy Salih,False,,,,,,
33152,Code Valet,sophodoros,0.6.4,Formatters,keybindings,4,0.000000,2016-11-10T01:06:17.103Z,sophodoros,Sophodoros,False,,,,,,


In [42]:
# Load the extension/repository file configured under data -> raw and report its shape.
ext_repo_path = config["data"]["raw"]["file_ext_repo"]
df_raw = pd.read_csv(ext_repo_path)
df_raw.shape

(24396, 23)