In [85]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#lib
from lib.clean_data_functions import clean_ext_version, clean_ext_publisher, clean_repo_publisher

#
from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval
from collections import Counter

# os
import os

# time
import time


In [86]:
import yaml

try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

### 1. Loading

In [87]:
df = pd.read_csv(config["data"]["raw"]["file_ext_repo"])
# df = df.sort_values(by = ["ext_install_count", "ext_rating"], ascending= False)
df.head()

Unnamed: 0,verified,ext_name,ext_publisher,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,ext_last_updated,repo_publisher,...,high,medium,low,repo_owner,repo_name,repo_stars,repo_forks,language,topics,error
0,True,Python,ms-python,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,2025-07-29T10:50:24.31Z,Microsoft,...,0,0,0,Microsoft,vscode-python,4461,1247,[],[],
1,True,Pylance,ms-python,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,2025-07-30T23:10:58.657Z,Microsoft,...,0,0,0,microsoft,pylance-release,1881,856,"[""Python"",""Jupyter Notebook""]","[""python"",""language-server"",""code-analysis"",""l...",
2,True,Jupyter,ms-toolsai,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,2025-07-31T09:54:06Z,Microsoft,...,0,0,0,Microsoft,vscode-jupyter,1401,337,"[""TypeScript"",""Python"",""Jupyter Notebook"",""Jav...","[""machine-learning"",""jupyter"",""vscode"",""datasc...",
3,True,C/C++,ms-vscode,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,2025-06-25T19:02:16.243Z,Microsoft,...,0,0,11,Microsoft,vscode-cpptools,5871,1633,"[""TypeScript"",""C++"",""HTML"",""JavaScript""]","[""microsoft"",""typescript"",""vscode-extension""]",
4,True,Jupyter Keymap,ms-toolsai,1.1.2,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.0,2023-06-05T17:53:31.993Z,Microsoft,...,0,9,8,Microsoft,vscode-jupyter-keymap,32,14,[],[],


In [88]:
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')

The dataset has 24396 rows and 23 columns


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24396 entries, 0 to 24395
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   verified               24396 non-null  bool   
 1   ext_name               24396 non-null  object 
 2   ext_publisher          24396 non-null  object 
 3   ext_version            24396 non-null  object 
 4   ext_categories         24396 non-null  object 
 5   ext_tags               20469 non-null  object 
 6   ext_install_count      24396 non-null  int64  
 7   ext_rating             24396 non-null  float64
 8   ext_last_updated       24396 non-null  object 
 9   repo_publisher         24396 non-null  object 
 10  repository             24396 non-null  object 
 11  total_vulnerabilities  24396 non-null  int64  
 12  critical               24396 non-null  int64  
 13  high                   24396 non-null  int64  
 14  medium                 24396 non-null  int64  
 15  lo

#### Metadata:
- **_verified_:**                 check, whether extension's security is breached         (boolean)
- **_ext_name_:**                 name of extension                                       (obj)
- **_ext_publisher_:**            name of extension'S publisher                           (obj)
- **_ext_version_:**              current version of extension                            (obj)
- **_ext_categories_:**           categories of extension                                 (obj)   (multi values)
- **_ext_tags_:**                 keywords related to extension                           (obj)   (multi values)
- **_ext\_install\_count_:**      total number of installations of extension              (int64)
- **_ext\_rating_:**              rating of extension (avg of stars rating)               (float64)
- **_ext\_last\_updated_:**       timestamp of last update                                (obj)
- **_repo_publisher_:**           publisher of extension                                  (obj)
- **_repository_:**               url of repository                                       (obj)
- **_total_vulnerabilities_:**    number of detected vulnerabilities                      (int64)
- **_critical_:**                 number of critical(severity) vulnerabilities            (int64)
- **_high_:**                     number of high(severity) vulnerabilities                (int64)
- **_medium_:**                   number of medium(severity) vulnerabilities              (int64)
- **_low_:**                      number of low(severity) vulnerabilities                 (int64)
- **_repo\_owner_:**              owner of repository (via column repository)             (obj)
- **_repo\_name_:**               name of repository (via column repository)              (obj)
- **_repo\_stars_:**              number of stars of repository (via column repository)   (int64)   
- **_repo\_forks_:**              number of forks of repository (via column repository)   (int64)   
- **_language_:**                 program languages used (via column repository)          (obj)   (multi values)
- **_topics_:**                   keywords related to repository (via column repository)  (obj)   (multi values)
- **_error_:**                    log of fetching repository                              (obj)


### 2. Cleaning

In [90]:
# Check missing values
df.isna().sum()

verified                     0
ext_name                     0
ext_publisher                0
ext_version                  0
ext_categories               0
ext_tags                  3927
ext_install_count            0
ext_rating                   0
ext_last_updated             0
repo_publisher               0
repository                   0
total_vulnerabilities        0
critical                     0
high                         0
medium                       0
low                          0
repo_owner                   0
repo_name                    0
repo_stars                   0
repo_forks                   0
language                     0
topics                       0
error                    21601
dtype: int64

Remove record with values (not null) in "error" column due to unavailability of repository

In [91]:
df = df[df["error"].isna()]
print(f"After removing extensions with unavailable repository, the dataset has {df.shape[0]} rows and {df.shape[1]} columns")

After removing extensions with unavailable repository, the dataset has 21601 rows and 23 columns


#### Clean columns

Rename and remove columns for noises and avoiding of overfit ["ext\_tags", "repo\_publisher", "error"]

In [92]:
# Use  "repo\_owner" over "repo\_publisher" due to correctness from fetching infos directly)
df = df.drop(columns=["ext_name","ext_publisher", "ext_last_updated", "repo_publisher", "error"]).rename(columns={"total_vulnerabilities": "total_vulners", "critical": "critical__vulners", "high": "high__vulners", "medium": "medium__vulners", "low": "low__vulners", "language": "repo_languages", "topics": "repo_topics"})

In [93]:
df

Unnamed: 0,verified,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,repository,total_vulners,critical__vulners,high__vulners,medium__vulners,low__vulners,repo_owner,repo_name,repo_stars,repo_forks,repo_languages,repo_topics
0,True,2025.11.2025072901,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.192869,https://github.com/Microsoft/vscode-python,0,0,0,0,0,Microsoft,vscode-python,4461,1247,[],[]
1,True,2025.7.100,Programming Languages,__web_extension;json;python,146473811,3.007722,https://github.com/microsoft/pylance-release,0,0,0,0,0,microsoft,pylance-release,1881,856,"[""Python"",""Jupyter Notebook""]","[""python"",""language-server"",""code-analysis"",""l..."
2,True,2025.7.2025073101,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.700599,https://github.com/Microsoft/vscode-jupyter,0,0,0,0,0,Microsoft,vscode-jupyter,1401,337,"[""TypeScript"",""Python"",""Jupyter Notebook"",""Jav...","[""machine-learning"",""jupyter"",""vscode"",""datasc..."
3,True,1.26.3,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.426871,https://github.com/Microsoft/vscode-cpptools,11,0,0,0,11,Microsoft,vscode-cpptools,5871,1633,"[""TypeScript"",""C++"",""HTML"",""JavaScript""]","[""microsoft"",""typescript"",""vscode-extension""]"
4,True,1.1.2,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.000000,https://github.com/Microsoft/vscode-jupyter-ke...,17,0,0,9,8,Microsoft,vscode-jupyter-keymap,32,14,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24389,False,0.0.1,Themes,__web_extension;color-theme;theme,1,0.000000,https://github.com/finnlinn/uwustyl,0,0,0,0,0,finnlinn,uwustyl,0,0,[],[]
24390,False,1.0.1,Other,mips,2,0.000000,https://github.com/akainth015/marscode,0,0,0,0,0,akainth015,marscode,1,1,"[""TypeScript""]",[]
24392,True,1.0.2,Other,onboardbase;secrets,1,0.000000,https://github.com/Onboardbase/onboardbase-vscode,1,0,1,0,0,Onboardbase,onboardbase-vscode,3,0,"[""TypeScript"",""JavaScript""]",[]
24394,True,0.4.2,Debuggers,__ext_launch;__ext_rviz;__ext_srdf;__ext_test;...,0,0.000000,https://github.com/ms-iot/vscode-ros,4,0,0,0,4,ms-iot,vscode-ros,427,103,[],"[""ros""]"


In [94]:
# df.info()

#### Clean data inconsistencies

In [96]:
#ext_version

df["ext_version"] = df["ext_version"].apply(clean_ext_version)
df['ext_version'] = pd.to_numeric(df['ext_version'], errors="coerce")

In [109]:
df["ext_rating"] = df["ext_rating"].apply(lambda x: round(x,2) if pd.notna(x) else pd.NA )

In [110]:
# df[(df["ext_version"] > 10) & (df["ext_rating"] == 0) & (df["ext_install_count"] < 100) & (df["repo_stars"] < 10)]
# df[(df["ext_rating"] == 0) & (df["ext_install_count"] < 10) & (df["repo_stars"] < 10)]
df

Unnamed: 0,verified,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,repository,total_vulners,critical__vulners,high__vulners,medium__vulners,low__vulners,repo_owner,repo_name,repo_stars,repo_forks,repo_languages,repo_topics
0,True,2025,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.19,https://github.com/Microsoft/vscode-python,0,0,0,0,0,Microsoft,vscode-python,4461,1247,Unknown,Unknown
1,True,2025,Programming Languages,__web_extension;json;python,146473811,3.01,https://github.com/microsoft/pylance-release,0,0,0,0,0,microsoft,pylance-release,1881,856,Python;Jupyter Notebook,python;language-server;code-analysis;language-...
2,True,2025,Extension Packs;Data Science;Machine Learning;...,__ext_jl;__web_extension;debuggers;interactive...,93931981,2.70,https://github.com/Microsoft/vscode-jupyter,0,0,0,0,0,Microsoft,vscode-jupyter,1401,337,TypeScript;Python;Jupyter Notebook;JavaScript;...,machine-learning;jupyter;vscode;datascience;vs...
3,True,1,Programming Languages;Snippets;Linters;Debugge...,C;C++;cpp;cuda-cpp;debuggers;IntelliSense;json...,85413454,3.43,https://github.com/Microsoft/vscode-cpptools,11,0,0,0,11,Microsoft,vscode-cpptools,5871,1633,TypeScript;C++;HTML;JavaScript,microsoft;typescript;vscode-extension
4,True,1,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.00,https://github.com/Microsoft/vscode-jupyter-ke...,17,0,0,9,8,Microsoft,vscode-jupyter-keymap,32,14,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24389,False,0,Themes,__web_extension;color-theme;theme,1,0.00,https://github.com/finnlinn/uwustyl,0,0,0,0,0,finnlinn,uwustyl,0,0,Unknown,Unknown
24390,False,1,Other,mips,2,0.00,https://github.com/akainth015/marscode,0,0,0,0,0,akainth015,marscode,1,1,TypeScript,Unknown
24392,True,1,Other,onboardbase;secrets,1,0.00,https://github.com/Onboardbase/onboardbase-vscode,1,0,1,0,0,Onboardbase,onboardbase-vscode,3,0,TypeScript;JavaScript,Unknown
24394,True,0,Debuggers,__ext_launch;__ext_rviz;__ext_srdf;__ext_test;...,0,0.00,https://github.com/ms-iot/vscode-ros,4,0,0,0,4,ms-iot,vscode-ros,427,103,Unknown,ros


In [98]:
#repo_languages
# eval the obj[] and transform to str
df["repo_languages"] = df["repo_languages"].apply(lambda x: literal_eval(x) if pd.notna(x) else [])
df["repo_languages"] = df["repo_languages"].apply(lambda x: ';'.join(map(str, x)) if x else "Unknown")

In [99]:
languages = ""
for lang in df["repo_languages"]:
    languages += lang.lower()
    languages += ";"
languages_list = languages.split(';')
languages_set  = set(languages_list)

prog_languages = languages_set

In [107]:
prog_languages

{'',
 'actionscript',
 'ada',
 'adblock filter list',
 'ags script',
 'al',
 'answer set programming',
 'antlr',
 'apex',
 'applescript',
 'arc',
 'asp.net',
 'assembly',
 'astro',
 'autohotkey',
 'awk',
 'ballerina',
 'basic',
 'batchfile',
 'berry',
 'bicep',
 'bikeshed',
 'bitbake',
 'blade',
 'boogie',
 'brainfuck',
 'brighterscript',
 'c',
 'c#',
 'c++',
 'cap cds',
 "cap'n proto",
 'circom',
 'clarion',
 'clarity',
 'classic asp',
 'clojure',
 'cmake',
 'cobol',
 'codeql',
 'coffeescript',
 'coldfusion',
 'common lisp',
 'crystal',
 'csound score',
 'css',
 'cuda',
 'cycript',
 'cython',
 'd',
 'd2',
 'dafny',
 'dart',
 'daslang',
 'dhall',
 'dockerfile',
 'dogescript',
 'dune',
 'e',
 'earthly',
 'eiffel',
 'ejs',
 'elixir',
 'elm',
 'elvish',
 'emacs lisp',
 'erlang',
 'euphoria',
 'f#',
 'faust',
 'fluent',
 'flux',
 'freebasic',
 'freemarker',
 'g-code',
 'gap',
 'gdscript',
 'gdshader',
 'genero 4gl',
 'gherkin',
 'gleam',
 'glsl',
 'gnuplot',
 'go',
 'groovy',
 'gsc',
 'hac

In [100]:
#repo_topics
df["repo_topics"] = df["repo_topics"].apply(lambda x: literal_eval(x) if pd.notna(x) else [])
df["repo_topics"] = df["repo_topics"].apply(lambda x: ';'.join(map(str, x)) if x else "Unknown")


In [117]:
df[(df["verified"] == True) & (df["repo_languages"] == "Unknown")]

Unnamed: 0,verified,ext_version,ext_categories,ext_tags,ext_install_count,ext_rating,repository,total_vulners,critical__vulners,high__vulners,medium__vulners,low__vulners,repo_owner,repo_name,repo_stars,repo_forks,repo_languages,repo_topics
0,True,2025,Programming Languages;Debuggers;Data Science;M...,__ext_j2;__ext_jinja2;__web_extension;debugger...,177844528,4.19,https://github.com/Microsoft/vscode-python,0,0,0,0,0,Microsoft,vscode-python,4461,1247,Unknown,Unknown
4,True,1,Notebooks,__web_extension;keybindings;notebook-keymap,74084334,4.00,https://github.com/Microsoft/vscode-jupyter-ke...,17,0,0,9,8,Microsoft,vscode-jupyter-keymap,32,14,Unknown,Unknown
10,True,2,Other,__web_extension;compose;container;containers;d...,46671371,4.50,https://github.com/microsoft/vscode-docker,2,0,1,1,0,microsoft,vscode-docker,1266,543,Unknown,docker;dockerfile;docker-compose;vscode
15,True,1,Language Packs,__lp_ms-vscode.js-debug;__lp_vscode;__lp_vscod...,43067642,4.82,https://github.com/Microsoft/vscode-loc,0,0,0,0,0,Microsoft,vscode-loc,644,352,Unknown,Unknown
16,True,0,Programming Languages;Machine Learning;AI;Chat,__ext_copilotmd;ai;autocomplete;c#;c++;chat-pa...,36350128,3.94,https://github.com/microsoft/vscode-copilot-re...,0,0,0,0,0,microsoft,vscode-copilot-release,976,132,Unknown,ai;vscode;copilot;copilot-x;copilot-chat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23740,True,0,Programming Languages,__ext_st;__ext_stupid;__web_extension;stupid,38,0.00,https://github.com/MixusMinimax/stupid-code,0,0,0,0,0,MixusMinimax,stupid-code,0,0,Unknown,Unknown
23763,True,0,Extension Packs,__web_extension,37,0.00,https://github.com/yiyetech/yiye-front-extensi...,0,0,0,0,0,yiyetech,yiye-front-extension-pack,0,2,Unknown,Unknown
23951,True,0,Programming Languages,__ext_shs;__web_extension;shards,30,0.00,https://github.com/fragcolor-xyz/vscode-shards...,0,0,0,0,0,fragcolor-xyz,vscode-shards-syntax,0,1,Unknown,syntax;extension;vscode;highlight;shards
24062,True,0,Programming Languages,__ext_non;__web_extension;non;nonogram;Nonogra...,26,5.00,https://github.com/Logerfo/non-support,0,0,0,0,0,Logerfo,non-support,0,0,Unknown,Unknown


#### Clean null values

#### Clean Duplicating

Clean column names and remove columns

### 3. EDA

In [101]:
cols_num = df.select_dtypes(include = ['float','int']).columns.to_list()
cols_cat = df.select_dtypes(include = ['object', 'category']).columns.to_list()

In [102]:
df.select_dtypes("number").nunique().sort_values(ascending=False)

ext_install_count    9565
repo_stars            606
ext_rating            495
repo_forks            272
total_vulners          77
medium__vulners        52
low__vulners           48
ext_version            46
high__vulners          26
critical__vulners       1
dtype: int64

In [103]:
df.select_dtypes(exclude="number").nunique().sort_values(ascending=False)

repository        20576
repo_name         19594
repo_owner        16508
ext_tags          11520
repo_topics        4789
repo_languages     1580
ext_categories      317
verified              2
dtype: int64