In [10]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#lib
from lib.clean_data_functions import clean_ext_version, clean_ext_publisher, clean_repo_publisher

#
from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval
from collections import Counter

# os
import os

# time
import time

import warnings
# warnings.filterwarnings("ignore")    # (Optional)

# Report the library versions this notebook was executed with.
print(
    "Project has been created with Pandas: ",
    pd.__version__,
    " And with Numpy: ",
    np.__version__,
)

Project has been created with Pandas:  2.3.1  And with Numpy:  2.3.2


In [11]:
import yaml

# Load the project configuration (file paths etc.) used by the cells below.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported as "not found"; any other problem
    # (e.g. malformed YAML) propagates with its own traceback.
    # Re-raise so the notebook stops here: later cells index into `config`
    # and would otherwise fail with an unrelated NameError.
    print("Yaml configuration file not found!")
    raise

### 1. Loading

In [12]:
# Read the cleaned dataset from the path stored in the YAML configuration.
df = pd.read_csv(config["data"]["clean"]["file_data_cleaned"])
# Preview the first rows as the cell's last expression, so a fresh
# Restart & Run All reproduces the table saved in this cell's output.
df.head()

Unnamed: 0,ext_version,ext_categories,ext_install_count,ext_rating,verified,repository,total_vulners,critical__vulners,high__vulners,medium__vulners,low__vulners,repo_owner,repo_name,repo_stars,repo_forks,repo_languages
0,2025,programming languages;debuggers;data science;m...,177844528,4.19,True,https://github.com/Microsoft/vscode-python,0,0,0,0,0,Microsoft,vscode-python,4461,1247,typescript;python;javascript;jupyter notebook;...
1,2025,programming languages,146473811,3.01,True,https://github.com/microsoft/pylance-release,0,0,0,0,0,microsoft,pylance-release,1882,856,python;jupyter notebook
2,2025,extension packs;data science;machine learning;...,93931981,2.7,True,https://github.com/Microsoft/vscode-jupyter,0,0,0,0,0,Microsoft,vscode-jupyter,1401,337,typescript;python;jupyter notebook;javascript;...
3,1,programming languages;snippets;linters;debugge...,85413454,3.43,True,https://github.com/Microsoft/vscode-cpptools,11,0,0,0,11,Microsoft,vscode-cpptools,5873,1633,typescript;c++;html;javascript
4,1,notebooks,74084334,4.0,True,https://github.com/Microsoft/vscode-jupyter-ke...,17,0,0,9,8,Microsoft,vscode-jupyter-keymap,32,14,unknown


In [13]:
# Summarise the frame's dimensions: number of rows, then number of columns.
print(f'The dataset has {len(df)} rows and {len(df.columns)} columns')

The dataset has 19553 rows and 16 columns


In [14]:
# Print a summary of the frame: index range, and each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19553 entries, 0 to 19552
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ext_version        19553 non-null  int64  
 1   ext_categories     19553 non-null  object 
 2   ext_install_count  19553 non-null  int64  
 3   ext_rating         19553 non-null  float64
 4   verified           19553 non-null  bool   
 5   repository         19553 non-null  object 
 6   total_vulners      19553 non-null  int64  
 7   critical__vulners  19553 non-null  int64  
 8   high__vulners      19553 non-null  int64  
 9   medium__vulners    19553 non-null  int64  
 10  low__vulners       19553 non-null  int64  
 11  repo_owner         19553 non-null  object 
 12  repo_name          19553 non-null  object 
 13  repo_stars         19553 non-null  int64  
 14  repo_forks         19553 non-null  int64  
 15  repo_languages     18313 non-null  object 
dtypes: bool(1), float64(1)

#### Metadata:
- **_verified_:**                 whether the extension's security is breached            (boolean)
- **_ext_categories_:**           categories of extension                                 (obj)   (multi values)
- **_ext\_install\_count_:**      total number of installations of extension              (int64)
- **_ext\_rating_:**              rating of extension (avg of stars rating)               (float64)
- **_repository_:**               url of repository                                       (obj)
- **_total\_vulners_:**           number of detected vulnerabilities                      (int64)
- **_critical\_\_vulners_:**      number of critical-severity vulnerabilities             (int64)
- **_high\_\_vulners_:**          number of high-severity vulnerabilities                 (int64)
- **_medium\_\_vulners_:**        number of medium-severity vulnerabilities               (int64)
- **_low\_\_vulners_:**           number of low-severity vulnerabilities                  (int64)
- **_repo\_owner_:**              owner of repository (via column repository)             (obj)
- **_repo\_name_:**               name of repository (via column repository)              (obj)
- **_repo\_stars_:**              number of stars of repository (via column repository)   (int64)   
- **_repo\_forks_:**              number of forks of repository (via column repository)   (int64)   
- **_repo\_languages_:**          programming languages used (via column repository)      (obj)   (multi values)


### 3. EDA

In [15]:
# cols_num = df.select_dtypes(include = ['float','int']).columns.to_list()
# cols_cat = df.select_dtypes(include = ['object', 'category']).columns.to_list()

In [16]:
# df.select_dtypes("number").nunique().sort_values(ascending=False)

In [17]:
# df.select_dtypes(exclude="number").nunique().sort_values(ascending=False)

In [18]:
# languages_counter=Counter(languages_list)
# languages_counter.most_common(12)