In [36]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#lib
from lib.clean_data_functions import clean_ext_version

#List
from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval

# os
import os

# time
import time

# warnings
import warnings
# Install a global "ignore" filter: every warning raised after this point is hidden.
# NOTE(review): this also hides deprecation and pandas warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [37]:
import yaml

# Load the project configuration. Later cells index into `config`, so a missing
# file must stop the notebook here instead of surfacing as a NameError downstream.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only the "file is missing" case gets this message; parse or permission
    # errors propagate with their own traceback instead of being mislabeled.
    print("Yaml configuration file not found!")
    raise

### 1. Loading

In [None]:
# Read the raw dataset from the CSV path stored in the configuration.
raw_csv_path = config["data"]["raw"]["file_ext_repo"]
df = pd.read_csv(raw_csv_path)
# df = df.sort_values(by = ["ext_install_count", "ext_rating"], ascending= False)
df.head()

In [None]:
# Report the dataset dimensions: df.shape[0] is the row count, df.shape[1] the column count.
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')

In [None]:
# Summarize the columns of the freshly loaded frame: dtypes and non-null counts.
df.info()

#### Metadata:
- **_verified_:**                 flag indicating whether the extension's security is breached (boolean)
- **_ext_name_:**                 name of extension                                       (obj)
- **_ext_publisher_:**            name of extension's publisher                           (obj)
- **_ext_version_:**              current version of extension                            (obj)
- **_ext_categories_:**           categories of extension                                 (obj)   (multi values)
- **_ext_tags_:**                 keywords related to extension                           (obj)   (multi values)
- **_ext\_install\_count_:**      total number of installations of extension              (int64)
- **_ext\_rating_:**              rating of extension (avg of stars rating)               (float64)
- **_ext\_last\_updated_:**       timestamp of last update                                (obj)
- **_repo_publisher_:**           publisher of extension                                  (obj)
- **_repository_:**               url of repository                                       (obj)
- **_total_vulnerabilities_:**    number of detected vulnerabilities                      (int64)
- **_critical_:**                 number of critical-severity vulnerabilities             (int64)
- **_high_:**                     number of high-severity vulnerabilities                 (int64)
- **_medium_:**                   number of medium-severity vulnerabilities               (int64)
- **_low_:**                      number of low-severity vulnerabilities                  (int64)
- **_repo\_owner_:**              owner of repository (via column repository)             (obj)
- **_repo\_name_:**               name of repository (via column repository)              (obj)
- **_repo\_stars_:**              number of stars of repository (via column repository)   (int64)   
- **_repo\_forks_:**              number of forks of repository (via column repository)   (int64)   
- **_language_:**                 programming languages used (via column repository)      (obj)   (multi values)
- **_topics_:**                   keywords related to repository (via column repository)  (obj)   (multi values)
- **_error_:**                    log of fetching repository                              (obj)


### 2. Cleaning

In [None]:
# Count missing (NaN) values per column
df.isna().sum()

- Remove records that have a non-null value in the "error" column, because their repository was unavailable

In [None]:
# Keep only the rows whose "error" column is empty.
repo_fetched = df["error"].isna()
df = df[repo_fetched]
print(f"After removing extensions with unavailable repository, the dataset has {df.shape[0]} rows and {df.shape[1]} columns")

#### Clean columns

- Rename columns, and remove noisy columns to avoid overfitting: ["ext\_tags", "ext\_last\_updated", "repo\_publisher", "error"]

In [43]:
# Use "repo_owner" over "repo_publisher" (it is more reliable because it comes
# from fetching the repository info directly).
df = (
    df
    .drop(columns=["ext_tags", "ext_last_updated", "repo_publisher", "error"])
    .rename(
        columns={
            "total_vulnerabilities": "total_vulners",
            "critical": "critical__vulners",
            "high": "high__vulners",
            "medium": "medium__vulners",
            "low": "low__vulners",
            "language": "repo_languages",
            "topics": "repo_topics",
        }
    )
)

In [None]:
# Display the frame to inspect its current columns and values.
df

In [None]:
# Re-check dtypes and non-null counts of the current frame.
df.info()

In [46]:
# Run the project's clean_ext_version helper over every version value, then
# cast to a numeric dtype; values that cannot be parsed become NaN.
df["ext_version"] = pd.to_numeric(
    df["ext_version"].apply(clean_ext_version),
    errors="coerce",
)

In [None]:
# Frequency of each ext_version value (NaN entries are excluded by value_counts' default).
df["ext_version"].value_counts()

In [None]:
# Rows with a zero rating, fewer than 10 installs and fewer than 10 repository stars.
# df[(df["ext_version"] > 10) & (df["ext_rating"] == 0) & (df["ext_install_count"] < 100) & (df["repo_stars"] < 10)]
df[
    df["ext_rating"].eq(0)
    & df["ext_install_count"].lt(10)
    & df["repo_stars"].lt(10)
]

In [30]:
# Collect the numeric and the categorical column names as plain Python lists.
cols_num = list(df.select_dtypes(include=['float', 'int']).columns)
cols_cat = list(df.select_dtypes(include=['object', 'category']).columns)

In [None]:
# df['repo_topics'] = df['repo_topics'].apply(lambda x: literal_eval(x) if pd.notna(x) else [])
# df['repo_topics'] = df['repo_topics'].apply(lambda x: ', '.join(map(str, x)) if x else "Unknown")

Check for duplicated rows and remove them

Clean column names and remove columns

Check for null values and drop the affected rows

Check values in column

### 3. EDA