In [None]:
# Data handling
import numpy as np
import pandas as pd

#lib
from lib.clean_data_functions import clean_ext_version, clean_ext_publisher, clean_repo_publisher, filter_string

#
from ast import literal_eval
from collections import Counter

import warnings
warnings.filterwarnings("ignore")    # (Optional) silence all warnings for the rest of the notebook

# Show the library versions in use.
print(
    "Project has been created with Pandas: ",
    pd.__version__,
    " And with Numpy: ",
    np.__version__,
)

In [2]:
import yaml

# Load the project configuration; later cells read their file paths from `config`.
# Only a missing file is reported as "not found", and the error is re-raised so
# the notebook stops here instead of failing later with a NameError on `config`.
# Any other failure (e.g. invalid YAML) propagates unchanged.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

### 1. Loading

In [None]:
# Read the CSV whose path is stored under data -> raw -> file_ext_repo in the config.
raw_csv_path = config["data"]["raw"]["file_ext_repo"]
df = pd.read_csv(raw_csv_path)
# Optional: df = df.sort_values(by=["ext_install_count", "ext_rating"], ascending=False)
df.head()

In [None]:
# Report the dimensions of the loaded frame.
row_count, column_count = df.shape
print(f'The dataset has {row_count} rows and {column_count} columns')

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

#### Metadata:
- **_verified_:**                 check, whether extension's security is breached         (boolean)
- **_ext_name_:**                 name of extension                                       (obj)
- **_ext_publisher_:**            name of extension's publisher                           (obj)
- **_ext_version_:**              current version of extension                            (obj)
- **_ext_categories_:**           categories of extension                                 (obj)   (multi values)
- **_ext_tags_:**                 keywords related to extension                           (obj)   (multi values)
- **_ext\_install\_count_:**      total number of installations of extension              (int64)
- **_ext\_rating_:**              rating of extension (avg of stars rating)               (float64)
- **_ext\_last\_updated_:**       timestamp of last update                                (obj)
- **_repo_publisher_:**           publisher of extension                                  (obj)
- **_repository_:**               url of repository                                       (obj)
- **_total_vulnerabilities_:**    number of detected vulnerabilities                      (int64)
- **_critical_:**                 number of critical(severity) vulnerabilities            (int64)
- **_high_:**                     number of high(severity) vulnerabilities                (int64)
- **_medium_:**                   number of medium(severity) vulnerabilities              (int64)
- **_low_:**                      number of low(severity) vulnerabilities                 (int64)
- **_repo\_owner_:**              owner of repository (via column repository)             (obj)
- **_repo\_name_:**               name of repository (via column repository)              (obj)
- **_repo\_stars_:**              number of stars of repository (via column repository)   (int64)   
- **_repo\_forks_:**              number of forks of repository (via column repository)   (int64)   
- **_language_:**                 programming languages used (via column repository)      (obj)   (multi values)
- **_topics_:**                   keywords related to repository (via column repository)  (obj)   (multi values)
- **_error_:**                    log of fetching repository                              (obj)


### 2. Cleaning

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Columns in which more than 80% of the values are missing
missing_share = df.isna().mean()
df.columns[missing_share > 0.8]

Remove records that have a non-null value in the "error" column, because their repository was unavailable.

In [None]:
# Keep only the rows whose "error" column is empty.
has_no_error = df["error"].isna()
df = df[has_no_error]

remaining_rows, remaining_cols = df.shape
print(f"After removing extensions with unavailable repository, the dataset has {remaining_rows} rows and {remaining_cols} columns")

#### Clean columns

Rename columns and remove the ones that add noise or risk overfitting: ["ext\_name", "ext\_publisher", "ext\_tags", "ext\_last\_updated", "repo\_publisher", "error"]

In [9]:
# "repo_owner" is used over "repo_publisher" because it is correct by construction
# (it comes from fetching the repository info directly).
columns_to_drop = [
    "ext_name",
    "ext_publisher",
    "ext_tags",
    "ext_last_updated",
    "repo_publisher",
    "error",
]
renamed_columns = {
    "total_vulnerabilities": "total_vulners",
    "critical": "critical_vulners",
    "high": "high_vulners",
    "medium": "medium_vulners",
    "low": "low_vulners",
    "language": "repo_languages",
    "topics": "repo_topics",
}
df = df.drop(columns=columns_to_drop).rename(columns=renamed_columns)

In [11]:
# df.info()

#### Clean data inconsistencies

In [12]:
# ext_version
# Pass every value through clean_ext_version, then convert the result to numbers;
# anything that cannot be parsed becomes NaN (errors="coerce").
cleaned_versions = df["ext_version"].apply(clean_ext_version)
df["ext_version"] = pd.to_numeric(cleaned_versions, errors="coerce")

In [13]:
# ext_rating
# Round ratings to two decimals with the vectorised Series.round.
# Missing ratings stay NaN and the column keeps its float dtype; the previous
# element-wise lambda inserted pd.NA, which can turn the column into object dtype.
df["ext_rating"] = df["ext_rating"].round(2)

In [15]:
# repo_languages
# Each cell holds the string form of a list. Parse it with literal_eval, join the
# items with ';' and lower-case the result.
# Missing cells and empty lists become pd.NA explicitly (same convention as the
# repo_topics column) instead of leaving an empty list object in the column.
df["repo_languages"] = (
    df["repo_languages"]
    .apply(lambda raw: literal_eval(raw) if pd.notna(raw) else [])
    .apply(lambda langs: ";".join(map(str, langs)) if langs else pd.NA)
    .str.lower()
)

In [16]:
# repo_topics
# Parse the stringified list in each cell, join its items with ';' (pd.NA when the
# cell is missing or the list is empty), then lower-case the text.
df["repo_topics"] = (
    df["repo_topics"]
    .apply(lambda raw: literal_eval(raw) if pd.notna(raw) else [])
    .apply(lambda topics: ';'.join(map(str, topics)) if topics else pd.NA)
    .str.lower()
)


In [19]:
# Collect every language token that occurs in the "repo_languages" column.
# Tokens are gathered directly from each ';'-separated cell, and empty tokens are
# skipped, so the set no longer contains the "" entry that the trailing ';' used
# to introduce.
repo_languages_list = [
    token
    for langs in df["repo_languages"].dropna()
    for token in langs.split(";")
    if token
]
repo_languages = ";".join(repo_languages_list)   # all tokens as one ';'-separated string
repo_languages_set = set(repo_languages_list)    # unique language tokens

In [20]:
# For rows with no language but with topics, set the language to whatever
# filter_string returns for the topics and the set of known languages.
languages_missing = df["repo_languages"].isna()
topics_present = df["repo_topics"].notna()
mask = languages_missing & topics_present
df.loc[mask, "repo_languages"] = df.loc[mask, "repo_topics"].apply(
    lambda topics: filter_string(topics, repo_languages_set)
)

In [None]:
# Share of rows whose "repo_languages" value is missing
df["repo_languages"].isna().mean()

Missing values in column "repo_languages" account for 24.6% of the rows. We try to fill them with the repository topics that are also language names.

#### Handle missing/na values

In [22]:
# Label the languages that are still missing as "unknown".
df = df.assign(repo_languages=df["repo_languages"].fillna("unknown"))

#### Handle duplicates


In [23]:
# Keep the first row of every (repo_owner, repo_stars, repo_forks) combination.
# NOTE(review): "repo_name" is not part of the key, so two different repositories of
# the same owner with equal star and fork counts are treated as duplicates - confirm
# this is intended.
df = df.drop_duplicates(subset=["repo_owner", "repo_stars", "repo_forks"])

#### Transform

In [None]:
# "repo_topics" is not needed any more once the languages have been filled in.
df = df.drop(columns=["repo_topics"])
df

In [25]:
# Write the cleaned frame (without the index) to the path configured under
# data -> clean -> file_data_cleaned.
cleaned_csv_path = config["data"]["clean"]["file_data_cleaned"]
df.to_csv(cleaned_csv_path, index=False)

In [26]:
# languages = ""
# for lang in df["repo_languages"]:
#     if not pd.isna(lang):
#         languages += lang + ";"
    
# languages_list = languages.split(';')
# languages_set  = set(languages_list)

In [27]:

# categories = ""
# for cate in df["ext_categories"]:
#     categories += cate + ";"

# categories_list = categories.split(';')
# categories_set  = set(categories_list)

# categories_set