In [1]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import shapiro, norm

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

# typing
from typing import Dict,List

# os
import os

# time
import time

# warnings
import warnings
# Install a global "ignore" filter so that no warning of any category is shown.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import yaml


# Load the project configuration; the cells below read their data file paths
# from `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Report the problem, then re-raise: carrying on without `config` would only
    # surface later as a NameError far away from the real cause. Other errors
    # (e.g. malformed YAML) are no longer mislabelled as "file not found".
    print("Yaml configuration file not found!")
    raise

### 1. Loading

In [3]:
# Read the three raw CSV inputs; their locations are taken from the
# "data" -> "raw" section of the loaded configuration.
raw_paths = config["data"]["raw"]
df_scraped = pd.read_csv(raw_paths["file_scraped"])
df_verified = pd.read_csv(raw_paths["file_verified"])
df_vulnerable = pd.read_csv(raw_paths["file_vulnerable"])

In [None]:
# Print the (rows, columns) shape of each raw table right after loading.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for label, frame in raw_files.items():
    rows_cols = frame.shape
    print(f"Dimension of file '{label}': {rows_cols}")


### 2. Cleaning

#### Convert date


In [None]:
# Parse the "last_updated" column into datetimes; with errors="coerce",
# values that cannot be parsed become NaT instead of raising.
parsed_last_updated = pd.to_datetime(df_scraped["last_updated"], errors="coerce")
df_scraped["last_updated"] = parsed_last_updated

#### Clean column names and remove columns

In [None]:
# Normalize the column labels of all three tables in place:
# trim surrounding whitespace, lower-case, and turn spaces into underscores.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for frame in raw_files.values():
    normalized_labels = frame.columns.str.strip().str.lower().str.replace(" ", "_")
    frame.columns = normalized_labels

In [None]:
# Drop selected columns from each table and rename the key columns of the
# verified and vulnerable tables to the shared names "name" and "repository".
df_scraped = df_scraped.drop(columns=["id", "description"])

df_verified = (
    df_verified
    .drop(columns=["publisher", "install_count"])
    .rename(columns={"extension_name": "name", "source_code": "repository"})
)

vulnerable_dropped = [
    "repository_name",
    "critical_vulnerability_names",
    "high_vulnerability_names",
    "medium_vulnerability_names",
    "low_vulnerability_names",
]
df_vulnerable = (
    df_vulnerable
    .drop(columns=vulnerable_dropped)
    .rename(columns={"extension_name": "name", "repository_link": "repository"})
)

# Cell output: the remaining column labels of the three tables.
df_scraped.columns, df_verified.columns, df_vulnerable.columns

#### Check for duplicates and remove them

In [7]:
# Column combinations used to detect duplicate rows in each table.
scraped_key = ["name", "publisher"]
repository_key = ["name", "repository"]

# Keep the first row for every key combination (drop_duplicates default).
df_scraped = df_scraped.drop_duplicates(subset=scraped_key)
df_verified = df_verified.drop_duplicates(subset=repository_key)
df_vulnerable = df_vulnerable.drop_duplicates(subset=repository_key)

#### Check for null values and drop them

In [8]:
# Keep only verified rows that have a repository and vulnerable rows that
# have a name; rows with a missing value in those columns are discarded.
has_repository = df_verified["repository"].notna()
df_verified = df_verified[has_repository]

has_name = df_vulnerable["name"].notna()
df_vulnerable = df_vulnerable[has_name]

In [None]:
# Print the (rows, columns) shape of each table after the null and duplicate
# filtering above.
raw_files = dict(scraped=df_scraped, verified=df_verified, vulnerable=df_vulnerable)
for label, frame in raw_files.items():
    rows_cols = frame.shape
    print(f"Dimension of '{label}' after dropping null and duplicated: {rows_cols}")

#### Combine the cleaned DataFrames

In [12]:
# Inner-join the verified and vulnerable tables on (name, repository), then
# attach the scraped table by extension name (merge defaults to how="inner").
df_pre_clean = df_verified.merge(df_vulnerable, on=["name", "repository"])
df_clean = df_scraped.merge(df_pre_clean, on="name")

# Remove a trailing ".git" from each repository value.
# NOTE(review): this runs after the join on "repository", so values that differ
# only by a ".git" suffix between the two sources would not have matched —
# confirm both sources use the same form.
df_clean["repository"] = df_clean["repository"].map(
    lambda repo: repo[:-4] if repo.endswith(".git") else repo
)

# Persist the merged table to the path configured under "data" -> "clean".
df_clean.to_csv(config["data"]["clean"]["file_cleaned"])

In [None]:
# Preview the first rows of the merged table as the cell output.
df_clean.head()

In [None]:
# Reload the cleaned table from disk. The cell that writes this file calls
# `to_csv` without `index=False`, so the first CSV column holds the saved row
# index; `index_col=0` restores it as the index instead of letting it come back
# as a spurious "Unnamed: 0" data column.
df = pd.read_csv(config["data"]["clean"]["file_cleaned"], index_col=0)

# Cell output: (rows, columns) of the reloaded table.
df.shape

In [None]:
''