In [None]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency, kstest, boxcox

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#lib
from lib.clean_data_functions import clean_ext_version, clean_ext_publisher, clean_repo_publisher

#
from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval
from collections import Counter

# os
import os

# time
import time

import warnings
# warnings.filterwarnings("ignore")    # (Optional)

# Record the library versions this notebook was run with.
print(
    "Project has been created with Pandas: ",
    pd.__version__,
    " And with Numpy: ",
    np.__version__,
)

### Loading

In [2]:
import yaml

# Load the project configuration; the data-loading cell reads file paths from `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported as "not found". Any other failure
    # (e.g. a YAML syntax error) propagates with its own traceback instead of
    # being hidden behind a misleading message.
    print("Yaml configuration file not found!")
    # Re-raise: continuing would leave `config` undefined and the next cell
    # would fail with an unrelated NameError.
    raise

In [None]:
# Read the cleaned EDA dataset from the path stored in the YAML configuration.
eda_cleaned_path = config["data"]["clean"]["file_eda_cleaned"]
df = pd.read_csv(eda_cleaned_path)
# df = df.sort_values(by = ["ext_install_count", "ext_rating"], ascending= False)
df.head()

### 5. Preprocessing 

In [None]:
# Split the column names by dtype: int64/float64 columns vs. object/category columns.
cols_num = list(df.select_dtypes(include=["int64", "float64"]).columns)
cols_cat = list(df.select_dtypes(include=["object", "category"]).columns)

Handle duplicates

In [6]:
# Count the rows that `DataFrame.duplicated()` flags with its default settings.
df.duplicated().sum()

np.int64(782)

Handle missing values

In [None]:
# Number of missing values per column.
# (The previous `df.duplicated.sum()` referenced the bound method without calling
# it, which raises AttributeError; this section is about missing values.)
df.isna().sum()

### Outliers (TODO: move to Preprocessing; this does not belong in EDA)

**Outliers detection**

In [None]:
def detect_iqr_outliers(df: pd.DataFrame, column):
    """Collect the values of ``df[column]`` that lie outside the 1.5 * IQR fences.

    Args:
        df: Frame holding the data to inspect.
        column: Label of the column whose values are checked.

    Returns:
        A list with every value strictly above ``Q3 + 1.5 * IQR`` or strictly
        below ``Q1 - 1.5 * IQR``, in the order the values appear in the column.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_offset = 1.5 * (third_quartile - first_quartile)
    low_fence = first_quartile - fence_offset
    high_fence = third_quartile + fence_offset

    # Same comparisons as a plain loop: values for which both tests are False
    # (including NaN) are not reported.
    return [v for v in values if v > high_fence or v < low_fence]

# Report, for each numeric column, how many values fall outside the IQR fences.
for col in cols_num:
    outlier_count = len(detect_iqr_outliers(df, col))
    print(f" Total outliers in '{col}':  {outlier_count} \n")

**Outliers handle**

- IQR outlier removal -> deletes observations, so it is not going to be used

In [None]:
# def remove_iqr_outliers(df: pd.DataFrame, column) -> pd.DataFrame:
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
#     return df

- Transform 

In [None]:
# Apply a Box-Cox transformation to "ext_install_count" (high skewness, positive values).
# NOTE: this overwrites the column it reads from, so the cell is not idempotent;
# reload `df` before re-running it, otherwise already-transformed values are
# transformed a second time.

# Box-Cox is only defined for strictly positive data. Fail early and say how
# many rows violate that instead of relying on scipy's generic error.
non_positive_count = int((df["ext_install_count"] <= 0).sum())
if non_positive_count:
    raise ValueError(
        "Box-Cox requires strictly positive values; 'ext_install_count' has "
        f"{non_positive_count} value(s) <= 0."
    )

df["ext_install_count"], power_install_count = boxcox(df["ext_install_count"])

# Second value returned by boxcox (the fitted transformation parameter).
print(power_install_count)

# Plot the distribution of the transformed column.
plt.figure(figsize=(10,5))
sns.histplot(df["ext_install_count"], kde=True)
plt.title("Box-Cox Transformed ext_install_count")
plt.show()

