使用四分位距法和截尾法进行异常值处理

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the line-level JSON inputs; the paths are relative to the working directory.
linelevel_pre_false = pd.read_json('linelevel_pre_false.json')
linelevel_pre_true = pd.read_json('linelevel_pre_true.json')

# Attribution tables; later cells read their 'key' and 'value' columns.
linelevel_attribution_false = pd.read_json('linelevel_attribution_false_AfterAlert.json')
linelevel_attribution_true = pd.read_json('linelevel_attribution_true_AfterAlert.json')
# Scale every attribution value by a factor of 10.
# NOTE(review): the reason for the factor of 10 is not stated in this notebook — confirm.
linelevel_attribution_false['value'] = linelevel_attribution_false['value'] * 10
linelevel_attribution_true['value'] = linelevel_attribution_true['value'] * 10

In [3]:
len(linelevel_pre_false),len(linelevel_attribution_false)

(965, 3921)

In [4]:
len(linelevel_pre_true),len(linelevel_attribution_true)

(2174, 9202)

true 数据（linelevel_attribution_true）

In [5]:
# Column order passed to get_all_colunm_df. Every entry has to match a value of
# the 'key' column in the attribution data; the column selection inside
# get_all_colunm_df raises KeyError for any entry that is absent.
line_type = ['MethodDeclaration_num','variableDeclaration_num','expression_num','ifstatement_num',
             'forstatement_num','whilestatement_num','trystatement_num', 'returnstatement_num']

In [6]:
def get_list_print_df(df, name):
    """Collect the ``value`` entries of every row whose ``key`` equals ``name``.

    Args:
        df: Long-format DataFrame with a ``key`` column and a ``value`` column.
        name: Key to select; also used as the name of the output column.

    Returns:
        A one-column DataFrame named ``name`` holding the matching values in
        their original row order, indexed from 0. Empty if no row matches.
    """
    # Select rows by content with a boolean mask instead of looking up labels
    # 0..len(df)-1 one by one: this works for any index (filtered, re-ordered,
    # non-integer) and avoids a Python-level loop over every row.
    matched = df.loc[df['key'] == name, 'value']
    return pd.DataFrame(matched.tolist(), columns=[name])

In [7]:
def get_all_colunm_df(df,column_order):
    """Turn a long key/value frame into a table with one column per key.

    Args:
        df: Long-format DataFrame with ``key`` and ``value`` columns.
        column_order: Keys to keep, in the order the output columns should appear.

    Returns:
        DataFrame with one column per entry of ``column_order``, each name with
        the ``'_num'`` substring removed. Keys with fewer rows than the longest
        key are padded with NaN at the bottom.

    Raises:
        KeyError: If an entry of ``column_order`` never occurs in ``df['key']``.
    """
    unique_values = df['key'].value_counts().index

    # Fail early and say which keys are missing and which ones exist; the bare
    # "not in index" error from the column selection gives no hint about what
    # the data actually contains.
    available = set(unique_values)
    missing = [name for name in column_order if name not in available]
    if missing:
        raise KeyError(
            f"{missing} not found in df['key']; "
            f"available keys: {sorted(map(str, available))}"
        )

    # Build every per-key column first and concatenate once, instead of
    # re-copying the growing result on each loop iteration.
    frames = [get_list_print_df(df, uniq) for uniq in unique_values]
    result = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    # Reorder the columns
    result = result[column_order]

    # Remove the '_num' substring from the column names
    return result.rename(columns=lambda x: x.replace('_num', ''))

In [8]:
# Build the per-line-type table from linelevel_attribution_true.
# NOTE(review): the saved output of this cell is a KeyError for 'expression_num',
# i.e. that key did not occur in linelevel_attribution_true['key'] — confirm the
# key names used in the JSON and adjust line_type accordingly.
all_colunm_df_true = get_all_colunm_df(linelevel_attribution_true,line_type)
all_colunm_df_true.head()

KeyError: "['expression_num'] not in index"

In [None]:
mean_value = all_colunm_df_true.mean()
mean_value 

In [None]:
std_value = all_colunm_df_true.std()
std_value

In [None]:
# Draw the same kind of figure as in the paper
def get_paper_picture(df):
    """Plot every column of ``df`` as a violin, a jittered strip and a box, plus mean lines.

    Also prints the per-column means, first in column order and then sorted
    from largest to smallest.

    Args:
        df: DataFrame whose columns are plotted one after another. One colour
            of the 'pastel' palette is indexed per column position.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    
    # Figure size; adjust as needed
    plt.figure(figsize=(40, 8))

    # Colour palette
    palette = sns.color_palette('pastel')

    # Draw the plots, one column at a time
    for i, col in enumerate(df.columns):
        # Violin plot
        sns.violinplot(x=np.full(len(df), i), y=df[col], inner=None, hue=np.full(len(df), i), palette=[palette[i]], legend=False)
        # Strip (scatter) plot, given an x value shifted by -0.5
        sns.stripplot(x=np.full(len(df), i) - 0.5, y=df[col], color=palette[i], alpha=0.5, jitter=0.2, legend=False)
        # Box plot
        sns.boxplot(x=np.full(len(df), i), y=df[col], width=0.2, showcaps=True, showfliers=False,  # do not show outliers
                    boxprops=dict(facecolor='none', edgecolor='black', linewidth=1.5),  # thickness of the box outline
                    whiskerprops=dict(color='black', linewidth=2), 
                    capprops=dict(color='black', linewidth=2), 
                    medianprops=dict(color='#8B8B83', linewidth=2), 
                    flierprops=dict(marker='o', color='red', alpha=0.5))
    # Mean lines
    mean_value = df.mean()
    print("均值 : ")
    print(mean_value)
    mean_value_sorted = mean_value.sort_values(ascending=False)
    print("\n从大到小排序后的均值 : ")
    print(mean_value_sorted)
    
    # Repeat each column mean twice so that one dashed line is drawn at every
    # integer x position 0 .. 2*ncols-1 (wider line on even positions).
    # NOTE(review): this, and the two-labels-per-column tick list below,
    # presumably rely on seaborn placing each column's violin/box and its strip
    # in adjacent categorical slots in call order — confirm against the seaborn
    # version in use.
    new_mean_value = [item for item in mean_value for _ in range(2)]
    for i in range(len(new_mean_value)):
        if i % 2 == 0:
            plt.plot([i - 0.5, i + 0.5], [new_mean_value[i], new_mean_value[i]], color='blue', linestyle='--', linewidth=2)
        else:
            plt.plot([i - 0.3, i + 0.3], [new_mean_value[i], new_mean_value[i]], color='blue', linestyle='--', linewidth=2)


    # X-axis tick labels: for each column, its name followed by '<name>_num'
    label_name = [item for sublist in [[x, f'{x}_num'] for x in df.columns] for item in sublist]
    ax = plt.gca()
    ax.set_xticks(np.arange(len(label_name)))  # tick positions for the labels
    ax.set_xticklabels(label_name)

    plt.ylabel('value')

    plt.show()


In [None]:
get_paper_picture(all_colunm_df_true)

In [None]:
all_colunm_df_true_abs = all_colunm_df_true.abs()
all_colunm_df_true_abs.head()

In [None]:
get_paper_picture(all_colunm_df_true_abs)

In [None]:
# 截尾处理


def Winsorization_outliers(df, lower_pct=1, upper_pct=99):
    """Blank out values outside a percentile band, column by column.

    Despite the name, this trims rather than winsorizes: a value below the
    ``lower_pct`` percentile or above the ``upper_pct`` percentile of its
    column is replaced with NaN, not clipped to the bound. One summary line is
    printed per numeric column, followed by the overall number of removed values.

    Args:
        df: DataFrame to clean. Its numeric columns are overwritten in place,
            so pass a copy if the original must be kept.
        lower_pct: Lower percentile in [0, 100]. Defaults to 1.
        upper_pct: Upper percentile in [0, 100]. Defaults to 99.

    Returns:
        The same ``df`` object, after modification.
    """
    total_nan_count_new = 0  # running total of removed values

    for column_name in df.select_dtypes(include=[np.number]).columns:
        column = df[column_name]
        observed = column.dropna()
        if observed.empty:
            # Nothing to take percentiles of; with NaN bounds no value is
            # flagged below, so the column is left as it is.
            q1 = q3 = np.nan
        else:
            q1, q3 = np.percentile(observed, [lower_pct, upper_pct])

        # Vectorized form of `x < q1 or x > q3`. NaN compares False on both
        # sides, so pre-existing gaps are neither flagged nor counted.
        out_of_range = (column < q1) | (column > q3)
        nan_count_new = int(out_of_range.sum())
        df[column_name] = column.mask(out_of_range)

        total_nan_count_new += nan_count_new    # overall number of removed values

        print(f'列 {column_name} 的第{lower_pct}个分位数: {q1} \t第{upper_pct}个分位数: {q3}\t此列删除数量{nan_count_new}')
    print(f'所有列的总共删除的数量: {total_nan_count_new}')
    return df

# Work on a copy: Winsorization_outliers overwrites the columns of the frame it receives.
all_colunm_df_true_for_Winsorization = all_colunm_df_true.copy()
all_colunm_df_true_Winsorization = Winsorization_outliers(all_colunm_df_true_for_Winsorization)

In [None]:
get_paper_picture(all_colunm_df_true_Winsorization)

In [None]:
all_colunm_df_Winsorization_true_abs = all_colunm_df_true_Winsorization.abs()
all_colunm_df_Winsorization_true_abs.head()

In [None]:
get_paper_picture(all_colunm_df_Winsorization_true_abs)

In [None]:
# 截尾处理 5%

def Winsorization_outliers_2(df):
    """Blank out values outside the 5th-95th percentile band, column by column.

    Despite the name, this trims rather than winsorizes: a value below the 5th
    or above the 95th percentile of its column is replaced with NaN, not
    clipped to the bound. One summary line is printed per numeric column,
    followed by the overall number of removed values.

    Args:
        df: DataFrame to clean. Its numeric columns are overwritten in place,
            so pass a copy if the original must be kept.

    Returns:
        The same ``df`` object, after modification.
    """
    lower_pct, upper_pct = 5, 95
    total_nan_count_new = 0  # running total of removed values

    for column_name in df.select_dtypes(include=[np.number]).columns:
        column = df[column_name]
        observed = column.dropna()
        if observed.empty:
            # Nothing to take percentiles of; with NaN bounds no value is
            # flagged below, so the column is left as it is.
            q1 = q3 = np.nan
        else:
            q1, q3 = np.percentile(observed, [lower_pct, upper_pct])

        # Vectorized form of `x < q1 or x > q3`. NaN compares False on both
        # sides, so pre-existing gaps are neither flagged nor counted.
        out_of_range = (column < q1) | (column > q3)
        nan_count_new = int(out_of_range.sum())
        df[column_name] = column.mask(out_of_range)

        total_nan_count_new += nan_count_new    # overall number of removed values

        # The labels are derived from the percentiles actually used; the old
        # text reported the 1st/99th percentile although 5/95 were computed.
        print(f'列 {column_name} 的第{lower_pct}个分位数: {q1} \t第{upper_pct}个分位数: {q3}\t此列删除数量{nan_count_new}')
    print(f'所有列的总共删除的数量: {total_nan_count_new}')
    return df

# Same procedure with the 5th/95th percentile bounds, again on a copy because
# Winsorization_outliers_2 overwrites the columns of the frame it receives.
all_colunm_df_true_for_Winsorization_5 = all_colunm_df_true.copy()
all_colunm_df_true_Winsorization_5 = Winsorization_outliers_2(all_colunm_df_true_for_Winsorization_5)

get_paper_picture(all_colunm_df_true_Winsorization_5)

In [None]:
all_colunm_df_Winsorization_true_5_abs = all_colunm_df_true_Winsorization_5.abs()
get_paper_picture(all_colunm_df_Winsorization_true_5_abs)

In [None]:
# 四分位距法
def iqr_outliers(df):
    """Replace values beyond the 1.5 * IQR fences with NaN, column by column.

    For every numeric column the quartiles, the fences and the number of
    removed values are printed, followed by the overall number of removed
    values. The numeric columns of ``df`` are overwritten in place and the
    same object is returned.
    """
    removed_total = 0  # running total of removed values

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for column_name in numeric_columns:
        series = df[column_name]
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        iqr = q3 - q1
        Lower_tail, Upper_tail = q1 - 1.5 * iqr, q3 + 1.5 * iqr

        # NaN count before the replacement
        missing_before = series.isna().sum()

        beyond_fences = (series < Lower_tail) | (series > Upper_tail)
        df[column_name] = series.mask(beyond_fences)

        # NaN count after the replacement; the difference is what was removed
        missing_after = df[column_name].isna().sum()
        removed = missing_after - missing_before

        removed_total += removed
        print(f'下四分位数: {q1}, 上四分位数: {q3}, 最小值下界: {Lower_tail}, 最大值上界: {Upper_tail}\t此列删除数量{removed}')
    print(f'所有列的总共删除的数量: {removed_total}')
    return df

# Work on a copy: iqr_outliers overwrites the columns of the frame it receives.
all_colunm_df_true_for_iqr = all_colunm_df_true.copy()
all_colunm_df_true_iqr = iqr_outliers(all_colunm_df_true_for_iqr)

In [None]:
get_paper_picture(all_colunm_df_true_iqr)

In [None]:
all_colunm_df_iqr_true_abs = all_colunm_df_true_iqr.abs()
all_colunm_df_iqr_true_abs.head()

In [None]:
get_paper_picture(all_colunm_df_iqr_true_abs)

false 数据（linelevel_attribution_false）

In [None]:
all_colunm_df_false = get_all_colunm_df(linelevel_attribution_false,line_type)
all_colunm_df_false.head()

In [None]:
std_value = all_colunm_df_false.std()
std_value

In [None]:
get_paper_picture(all_colunm_df_false)

绝对值均值

In [None]:
all_colunm_df_false_abs = all_colunm_df_false.abs()
all_colunm_df_false_abs.head()

In [None]:
get_paper_picture(all_colunm_df_false_abs)

In [None]:
all_colunm_df_false_for_Winsorization = all_colunm_df_false.copy()
all_colunm_df_false_Winsorization = Winsorization_outliers(all_colunm_df_false_for_Winsorization)

In [None]:
get_paper_picture(all_colunm_df_false_Winsorization)

In [None]:
all_colunm_df_false_for_Winsorization_5 = all_colunm_df_false.copy()
all_colunm_df_false_Winsorization_5 = Winsorization_outliers_2(all_colunm_df_false_for_Winsorization_5)

get_paper_picture(all_colunm_df_false_Winsorization_5)

In [None]:
all_colunm_df_Winsorization_false_5_abs = all_colunm_df_false_Winsorization_5.abs()
get_paper_picture(all_colunm_df_Winsorization_false_5_abs)

In [None]:
all_colunm_df_Winsorization_false_abs = all_colunm_df_false_Winsorization.abs()
all_colunm_df_Winsorization_false_abs.head()

In [None]:
get_paper_picture(all_colunm_df_Winsorization_false_abs)

In [None]:
all_colunm_df_false_for_iqr = all_colunm_df_false.copy()
all_colunm_df_false_iqr = iqr_outliers(all_colunm_df_false_for_iqr)

In [None]:
get_paper_picture(all_colunm_df_false_iqr)

In [None]:
all_colunm_df_iqr_false_abs = all_colunm_df_false_iqr.abs()
all_colunm_df_iqr_false_abs.head()

In [None]:
get_paper_picture(all_colunm_df_iqr_false_abs)