In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

def incdec_calculate(df):
    """Add day-over-day download changes for one month of daily columns.

    For every pair of consecutive day columns (``1Day``/``2Day`` up to
    ``29Day``/``30Day``) a ``Change_<N>Day`` column is added holding
    ``<N>Day - <N-1>Day``.

    Args:
        df: DataFrame with columns ``1Day`` .. ``30Day``.

    Returns:
        The same DataFrame object, modified in place.
    """
    for later in range(2, 31):
        earlier = later - 1
        delta = df[f"{later}Day"] - df[f"{earlier}Day"]
        df.loc[:, f"Change_{later}Day"] = delta
    return df

def incdecRate_calculate(df):
    """Add day-over-day download growth rates (in percent) for one month.

    For N = 2 .. 30 a ``ChangeRate_<N>Day`` column is added, computed as
    ``Change_<N>Day / <N-1>Day * 100``. The ``Change_*`` columns must already
    exist on ``df``.

    Args:
        df: DataFrame with ``1Day`` .. ``29Day`` and ``Change_2Day`` ..
            ``Change_30Day`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for prev_day, cur_day in zip(range(1, 30), range(2, 31)):
        baseline = df[f"{prev_day}Day"]
        change = df[f"Change_{cur_day}Day"]
        # No guard against a zero baseline: the division result is stored as is.
        df[f"ChangeRate_{cur_day}Day"] = (change / baseline) * 100
    return df

def cumulative_changes(df):
    """Add the cumulative download count over the ``1Day`` .. ``29Day`` columns.

    The columns are selected with a label slice, so every column positioned
    between ``1Day`` and ``29Day`` in ``df`` takes part in the running total.
    The final column of that running total is stored as
    ``Cumulative_Recent_29Day``.

    Args:
        df: DataFrame containing the ``1Day`` .. ``29Day`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    window = df.loc[:, '1Day':'29Day']
    running_total = window.cumsum(axis=1)
    df['Cumulative_Recent_29Day'] = running_total.iloc[:, -1]
    return df

# Load the daily download table and derive the per-day change and change-rate
# columns. Both helpers add columns in place and return the frame they were
# given, so `df`, `df_incdec` and `df_incdec_Rate` are the same object here.
# NOTE(review): incdecRate_calculate divides by the previous day's value with
# no zero guard — confirm how rows with a zero-download day should be ranked.
df = pd.read_csv('20240617_Daily_Download.csv')
df_incdec = incdec_calculate(df)
df_incdec_Rate = incdecRate_calculate(df_incdec) 

# Continue on a copy so the frame loaded above is not modified further.
df_incdec_Rate_copy = df_incdec_Rate.copy()

# Weight applied to each daily change-rate column; the weights grow with the
# day number. ChangeRate_30Day has no entry, so it does not contribute.
weights = {
    'ChangeRate_2Day': 1, 'ChangeRate_3Day': 2, 'ChangeRate_4Day': 3,
    'ChangeRate_5Day': 4, 'ChangeRate_6Day': 5, 'ChangeRate_7Day': 6,
    'ChangeRate_8Day': 7, 'ChangeRate_9Day': 8, 'ChangeRate_10Day': 9, 
    'ChangeRate_11Day': 10, 'ChangeRate_12Day': 15, 'ChangeRate_13Day': 20,
    'ChangeRate_14Day': 25, 'ChangeRate_15Day': 30, 'ChangeRate_16Day': 35,
    'ChangeRate_17Day': 40, 'ChangeRate_18Day': 50, 'ChangeRate_19Day': 60,
    'ChangeRate_20Day': 71, 'ChangeRate_21Day': 75, 'ChangeRate_22Day': 79,
    'ChangeRate_23Day': 83, 'ChangeRate_24Day': 87, 'ChangeRate_25Day': 90,
    'ChangeRate_26Day': 93, 'ChangeRate_27Day': 96, 'ChangeRate_28Day': 98,
    'ChangeRate_29Day': 100
}

# Weighted average of the daily change rates: weighted sum / total weight.
df_incdec_Rate_copy.loc[:, 'Change_2Day_to_29Day_WeightedSum'] = sum(
    df_incdec_Rate_copy[col] * weight for col, weight in weights.items()
) / sum(weights.values())

# Order rows by weighted growth, best first. `df_incdec` is rebound to the
# sorted copy from here on.
df_incdec = df_incdec_Rate_copy.sort_values(by='Change_2Day_to_29Day_WeightedSum', ascending=False)

# Growth score: the first (highest-growth) row gets len(df), the last gets 1.
df_incdec['inc_score'] = range(len(df_incdec), 0, -1)
# '1Day' .. '30Day'
days_columns = [f"{i+1}Day" for i in range(30)]

# --- First tiering pass (ordered by growth score) ---------------------------
# A row belongs to a tier when its downloads are >= the tier threshold on
# every one of the 30 days. Each lower tier drops, by index, the rows already
# placed in a higher tier. Rows that fail even the >= 0 test (a negative or
# NaN day value) end up in no tier and are absent from the merged result.
ge_10000_df = df_incdec[df_incdec[days_columns].ge(10000).all(axis=1)]
ge_10000_df_cum = ge_10000_df.sort_values(by='inc_score', ascending=False)

ge_1000_df = df_incdec[df_incdec[days_columns].ge(1000).all(axis=1)]
ge_1000_df = ge_1000_df[~ge_1000_df.index.isin(ge_10000_df.index)]
ge_1000_df_cum = ge_1000_df.sort_values(by='inc_score', ascending=False)

ge_100_df = df_incdec[df_incdec[days_columns].ge(100).all(axis=1)]
ge_100_df = ge_100_df[~ge_100_df.index.isin(ge_1000_df.index.union(ge_10000_df.index))]
ge_100_df_cum = ge_100_df.sort_values(by='inc_score', ascending=False)

ge_50_df = df_incdec[df_incdec[days_columns].ge(50).all(axis=1)]
ge_50_df = ge_50_df[~ge_50_df.index.isin(ge_100_df.index.union(ge_1000_df.index).union(ge_10000_df.index))]
ge_50_df_cum = ge_50_df.sort_values(by='inc_score', ascending=False)

ge_10_df = df_incdec[df_incdec[days_columns].ge(10).all(axis=1)]
ge_10_df = ge_10_df[~ge_10_df.index.isin(ge_50_df.index.union(ge_100_df.index).union(ge_1000_df.index).union(ge_10000_df.index))]
ge_10_df_cum = ge_10_df.sort_values(by='inc_score', ascending=False)

ge_0_df = df_incdec[df_incdec[days_columns].ge(0).all(axis=1)]
ge_0_df = ge_0_df[~ge_0_df.index.isin(ge_10_df.index.union(ge_50_df.index).union(ge_100_df.index).union(ge_1000_df.index).union(ge_10000_df.index))]
ge_0_df_cum = ge_0_df.sort_values(by='inc_score', ascending=False)

# Stack the tiers from highest to lowest threshold.
dfs = [ge_10000_df_cum, ge_1000_df_cum, ge_100_df_cum, ge_50_df_cum, ge_10_df_cum, ge_0_df_cum]

# The index is reset, so positions in the stacked order become the new index.
merged_df = pd.concat(dfs, ignore_index=True)
# Tier-aware growth score: top row of the highest tier gets len(merged_df).
merged_df['inc_sort_score'] = range(len(merged_df), 0, -1)

# Add the cumulative download column to a copy of the tier-stacked frame and
# score rows by that total (largest total gets the highest sum_score).
merged_df_copy = merged_df.copy()
df_incdec_WeightSum_cumulative = cumulative_changes(merged_df_copy)
sorted_df = df_incdec_WeightSum_cumulative.sort_values(by='Cumulative_Recent_29Day', ascending=False)
sorted_df['sum_score'] = range(len(sorted_df), 0, -1)

# --- Second tiering pass (ordered by cumulative downloads) ------------------
# Build each tier by removing the rows already contained in a higher tier.
# The ge_* names from the first pass are reused and overwritten here.
ge_10000_df = sorted_df[sorted_df[days_columns].ge(10000).all(axis=1)]
ge_10000_df_cum = ge_10000_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

ge_1000_df = sorted_df[sorted_df[days_columns].ge(1000).all(axis=1)]
ge_1000_df = ge_1000_df[~ge_1000_df.index.isin(ge_10000_df.index)]
ge_1000_df_cum = ge_1000_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

ge_100_df = sorted_df[sorted_df[days_columns].ge(100).all(axis=1)]
ge_100_df = ge_100_df[~ge_100_df.index.isin(ge_1000_df.index.union(ge_10000_df.index))]
ge_100_df_cum = ge_100_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

ge_50_df = sorted_df[sorted_df[days_columns].ge(50).all(axis=1)]
ge_50_df = ge_50_df[~ge_50_df.index.isin(ge_100_df.index.union(ge_1000_df.index).union(ge_10000_df.index))]
ge_50_df_cum = ge_50_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

ge_10_df = sorted_df[sorted_df[days_columns].ge(10).all(axis=1)]
ge_10_df = ge_10_df[~ge_10_df.index.isin(ge_50_df.index.union(ge_100_df.index).union(ge_1000_df.index).union(ge_10000_df.index))]
ge_10_df_cum = ge_10_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

ge_0_df = sorted_df[sorted_df[days_columns].ge(0).all(axis=1)]
ge_0_df = ge_0_df[~ge_0_df.index.isin(ge_10_df.index.union(ge_50_df.index).union(ge_100_df.index).union(ge_1000_df.index).union(ge_10000_df.index))]
ge_0_df_cum = ge_0_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

# Report how many rows landed in each tier.
len_ge_10000_df = len(ge_10000_df_cum)
len_ge_1000_df = len(ge_1000_df_cum)
len_ge_100_df = len(ge_100_df_cum)
len_ge_50_df = len(ge_50_df_cum)
len_ge_10_df = len(ge_10_df_cum)
len_ge_0_df = len(ge_0_df_cum)

print(f"Length of ge_10000_df: {len_ge_10000_df}")
print(f"Length of ge_1000_df: {len_ge_1000_df}")
print(f"Length of ge_100_df: {len_ge_100_df}")
print(f"Length of ge_50_df: {len_ge_50_df}")
print(f"Length of ge_10_df: {len_ge_10_df}")
print(f"Length of ge_0_df: {len_ge_0_df}")

# List of DataFrames to merge, highest tier first.
dfs = [ge_10000_df_cum, ge_1000_df_cum, ge_100_df_cum, ge_50_df_cum, ge_10_df_cum, ge_0_df_cum]

# Merge all tier DataFrames into one.
merged_df = pd.concat(dfs, ignore_index=True)

# Consistency score: position in the tier-stacked, cumulative-sorted order
# (top row of the highest tier gets len(merged_df)).
merged_df['consistency_score'] = range(len(merged_df), 0, -1)
merged_df = merged_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

# Pull the three score columns out as plain lists, row-aligned.
sum_score_list = merged_df['sum_score'].to_list()
inc_score_list = merged_df['inc_sort_score'].to_list()
consistency_score_list = merged_df['consistency_score'].to_list()

data = {
    'Total_Downloads_Rank': sum_score_list,
    'Growth_Rate_Rank': inc_score_list,
    'Consistency_Rank': consistency_score_list
}
# NOTE: `df` is rebound here; it no longer refers to the loaded download table.
df = pd.DataFrame(data)

# Compute the Spearman rank correlation for each pair of scores.
spearman_corr_total_growth, p_value_total_growth = stats.spearmanr(df['Total_Downloads_Rank'], df['Growth_Rate_Rank'])
spearman_corr_total_consistency, p_value_total_consistency = stats.spearmanr(df['Total_Downloads_Rank'], df['Consistency_Rank'])
spearman_corr_growth_consistency, p_value_growth_consistency = stats.spearmanr(df['Growth_Rate_Rank'], df['Consistency_Rank'])

# Print the results.
print(f"Spearman correlation between Total Downloads and Growth Rate: {spearman_corr_total_growth}, P-value: {p_value_total_growth}")
print(f"Spearman correlation between Total Downloads and Consistency: {spearman_corr_total_consistency}, P-value: {p_value_total_consistency}")
print(f"Spearman correlation between Growth Rate and Consistency: {spearman_corr_growth_consistency}, P-value: {p_value_growth_consistency}")

# Final ranking: unweighted mean of the three scores, best first, saved to CSV.
# NOTE(review): the output file name is dated 20240616 while the input file is
# dated 20240617 — confirm this is intentional.
merged_df['row_mean'] = merged_df[['inc_sort_score', 'sum_score', 'consistency_score']].mean(axis=1)
mean_df = merged_df.sort_values(by='row_mean', ascending=False)
mean_df.to_csv('20240616_모델_순위.csv')

Length of ge_10000_df: 84
Length of ge_1000_df: 303
Length of ge_100_df: 769
Length of ge_50_df: 400
Length of ge_10_df: 1758
Length of ge_0_df: 2075
Spearman correlation between Total Downloads and Growth Rate: 0.6851864193315802, P-value: 0.0
Spearman correlation between Total Downloads and Consistency: 0.8253896429115307, P-value: 0.0
Spearman correlation between Growth Rate and Consistency: 0.9399274348585726, P-value: 0.0


In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

class DownloadDataProcessor:
    """Rank rows of a daily-download table by growth, volume and tiered volume.

    The input CSV must provide the columns ``1Day`` .. ``30Day``. Calling
    :meth:`process_data` derives three scores per row (``inc_sort_score``,
    ``sum_score``, ``consistency_score``), prints their pairwise Spearman
    correlations and writes the rows ordered by the mean of the three scores.

    Attributes are documented inline in ``__init__``.
    """

    # Minimum download floors that define the tiers, highest tier first. A row
    # belongs to the first tier whose floor it meets on every day column.
    TIER_THRESHOLDS = (10000, 1000, 100, 50, 10, 0)

    def __init__(self, file_path):
        """Load the download table from ``file_path`` (a CSV readable by pandas)."""
        # Working frame; the processing methods add columns to it / replace it.
        self.df = pd.read_csv(file_path)
        # '1Day' .. '30Day'
        self.days_columns = [f"{i+1}Day" for i in range(30)]
        # Weight per daily change-rate column; weights grow with the day number.
        # ChangeRate_30Day has no entry, so it does not contribute.
        self.weights = {
            'ChangeRate_2Day': 1, 'ChangeRate_3Day': 2, 'ChangeRate_4Day': 3,
            'ChangeRate_5Day': 4, 'ChangeRate_6Day': 5, 'ChangeRate_7Day': 6,
            'ChangeRate_8Day': 7, 'ChangeRate_9Day': 8, 'ChangeRate_10Day': 9,
            'ChangeRate_11Day': 10, 'ChangeRate_12Day': 15, 'ChangeRate_13Day': 20,
            'ChangeRate_14Day': 25, 'ChangeRate_15Day': 30, 'ChangeRate_16Day': 35,
            'ChangeRate_17Day': 40, 'ChangeRate_18Day': 50, 'ChangeRate_19Day': 60,
            'ChangeRate_20Day': 71, 'ChangeRate_21Day': 75, 'ChangeRate_22Day': 79,
            'ChangeRate_23Day': 83, 'ChangeRate_24Day': 87, 'ChangeRate_25Day': 90,
            'ChangeRate_26Day': 93, 'ChangeRate_27Day': 96, 'ChangeRate_28Day': 98,
            'ChangeRate_29Day': 100,
        }

    def incdec_calculate(self):
        """Add ``Change_<N>Day`` = ``<N>Day - <N-1>Day`` for N = 2 .. 30.

        Returns:
            ``self.df``, modified in place.
        """
        for day in range(1, 30):
            self.df.loc[:, f"Change_{day+1}Day"] = self.df.loc[:, f"{day+1}Day"] - self.df.loc[:, f"{day}Day"]
        return self.df

    def incdecRate_calculate(self):
        """Add ``ChangeRate_<N>Day`` = ``Change_<N>Day / <N-1>Day * 100`` for N = 2 .. 30.

        Requires the ``Change_*`` columns from :meth:`incdec_calculate`.

        Returns:
            ``self.df``, modified in place.
        """
        for day in range(1, 30):
            self.df[f"ChangeRate_{day+1}Day"] = (self.df[f"Change_{day+1}Day"] / self.df[f"{day}Day"]) * 100
        return self.df

    def cumulative_changes(self):
        """Add ``Cumulative_Recent_29Day``: the running total over ``1Day`` .. ``29Day``.

        Returns:
            ``self.df``, modified in place.
        """
        self.df['Cumulative_Recent_29Day'] = self.df.loc[:, '1Day':'29Day'].cumsum(axis=1).iloc[:, -1]
        return self.df

    def _stack_tiers(self, frame, sort_by):
        """Split ``frame`` into download tiers and stack them, highest tier first.

        A row joins the first tier in ``TIER_THRESHOLDS`` whose floor it meets
        on every column in ``self.days_columns``. Rows that meet no floor (a
        negative or NaN day value) are dropped. Within a tier, rows are ordered
        by ``sort_by`` descending. The returned frame has a fresh 0..n-1 index.
        """
        daily = frame[self.days_columns]
        claimed = pd.Series(False, index=frame.index)
        tiers = []
        for floor in self.TIER_THRESHOLDS:
            qualifies = daily.ge(floor).all(axis=1)
            # Only rows not already taken by a higher tier.
            tier = frame[qualifies & ~claimed]
            tiers.append(tier.sort_values(by=sort_by, ascending=False))
            claimed = claimed | qualifies
        return pd.concat(tiers, ignore_index=True)

    def process_data(self, output_path='20240616_모델_순위.csv'):
        """Score every row, print rank correlations and write the final ranking.

        Steps:
            1. Derive daily changes and change rates, then their weighted average.
            2. ``inc_sort_score``: position after tiering, ordered by growth.
            3. ``sum_score``: position when ordered by cumulative downloads.
            4. ``consistency_score``: position after tiering, ordered by
               cumulative downloads.
            5. ``row_mean``: mean of the three scores; rows sorted by it.

        After the call ``self.df`` holds the tier-stacked frame from step 2
        plus the cumulative column.

        Args:
            output_path: Destination passed to ``DataFrame.to_csv``. Defaults
                to the file name the original script used.

        Returns:
            The ranked DataFrame that was written to ``output_path``.
        """
        self.incdec_calculate()
        self.incdecRate_calculate()

        # Weighted average of the daily change rates.
        total_weight = sum(self.weights.values())
        self.df['Change_2Day_to_29Day_WeightedSum'] = sum(
            self.df[col] * weight for col, weight in self.weights.items()
        ) / total_weight

        self.df = self.df.sort_values(by='Change_2Day_to_29Day_WeightedSum', ascending=False)
        self.df['inc_score'] = range(len(self.df), 0, -1)

        growth_ranked = self._stack_tiers(self.df, 'inc_score')
        growth_ranked['inc_sort_score'] = range(len(growth_ranked), 0, -1)

        # Carry the tier-stacked frame forward. Computing the cumulative column
        # on the pre-tiering frame would lose 'inc_sort_score' and make the
        # score lookup below fail with KeyError.
        self.df = growth_ranked
        self.cumulative_changes()
        sorted_df = self.df.sort_values(by='Cumulative_Recent_29Day', ascending=False)
        sorted_df['sum_score'] = range(len(sorted_df), 0, -1)

        merged_df = self._stack_tiers(sorted_df, 'Cumulative_Recent_29Day')
        merged_df['consistency_score'] = range(len(merged_df), 0, -1)
        merged_df = merged_df.sort_values(by='Cumulative_Recent_29Day', ascending=False)

        # Spearman rank correlation for each pair of scores.
        rank_pairs = (
            ("Total Downloads", 'sum_score', "Growth Rate", 'inc_sort_score'),
            ("Total Downloads", 'sum_score', "Consistency", 'consistency_score'),
            ("Growth Rate", 'inc_sort_score', "Consistency", 'consistency_score'),
        )
        for left_label, left_col, right_label, right_col in rank_pairs:
            corr, p_value = stats.spearmanr(merged_df[left_col], merged_df[right_col])
            print(f"Spearman correlation between {left_label} and {right_label}: {corr}, P-value: {p_value}")

        merged_df['row_mean'] = merged_df[['inc_sort_score', 'sum_score', 'consistency_score']].mean(axis=1)
        mean_df = merged_df.sort_values(by='row_mean', ascending=False)
        mean_df.to_csv(output_path)
        return mean_df

if __name__ == "__main__":
    # Script entry point: load the daily download CSV and run the full
    # scoring / ranking pipeline.
    processor = DownloadDataProcessor(
        file_path='20240617_Daily_Download.csv',
    )
    processor.process_data()
