In [24]:
import pandas as pd
from IPython.display import display

# Načtení nahraného datasetu
file_path = 'Sorted_Client_Data.csv'
df = pd.read_csv(file_path)

# Odstranění duplicit podle client_id
df = df.drop_duplicates(subset='client_id')

# Konverze expected_revenue na numerický typ
df['expected_revenue'] = pd.to_numeric(df['expected_revenue'], errors='coerce')

# Filtrace klientů, kteří nejsou v defaultu
df = df[df['in_default'] == 'no']

# Skupina 1: Klienti s nejvyšším expected_revenue a prob_1 alespoň 0.3
group1 = df[df['prob_1'] >= 0.3].sort_values(by='expected_revenue', ascending=False)

# Skupina 2: Klienti s nejvyšším prob_1
group2 = df.sort_values(by='prob_1', ascending=False)

# Počet klientů v každé skupině
total_clients = 5000

# Výpočet metrik pro různé poměry
results = []

# For Loop, který přiděluje ratio
for g1_ratio in range(11):
    g2_ratio = 10 - g1_ratio
    n_g1 = (g1_ratio * total_clients) // 10
    n_g2 = total_clients - n_g1

    selected_group1 = group1.head(n_g1)
    selected_group2 = group2.head(n_g2)

    combined_group = pd.concat([selected_group1, selected_group2]).drop_duplicates(subset='client_id').head(total_clients)

    avg_prob_1 = combined_group['prob_1'].mean()
    total_expected_revenue = combined_group['expected_revenue'].sum()

    results.append({
        'group1_ratio': g1_ratio / 10,
        'group2_ratio': g2_ratio / 10,
        'avg_prob_1': avg_prob_1,
        'total_expected_revenue': total_expected_revenue
    })

# Převod výsledků na DataFrame pro snadnou analýzu
results_df = pd.DataFrame(results)

# Zobrazení DataFrame
display(results_df)

# Výběr top 5000 klientů podle prob_1
top_5000_clients = df.nlargest(5000, 'prob_1')

# Výpočet průměrné hodnoty prob_1 a celkových očekávaných příjmů
average_prob_1 = top_5000_clients['prob_1'].mean()
total_expected_revenue = top_5000_clients['expected_revenue'].sum()

print(f'Average prob_1: {average_prob_1}')
print(f'Total expected revenue: {total_expected_revenue}')

# Uložení top 5000 klientů do nového CSV souboru
output_file_path = 'Top_5000_Clients.csv'
top_5000_clients.to_csv(output_file_path, index=False)


Unnamed: 0,group1_ratio,group2_ratio,avg_prob_1,total_expected_revenue
0,0.0,1.0,0.719438,7489311.0
1,0.1,0.9,0.734926,6966852.0
2,0.2,0.8,0.740863,6891411.0
3,0.3,0.7,0.736188,7044386.0
4,0.4,0.6,0.724239,7300488.0
5,0.5,0.5,0.7042,7652774.0
6,0.6,0.4,0.683387,7971284.0
7,0.7,0.3,0.661241,8288507.0
8,0.8,0.2,0.639595,8555809.0
9,0.9,0.1,0.622172,8941178.0


Average prob_1: 0.719438
Total expected revenue: 7528812.992641432
