In [None]:
import numpy as np # for arrays, Series, List, Sets
import pandas as pd # for dataframes

#- 9 meses: 26483
#- 9 meses without package: 26006 
#- 9 meses without cobranding: 23646 
#- todo junto: 23191

In [None]:
# Load the raw dataset: pipe-delimited, with the last line of the file skipped as a footer.
# skipfooter is only supported by the Python parsing engine, hence engine='python'.
data = pd.read_csv('data.csv', sep='|', skipfooter=1, engine='python')

In [None]:
# (rows, columns) of the raw data.
data.shape

In [None]:
# Number of distinct client ids in the raw data.
data['client_id'].nunique()

In [None]:
# Distinct Month values present in the raw data, sorted ascending.
sorted(data['Month'].unique())

In [None]:
# For every client, count the number of distinct months in which the client appears.
client_month_counts = data.groupby('client_id')['Month'].nunique()
print(client_month_counts)

In [None]:
# Ids of the clients that appear in exactly 9 distinct months.
clients_with_9_months = client_month_counts.loc[client_month_counts.eq(9)].index
print(clients_with_9_months.shape)

In [None]:
# data2: only the rows of clients that have all 9 months.
data2 = data.loc[data['client_id'].isin(clients_with_9_months)]

# Sanity check: number of distinct clients kept.
data2['client_id'].nunique()

In [None]:
# Preview the first rows of the 9-month subset.
data2.head()

In [None]:
# (rows, columns) of the 9-month subset.
data2.shape

In [None]:
# Frequency of each CreditCard_CoBranding value (value_counts skips missing values by default).
data2['CreditCard_CoBranding'].value_counts().reset_index()

In [None]:
# Study window: 6 training months, 1 lead month, 2 prediction months.
# Rows of 2019-01-01 whose co-branding value is anything other than 'No'.
mask = data2['Month'].eq('2019-01-01') & data2['CreditCard_CoBranding'].ne('No')

# Clients that own a co-branded card in that month are excluded entirely.
clients_to_remove = data2.loc[mask, 'client_id'].unique()

data3 = data2.loc[~data2['client_id'].isin(clients_to_remove)]

# data3: clients without co-branding in month 6 (2019-01-01); 9-month clients only.
data3['client_id'].nunique()

In [None]:
# (rows, columns) after removing co-branding clients.
data3.shape

In [None]:
# Rows of 2019-01-01 whose Package_Active value is anything other than 'No'.
mask2 = data3['Month'].eq('2019-01-01') & data3['Package_Active'].ne('No')

# Clients with an active package in that month are excluded entirely.
clients_to_remove2 = data3.loc[mask2, 'client_id'].unique()

data4 = data3.loc[~data3['client_id'].isin(clients_to_remove2)]

# data4: clients with neither package nor co-branding in month 6; 9-month clients only.
data4['client_id'].nunique()

In [None]:
# Frequency of each Region value, counting missing values as their own category.
data4['Region'].value_counts(dropna=False).reset_index()
# 162349 null values detected in Region:

In [None]:
# Preview of the filtered data, used to classify the columns by feature type.
data4.head()

# Some identity features:
# client_id, Month, Sex

# Some transform features:
# Client_Age_grp, First_product_dt, Last_product_dt

In [None]:
# Independent copy, so later edits to data5 leave data4 untouched.
data5 = data4.copy()

# Inspect one sample client across all of its months.
data_only_one = data5.loc[data5['client_id'].eq(5856970)]
data_only_one[['client_id', 'Region', 'CreditCard_Product', 'Month']]

# Many clients have Region and CreditCard_Product filled only in months 2019-03 and 2019-04:

In [None]:
# Region distribution restricted to the rows of 2019-04-01 (missing values included).
data_region = data5.loc[data5['Month'].eq('2019-04-01')]
data_region[['Region', 'Month']].value_counts(dropna=False).reset_index()

# There are 6 clients without a Region in month 2019-04:

In [None]:
# Ids of the clients whose Region is missing in month 2019-04.
result_clients = data5.loc[
    data5['Month'].eq('2019-04-01') & data5['Region'].isna(),
    'client_id',
]
result_clients

In [None]:
# Copy each client's 2019-04 Region onto all of that client's months,
# which removes most of the nulls in Region.
client_region_map = (
    data5.loc[data5['Month'] == '2019-04-01']
    .set_index('client_id')['Region']
    .to_dict()
)

data5['Region'] = data5['client_id'].map(client_region_map)

# Re-inspect the sample client to confirm that Region is now filled in every month.
data_only_one = data5.loc[data5['client_id'].eq(5856970)]
data_only_one[['client_id', 'Region', 'CreditCard_Product', 'Month']]

In [None]:
# Label the clients whose 2019-04 Region was missing as 'Desconocido' (unknown),
# which removes the last nulls in Region.
region_unknown = data5['client_id'].isin(result_clients)
data5.loc[region_unknown, 'Region'] = 'Desconocido'

# Region distribution for 2019-04-01 after the fix.
data_region2 = data5.loc[data5['Month'].eq('2019-04-01')]
data_region2[['Region', 'Month']].value_counts(dropna=False).reset_index()

In [None]:
# Region distribution over all months after the fix (missing values included).
data5['Region'].value_counts(dropna=False).reset_index()

# In summary: went from 162349 NaN to 54 'Desconocido' (unknown) rows.

In [None]:
# data5: DataFrame without nulls in Region.
# Frequency of each CreditCard_Product value, missing values included.
data5[['CreditCard_Product']].value_counts(dropna=False).reset_index()
# 178520 null values detected in CreditCard_Product:

In [None]:
# CreditCard_Product distribution restricted to the rows of 2019-04-01 (missing values included).
data_credit = data5.loc[data5['Month'].eq('2019-04-01')]
data_credit[['CreditCard_Product', 'Month']].value_counts(dropna=False).reset_index()
# There are 8102 clients without a CreditCard_Product in month 2019-04:

In [None]:
# Ids of the clients whose CreditCard_Product is missing in month 2019-04.
result_clients2 = data5.loc[
    data5['Month'].eq('2019-04-01') & data5['CreditCard_Product'].isna(),
    'client_id',
]
result_clients2

In [None]:
# Copy each client's 2019-04 CreditCard_Product (the value repeated in 2019-03 and 2019-04)
# onto all of that client's months.

data6 = data5.copy()

client_credit_map = (
    data6.loc[data6['Month'] == '2019-04-01']
    .set_index('client_id')['CreditCard_Product']
    .to_dict()
)

data6['CreditCard_Product'] = data6['client_id'].map(client_credit_map)

# Inspect the sample client to confirm the propagation.
data_only_one2 = data6.loc[data6['client_id'].eq(5856970)]
data_only_one2[['client_id', 'Region', 'CreditCard_Product', 'Month']]

In [None]:
# Label the clients whose 2019-04 CreditCard_Product was missing as 'Desconocido' (unknown).
credit_unknown = data6['client_id'].isin(result_clients2)
data6.loc[credit_unknown, 'CreditCard_Product'] = 'Desconocido'

# CreditCard_Product distribution for 2019-04-01 after the fix.
data_credit2 = data6.loc[data6['Month'].eq('2019-04-01')]
data_credit2[['CreditCard_Product', 'Month']].value_counts(dropna=False).reset_index()

In [None]:
# CreditCard_Product distribution over all months after the fix (missing values included).
data6[['CreditCard_Product']].value_counts(dropna=False).reset_index()

# In summary: went from 178520 NaN to 72918 'Desconocido' (unknown) rows.

In [None]:
# data6: DataFrame without nulls in Region or CreditCard_Product.

# Outlier check
# Show every column when displaying DataFrames (no column truncation).
pd.set_option('display.max_columns', None)
data6.head()

In [None]:
# Frequency of each SavingAccount_Balance_FirstDate value, missing values included.
data6[['SavingAccount_Balance_FirstDate']].value_counts(dropna=False).reset_index()

In [None]:
# Rows sorted ascending by SavingAccount_Balance_FirstDate, to inspect both extremes of the column.
data6_ordered = data6.sort_values('SavingAccount_Balance_FirstDate')
data6_ordered[['SavingAccount_Balance_FirstDate']]

In [None]:
# Outlier check: box plot of the balance values themselves.
# (Plotting value_counts() would show the distribution of the frequencies of each
# distinct balance, not the distribution of the balances.)
data6['SavingAccount_Balance_FirstDate'].plot(kind='box')

In [None]:
# Summary statistics, formatted in fixed-point notation for readability.
data6['SavingAccount_Balance_FirstDate'].describe().apply(lambda x: format(x, 'f'))

In [None]:
# Columns belonging to the SavingAccount_, Operations_ or CreditCard_ families.
[x for x in data6.columns if x.startswith(('SavingAccount_', 'Operations_', 'CreditCard_'))]

In [None]:
# Frequency of each CreditCard_Total_Limit value, missing values included.
data6['CreditCard_Total_Limit'].value_counts(dropna=False).reset_index()#describe().apply(lambda x: format(x, 'f'))

#### muchos valores
CreditCard_Revolving
CreditCard_Spending_Aut_Debits
CreditCard_Spending_CrossBoarder
CreditCard_Spending_Installments
CreditCard_Spending_1_Installment
CreditCard_Total_Spending
CreditCard_Balance_DOLLAR
CreditCard_Balance_ARG
SavingAccount_Balance_Average
SavingAccount_Balance_FirstDate
SavingAccount_Balance_LastDate
SavingAccount_Salary_Payment_Amount
SavingAccount_Transfer_In_Amount
SavingAccount_ATM_Extraction_Amount
SavingAccount_Service_Payment_Amount
SavingAccount_CreditCard_Payment_Amount
SavingAccount_Transfer_Out_Amount
SavingAccount_DebitCard_Spend_Amount
SavingAccount_Total_Amount
SavingAccount_Credits_Amounts
SavingAccount_Debits_Amounts

#### valores concretos grandes
CreditCard_Total_Limit

In [None]:
# Summary statistics of CreditCard_Total_Limit.
data6['CreditCard_Total_Limit'].describe()#.apply(lambda x: format(x, 'f'))

In [None]:
# Candidate columns for outlier treatment: float64 columns with more than 200 distinct values.
columnas_con_outliers = [
    col
    for col in data6.select_dtypes(include='float64').columns
    if data6[col].nunique() > 200
]
print(columnas_con_outliers)

In [None]:
# Number of columns selected for outlier treatment.
print(len(columnas_con_outliers))

In [None]:
# data6[data6['client_id'] == 5856970][['client_id', 'Month', 'SavingAccount_Balance_Average']].reset_index()

# Summary statistics of SavingAccount_Balance_Average in fixed-point notation.
data6['SavingAccount_Balance_Average'].describe().apply(lambda x: format(x, 'f'))

In [None]:
# Example of outlier removal on a single column.
sa_balance_avg = data6['SavingAccount_Balance_Average']

# Percentiles computed only over balances above 120.0.
sa_balance_above_120 = sa_balance_avg[sa_balance_avg > 120.0]
p95 = sa_balance_above_120.quantile(0.95)
p99 = sa_balance_above_120.quantile(0.99)

# NOTE(review): this filter uses 139.4 while the percentiles above use 120.0 — confirm intended.
# The cap is 3 * std of the filtered values (the mean is not added).
three_sigma = 3 * sa_balance_avg[sa_balance_avg > 139.4].std()

print('p95		', p95)
print('p99		', p99)
print('three_sigma	', three_sigma)

In [None]:
# Cap the balances at three_sigma and summarise the capped values.
sin_outlier = data6['SavingAccount_Balance_Average'].clip(upper=three_sigma).to_numpy()
sin_outlier_df = pd.DataFrame({'SavingAccount_Balance_Average': sin_outlier})
sin_outlier_df.describe()

In [None]:
# data7: copy of data6 with every outlier-candidate column capped from above.
data7 = data6.copy()
for column in columnas_con_outliers:
    # Cap = 3 * std of the values greater than 1.
    # NOTE(review): the mean is not added to the cap and only the upper tail is clipped — confirm intended.
    three_sigma = 3 * data6.loc[data6[column] > 1, column].std()
    data7[column] = data6[column].clip(upper=three_sigma)

# data7: new DataFrame with the upper outliers capped in the 21 columns.

In [None]:
# Summary of the capped column. NOTE(review): a notebook cell only displays its last
# expression, so this describe() result is presumably not shown — confirm.
data7['SavingAccount_Balance_Average'].describe().apply(lambda x: format(x, 'f'))

# Preview of the capped DataFrame.
data7.head()

##### Identity features, columnas para pasar directamente:
(todas en el ultimo mes de training)
client_id
Sex
Region
Mobile
Email
CreditCard_Product


##### Transform features, agrupamientos, o cantidad de algo
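Cantidad de columnas Insurance_* y de columnas *Active* con valor 'Yes' (un conteo por cada grupo). Opcional: convertirlas a dtype bool para contar el total de booleanos en True.
Client_Age_grp: convertir el rango etario a un número ordinal.
Diferencia en días entre Last_product_dt y First_product_dt (Last − First), y diferencia en días entre el último mes de training y Last_product_dt.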

##### Aggregate features
operaciones, sum avg min max etc, para 6 y para 3 meses, generadas automaticamente, para todas las numericas

In [None]:
# Build the ABT (analytical base table).
# Last training month: '2019-01-01'.
ultimomes = '2019-01-01'
data_ultimomes = data7.loc[data7['Month'].eq(ultimomes)].copy()

# Identity features plus the target, taken as-is from the last training month.
AB_table = data_ultimomes.loc[
    :, ['client_id', 'Sex', 'Region', 'Mobile', 'Email', 'CreditCard_Product', 'Target']
].copy()

AB_table.head()
print(AB_table['client_id'].nunique())
print(AB_table.shape)

In [None]:
# Distinct age-group labels, sorted, used to build the mapping in the next cell.
sorted(data7['Client_Age_grp'].unique())

In [None]:
# Load ages: encode each age-bracket label as an ordinal number (youngest = 0 ... oldest = 7).
age_mapping = {
    'Menor a 18 años': 0,
    'Entre 18 y 29 años': 1,
    'Entre 30 y 39 años': 2,
    'Entre 40 y 49 años': 3,
    'Entre 50 y 59 años': 4,
    'Entre 60 y 64 años': 5,
    'Entre 65 y 69 años': 6,
    'Mayor a 70 años': 7
}

# Encode from the original labels in data7 (data_ultimomes is a row subset of data7 and
# shares its index), so re-running this cell does not map already-encoded values to NaN.
# Plain column assignment replaces the column and yields a numeric dtype, whereas
# `.loc[:, col] = ...` writes in place and can keep the original object dtype.
data_ultimomes['Client_Age_grp'] = data7.loc[data_ultimomes.index, 'Client_Age_grp'].map(age_mapping)

# Drop any previous copy of the column first so that a re-run does not produce
# Client_Age_grp_x / Client_Age_grp_y duplicates.
AB_table = pd.merge(
    AB_table.drop(columns=['Client_Age_grp'], errors='ignore'),
    data_ultimomes[['client_id', 'Client_Age_grp']],
    on='client_id',
    how='inner',
)

AB_table.head()

In [None]:
# sorted(data7['First_product_dt'], reverse=True)
# First_product_dt per month for one sample client.
data7[data7['client_id'] == 6370097][['Month', 'First_product_dt']]
# Open question: it is unclear what it means for a client's first-product date to change between months.

In [None]:
# Rows whose First_product_dt equals '2019-04-17'.
data7[data7['First_product_dt'] == '2019-04-17']

In [None]:
# Turn the product dates into numeric features (day counts).
from datetime import datetime

for date_col in ('Last_product_dt', 'First_product_dt'):
    data_ultimomes[date_col] = pd.to_datetime(data_ultimomes[date_col])

# Days elapsed between the first and the last product date.
product_span = data_ultimomes['Last_product_dt'] - data_ultimomes['First_product_dt']
data_ultimomes['days_last_first'] = product_span.dt.days

# Days between the last product date and the last training month; negative gaps become 0.
today_date = datetime.strptime(ultimomes, '%Y-%m-%d')
days_since_last = (today_date - data_ultimomes['Last_product_dt']).dt.days
data_ultimomes['days_today_last'] = days_since_last.clip(lower=0)


AB_table = AB_table.merge(
    data_ultimomes[['client_id', 'days_last_first', 'days_today_last']],
    on='client_id',
    how='inner',
)

AB_table.head()


In [None]:
# Summary statistics of the first-to-last product gap, in days.
AB_table['days_last_first'].describe()

In [None]:
# Summary statistics of the days elapsed since the last product date.
AB_table['days_today_last'].describe()

In [None]:
# Load the counts of 'Yes' flags.

# [col for col in data7.columns if data7[col].dtype == 'bool']
active_columns = [name for name in data7.columns if 'Active' in name]
insurance_columns = [name for name in data7.columns if name.startswith('Insurance_')]

# Per client: how many of the flag columns equal 'Yes' in the last training month.
data_ultimomes['active_count'] = data_ultimomes[active_columns].eq('Yes').sum(axis=1)
data_ultimomes['insurance_count'] = data_ultimomes[insurance_columns].eq('Yes').sum(axis=1)
# data_ultimomes[data_ultimomes['client_id'] == 6370097].head()

AB_table = AB_table.merge(
    data_ultimomes[['client_id', 'active_count', 'insurance_count']],
    on='client_id',
    how='inner',
)

AB_table.head()

In [None]:
# Summary statistics of the insurance flag count.
AB_table['insurance_count'].describe()

In [None]:
# Nobody has insurance in month '2019-01-01' (unlike '2019-03-01' and '2019-04-01'),
# so the column is dropped.
AB_table = AB_table.drop(columns='insurance_count')
print(AB_table.shape)

In [None]:
# Load the aggregate features over the 6-month and 3-month training windows.

seis_meses = [
    '2018-08-01',
    '2018-09-01',
    '2018-10-01',
    '2018-11-01',
    '2018-12-01',
    '2019-01-01',
]
# The 3-month window is the tail of the 6-month window.
tres_meses = seis_meses[-3:]

seis_data = data7.loc[data7['Month'].isin(seis_meses)]
tres_data = data7.loc[data7['Month'].isin(tres_meses)]

# Every float64 column except the target and the client id is aggregated.
columnas_agg = [
    col
    for col in data7.columns
    if data7[col].dtype == 'float64' and col not in ('Target', 'client_id')
]

# NOTE(review): the output column suffixes derive from each function's name (a later cell
# expects e.g. '..._seis_sum'); confirm they are stable across NumPy/pandas versions.
funciones_agg = [np.sum, np.amax, np.min, np.mean, np.median, np.count_nonzero, 'nunique', np.var]
agg_feat6 = seis_data.groupby(['client_id'])[columnas_agg].agg(funciones_agg)
agg_feat3 = tres_data.groupby(['client_id'])[columnas_agg].agg(funciones_agg)

# Flatten the (column, statistic) pairs into '<column>_seis_<stat>' / '<column>_tres_<stat>'.
agg_feat6.columns = ['_seis_'.join(pair) for pair in agg_feat6.columns]
agg_feat3.columns = ['_tres_'.join(pair) for pair in agg_feat3.columns]


In [None]:
# agg_feat.head(20).T

# Join the 6-month and 3-month aggregates on client_id.
agg_feat = agg_feat6.merge(agg_feat3, on='client_id', how='inner')

agg_feat.head()

In [None]:
# (clients, aggregate features).
agg_feat.shape

In [None]:
from pathlib import Path

# Features whose absolute correlation with an earlier feature exceeds this value are dropped.
CORR_THRESHOLD = 0.80
# Relative output location, so the notebook does not depend on one user's home directory.
CORR_CSV_PATH = Path('corr') / 'correlacion.csv'

agg_feat_corr = agg_feat.reset_index().copy()

# Absolute correlation matrix of the aggregate features (client_id excluded).
corr_matrix = agg_feat_corr.drop('client_id', axis=1).corr().abs()

# Keep only the strict upper triangle so that each pair is considered once;
# every other position (and any NaN correlation) becomes 0.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)).fillna(0)

# Features correlated above the threshold with at least one feature that precedes them.
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Persist the triangle for inspection. UTF-8 is used because the 'ANSI' codec alias
# only exists on Windows builds of Python.
CORR_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
upper.reset_index().to_csv(CORR_CSV_PATH, sep='|', header=True, encoding='utf-8', index=False)

print(to_drop)

# Drop the redundant features.
agg_feat_corr = agg_feat_corr.drop(columns=to_drop)

agg_feat_corr.shape


In [None]:
# Preview of the aggregate features that survived the correlation filter.
agg_feat_corr.head()

In [None]:
# Run the scaler: standardise every aggregate feature (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

# Everything except the key and the target is scaled.
names = [x for x in agg_feat_corr.columns if x not in ('client_id', 'Target')]

# NOTE(review): the scaler is fitted on all clients, before the train/test split — confirm acceptable.
scaler = StandardScaler(copy=True)
scaled_est = pd.DataFrame(
    scaler.fit_transform(agg_feat_corr[names]),
    columns=names,
    index=agg_feat_corr.index,
)

# Build `model` as a new frame instead of aliasing and mutating agg_feat_corr in place:
# the cell can now be re-run, and agg_feat_corr keeps its unscaled values.
model = pd.concat((agg_feat_corr.drop(columns=names), scaled_est), axis=1, sort=False)


In [None]:
# Preview of the scaled aggregate features.
model.head()

In [None]:
# Attach the scaled aggregate features to the ABT.
AB_table = AB_table.merge(model, on='client_id', how='inner')

AB_table.head()

In [None]:
# (clients, columns) of the ABT after adding the aggregate features.
AB_table.shape

In [None]:
# Display the ABT.
AB_table

In [None]:

#agg_feat_corr.head()

target_column = 'Target'
# Scaled aggregate features: every column of `model` except the key and the target.
numerical_cols = [x for x in model.columns if x not in ('client_id', target_column)]
# Key, target and the hand-built transform features.
original_cols = ['client_id', 'Target', 'Client_Age_grp', 'days_last_first', 'days_today_last', 'active_count']

print(numerical_cols)

# Modelling table: hand-built columns followed by the aggregate features.
ABT_model = AB_table[original_cols + numerical_cols].copy()

In [None]:
# Evaluate the best variables: a randomized LightGBM search whose best estimator
# provides gain-based feature importances.

# conda install -c conda-forge lightgbm
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# NOTE(review): the evaluation set used for early stopping is the full modelling table
# (the same rows the search trains on), and the selection runs before the train/test
# split — confirm this is acceptable for a feature-ranking step.
fit_params = {
    "eval_metric": 'auc',
    "eval_set": [(ABT_model[numerical_cols], ABT_model[target_column])],
}
# NOTE(review): per the LightGBM docs, 'subsample' has no effect unless subsample_freq > 0 — confirm.
param_test = {'num_leaves': np.arange(5, 20, 1),
              'min_data_in_leaf': np.arange(10, 100, 1),
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'max_depth': np.arange(5, 20, 1),
              'n_estimators': np.arange(20, 100, 1)
              }
n_HP_points_to_test = 100

# n_jobs (not 'njobs') is the thread-count parameter; 'nfold' is not an estimator
# parameter (cross-validation is driven by RandomizedSearchCV below); verbose=-1
# replaces the removed 'silent' flag.
clf = lgb.LGBMClassifier(random_state=314, verbose=-1, metric='None', n_jobs=4, early_stopping_rounds=100)

gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                        n_iter=n_HP_points_to_test,
                        scoring='roc_auc',
                        cv=3,
                        refit=True,
                        random_state=314,
                        verbose=True,
                        error_score='raise')

gs.fit(ABT_model[numerical_cols], ABT_model[target_column], **fit_params)
#feat_imp = pd.Series(gs.best_estimator_.feature_importances, index=ABT_model[numerical_cols].columns)
# Gain-based importance of every candidate feature in the refitted best estimator.
feat_imp = pd.Series(gs.best_estimator_.booster_.feature_importance(importance_type='gain'), index=ABT_model[numerical_cols].columns)



In [None]:
# Horizontal bar chart of the 20 features with the largest gain importance.
feat_imp.nlargest(20).plot(kind='barh', figsize=(8, 10))

In [None]:
# The same top-20 importances as a table.
feat_imp.nlargest(20)

In [None]:
# Names of the 20 most important features; these are the inputs of the final model.
top_variables = feat_imp.nlargest(20).index.tolist()

In [None]:
# Renamed copy for the bivariate plots: client_id -> idx, Target -> TGT.
ABT = ABT_model.rename(columns={'client_id': 'idx', 'Target': 'TGT'})
ABT.head()

In [None]:
# Example of building bivariate plots

# Executes the local script Graficos_v2.py inside the notebook; Graficar_Variables2 below
# is presumably defined by that script — TODO confirm.
%run ./Graficos_v2.py



# NOTE(review): `vars` shadows the Python builtin of the same name.
vars = ['idx', 'TGT', 'CreditCard_Total_Spending_seis_sum']

Graficar_Variables2(ABT[vars], [], 'TGT')

In [None]:
# Split into train and test.
from sklearn.model_selection import train_test_split

# Key and target followed by the selected features.
final_variables = np.append(['client_id', 'Target'], top_variables)

# 70/30 split, stratified on the target so both parts keep the same class balance.
x_train, x_test = train_test_split(
    ABT_model[final_variables],
    test_size=0.3,
    random_state=420,
    stratify=ABT_model['Target'],
)

print(x_train['Target'].value_counts())
print(x_test['Target'].value_counts())

In [None]:
# Preview of the training split (it still contains client_id and Target).
x_train.head()

In [None]:
# Train: randomized hyper-parameter search for the final model on the selected features.

# Hold out part of the training data for early stopping. The test set must not be used
# as the evaluation set: the best iteration would then be chosen on the very data that
# is later used to evaluate the model.
x_fit, x_val = train_test_split(
    x_train,
    test_size=0.2,
    random_state=420,
    stratify=x_train[target_column],
)

fit_params2 = {
    "eval_metric": 'auc',
    "eval_set": [(x_val[top_variables], x_val[target_column])],
}
param_test2 = {'num_leaves': np.arange(3, 8, 1),
               'min_child_samples': np.arange(300, 1000, 100),
               'learning_rate': [0.05, 0.1, 0.2, 0.01],
               'max_depth': np.arange(4, 10, 1),
               'n_estimators': np.arange(6, 20, 1)
               }
n_HP_points_to_test = 100

# n_jobs (not 'njobs') is the thread-count parameter; verbose=-1 replaces the removed
# 'silent' flag.
clf2 = lgb.LGBMClassifier(random_state=314, verbose=-1, metric='None', n_jobs=4, early_stopping_rounds=100)

gs2 = RandomizedSearchCV(estimator=clf2, param_distributions=param_test2,
                         n_iter=n_HP_points_to_test,
                         scoring='roc_auc',
                         cv=3,
                         refit=True,
                         random_state=314,
                         verbose=True,
                         error_score='raise')

gs2.fit(x_fit[top_variables], x_fit[target_column], **fit_params2)
#feat_imp = pd.Series(gs2.best_estimator_.feature_importances, index=ABT_model[top_variables].columns)
# Gain-based importance of each selected feature in the refitted best estimator.
feat_imp2 = pd.Series(gs2.best_estimator_.booster_.feature_importance(importance_type='gain'), index=x_fit[top_variables].columns)



In [None]:
# Horizontal bar chart of the final model's gain importances.
feat_imp2.nlargest(20).plot(kind='barh', figsize=(8, 10))

In [None]:
# Score the whole base with the final model (best hyper-parameters).
clf_final_train = gs2


from sklearn.metrics import mean_squared_error

probabilities_train = clf_final_train.predict_proba(x_train[top_variables])
probabilities       = clf_final_train.predict_proba(x_test[top_variables])


# Training deciles: number of clients, number of positives, and the minimum
# probability (cut-off) of each decile.

a = x_train[[target_column, 'client_id']].reset_index()
b = pd.DataFrame(probabilities_train[:, 1], columns=['Prob1'])

# a and b both carry a fresh 0..n-1 index, so the concat lines them up row by row.
result = pd.concat([a, b], axis=1)
result['porc'] = result['Prob1'].rank(pct=True) * 100

# Decile 1 = highest scores, decile 10 = lowest. Every decile uses the same half-open
# percentile band [lo, hi), so boundary values are treated uniformly; a percentile of
# exactly 100 is folded into decile 1.
decile_number = (10 - np.floor(result['porc'] / 10)).clip(lower=1).astype(int)
# Kept as strings, as before; the sort keys below convert back to int for ordering.
result['decil'] = decile_number.astype(str)

print(result.decil.value_counts().reset_index().sort_values(by=['decil'], key=lambda x: x.astype(int)))
print(result[result.Target == 1].decil.value_counts().reset_index().sort_values(by=['decil'], key=lambda x: x.astype(int)))

# Minimum probability per decile (the decile cut-offs).
a = result.groupby('decil')['Prob1'].min()
print(a)


In [None]:


# Test: assign each test client to a decile using cut-offs learned on the training scores.

a = x_test[[target_column, 'client_id']].reset_index()
b = pd.DataFrame(probabilities[:, 1], columns=['Prob1'])

# a and b both carry a fresh 0..n-1 index, so the concat lines them up row by row.
result = pd.concat([a, b], axis=1)

result['porc'] = result['Prob1'].rank(pct=True) * 100

# Decile cut-offs are derived from the current training scores (10th..90th percentiles)
# instead of being hard-coded from an earlier run, so they stay valid after a retrain.
decile_cutoffs = np.quantile(probabilities_train[:, 1], np.linspace(0.1, 0.9, 9))

# searchsorted(side='right') counts the cut-offs <= score. A score at or above all 9
# cut-offs lands in decile 1, a score below all of them in decile 10.
result['decil'] = 10 - np.searchsorted(decile_cutoffs, result['Prob1'].to_numpy(), side='right')

print(result.decil.value_counts().reset_index().sort_values(by=['decil'], key=lambda x: x.astype(int)))

print(result[result.Target == 1].decil.value_counts().reset_index().sort_values(by=['decil'], key=lambda x: x.astype(int)))


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Positive-class scores and true labels of the training split.
probabilities_train2 = clf_final_train.predict_proba(x_train[top_variables])[:, 1]
y_train = x_train[target_column].values

# ROC curve and its area for the training set.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probabilities_train2)
roc_auc_train = auc(fpr_train, tpr_train)

# Plot the ROC curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr_train, tpr_train, color='darkorange', lw=2, label='Train ROC curve (area = {:.2f})'.format(roc_auc_train))
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()