In [1]:
import pandas as pd
import random

### 1. Generating random data

In [2]:
# Define the categories for age and education
age_categories = ['15-18', '19-20', '21-22', '23-24', '25-50']
education_categories = ['mbo 1-2', 'mbo 3-4', 'hbo', 'wo']
distance_categories = ['0km', '0-1km', '1-2km', '2-5km', '5-10km', '10-20km', '20-50km', '50-500km', 'unknown']
risk_categories = ['1', '2', '3', '4', '5', '6']
house_inspection_categories = ['yes', 'no']
result_house_inspection_categories = ['duly granted', 'unduly granted', 'unknown', 'NULL']
appeal_categories = ['yes', 'no']
result_appeal_categories = ['succesfull', 'unsuccesfull', 'partially succesfull']


# Generate random data
n = 200000
data = {'age': [random.choice(age_categories) for _ in range(n)],
        'education': [random.choice(education_categories) for _ in range(n)],
        'distance': [random.choice(distance_categories) for _ in range(n)],
        'risk_category': [random.choice(risk_categories) for _ in range(n)],
        'house_inspection': [random.choice(house_inspection_categories) for _ in range(n)],
        'result_house_inspection': [random.choice(result_house_inspection_categories) for _ in range(n)],
        'appeal': [random.choice(appeal_categories) for _ in range(n)],
        'result_appeal': [random.choice(result_appeal_categories) for _ in range(n)],}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
df.head(10)

Unnamed: 0,age,education,distance,risk_category,house_inspection,result_house_inspection,appeal,result_appeal
0,15-18,mbo 1-2,50-500km,3,no,duly granted,yes,succesfull
1,15-18,mbo 1-2,unknown,2,no,unknown,yes,succesfull
2,15-18,mbo 3-4,1-2km,4,no,unknown,yes,succesfull
3,23-24,mbo 3-4,50-500km,5,no,unduly granted,no,unsuccesfull
4,23-24,wo,2-5km,6,no,unknown,no,unsuccesfull
5,15-18,wo,0km,4,yes,unduly granted,no,unsuccesfull
6,19-20,mbo 1-2,0-1km,1,yes,duly granted,no,partially succesfull
7,19-20,mbo 3-4,50-500km,3,yes,,yes,succesfull
8,23-24,wo,0-1km,1,no,unknown,no,succesfull
9,19-20,mbo 3-4,10-20km,4,yes,unknown,yes,succesfull


In [3]:
df.shape

(200000, 8)

In [4]:
df['result_house_inspection'].value_counts()

result_house_inspection
unknown           50228
duly granted      50040
NULL              49885
unduly granted    49847
Name: count, dtype: int64

### 2. Data processing

#### Create predictions

In [5]:
# remove unknown and NULL house inspections
condition1 = df['result_house_inspection'] == "unduly granted" 
condition2 = df['result_house_inspection'] == "duly granted"
filtered_df = df[(condition1 | condition2)]
filtered_df.shape

# predictions
map_dict = {6:1, 5:1, 4:0, 3:0, 2:0, 1:0}
filtered_df['pred'] = filtered_df['risk_category'].astype(int).map(map_dict)

# ground truth
map_dict = {"unduly granted":1, "duly granted":0}
filtered_df['ground_truth'] = filtered_df['result_house_inspection'].map(map_dict)

filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['pred'] = filtered_df['risk_category'].astype(int).map(map_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ground_truth'] = filtered_df['result_house_inspection'].map(map_dict)


Unnamed: 0,age,education,distance,risk_category,house_inspection,result_house_inspection,appeal,result_appeal,pred,ground_truth
0,15-18,mbo 1-2,50-500km,3,no,duly granted,yes,succesfull,0,0
3,23-24,mbo 3-4,50-500km,5,no,unduly granted,no,unsuccesfull,1,1
5,15-18,wo,0km,4,yes,unduly granted,no,unsuccesfull,0,1
6,19-20,mbo 1-2,0-1km,1,yes,duly granted,no,partially succesfull,0,0
10,19-20,mbo 1-2,0-1km,2,no,duly granted,no,partially succesfull,0,0


#### Dummy variables

In [6]:
# create dummy variabels
dummy_df = pd.get_dummies(filtered_df, prefix=['age','education','distance'], columns=['age','education','distance'], dtype=float)

# drop old categorical variabels
dummy_df = dummy_df.drop(['risk_category', 'house_inspection', 'result_house_inspection', 'appeal', 'result_appeal', 'risk_category', 'house_inspection', 'result_house_inspection'], axis=1)

# change column names
dummy_df = dummy_df.rename(columns={'pred': 'predicted_class', 'ground_truth': 'true_class'})
dummy_df.head()

Unnamed: 0,predicted_class,true_class,age_15-18,age_19-20,age_21-22,age_23-24,age_25-50,education_hbo,education_mbo 1-2,education_mbo 3-4,education_wo,distance_0-1km,distance_0km,distance_1-2km,distance_10-20km,distance_2-5km,distance_20-50km,distance_5-10km,distance_50-500km,distance_unknown
0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
dummy_df.to_csv('CLC_dataset.csv')