## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the test-set CSV into a DataFrame
Test = pd.read_csv('test.csv')

In [3]:
# Display the frame as loaded, before any cleaning
Test

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
0,730681,,COMMERCIAL,LA,713,South,West South Central,55,F,,...,62.21,62.23,78.34,81.96,83.58,82.22,80.20,69.73,53.14,51.34
1,334212,Black,,NC,283,South,South Atlantic,60,F,40.00,...,48.63,58.14,77.26,80.05,82.88,82.09,78.85,64.60,50.57,48.10
2,571362,,COMMERCIAL,TX,794,South,West South Central,54,F,32.33,...,57.82,59.95,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13
3,907331,,COMMERCIAL,TN,373,South,East South Central,63,F,27.07,...,47.57,53.50,71.31,75.20,76.96,75.78,74.87,61.06,44.31,42.83
4,208382,Asian,,WA,980,West,Pacific,62,F,,...,41.02,46.25,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5641,649011,White,,CO,800,West,Mountain,63,F,,...,43.24,48.85,61.47,72.59,74.94,72.44,67.41,49.12,38.81,33.29
5642,869024,,COMMERCIAL,GA,300,South,South Atlantic,57,F,40.00,...,50.20,56.76,72.53,77.56,78.82,77.39,78.71,64.14,47.25,44.86
5643,304800,Asian,COMMERCIAL,CA,914,West,Pacific,44,F,,...,59.25,62.98,63.26,67.57,76.04,75.15,71.05,68.75,65.27,59.42
5644,267406,White,MEDICARE ADVANTAGE,CA,906,West,Pacific,66,F,31.79,...,60.26,64.84,64.40,69.37,78.29,77.42,73.15,70.26,65.92,59.39


In [4]:
## Dropping unnecessary columns

# Rebinding instead of dropping in place (and ignoring already-missing columns)
# keeps this cell safe to re-run without a KeyError.
Test = Test.drop(
    columns=['metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type'],
    errors='ignore',
)

### Filling Null Values

In [5]:
#bmi

# Fallback BMI used when no usable (ZIP3, age band) average exists.
DEFAULT_BMI = 29.168808

# Step 1: Group data by ZIP code and 10-year age band ([0, 10), [10, 20), ..., [100, 110))
age_groups = pd.cut(Test['patient_age'], bins=range(0, 120, 10), right=False)
grouped = Test.groupby(['patient_zip3', age_groups], observed=True)

# Step 2: Per-row group mean of the known BMI values, kept only for groups
# that have more than one non-null BMI (otherwise left as NaN)
bmi_in_group = grouped['bmi']
group_mean_bmi = bmi_in_group.transform('mean').where(bmi_in_group.transform('count') > 1)

# Step 3: Fill missing BMI with the group mean, then with the default for
# rows whose group has no usable mean
Test['bmi'] = Test['bmi'].fillna(group_mean_bmi).fillna(DEFAULT_BMI)


In [6]:
#patient_race

# Define a function to determine patient_race based on race percentages
def determine_patient_race(row):
    """Return the row's patient_race, imputing a label when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'patient_race', 'race_black', 'race_asian' and
        'race_other'.

    Returns
    -------
    The existing 'patient_race' when it is not null. Otherwise the first
    matching rule wins: race_black > 30 -> 'Black', race_asian > 25 ->
    'Asian', race_other > 30 -> 'Other', and 'White' in every other case.
    """
    # Keep whatever is already recorded
    if not pd.isnull(row['patient_race']):
        return row['patient_race']

    # Rules are checked in priority order; the first match wins
    if row['race_black'] > 30:
        return 'Black'
    if row['race_asian'] > 25:
        return 'Asian'
    if row['race_other'] > 30:
        return 'Other'

    # Default when no threshold is met (a separate race_white check would
    # return the same label, so it is not needed)
    return 'White'

# Run the rule on every row; rows that already have a race are returned unchanged
Test['patient_race'] = Test.apply(determine_patient_race, axis=1)

In [7]:
#payer_type

# Define a function to determine payer_type based on conditions
def determine_payer_type(row):
    """Return the row's payer_type, imputing a label when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'payer_type', 'patient_age' and
        'income_individual_median'.

    Returns
    -------
    The existing 'payer_type' when it is not null. Otherwise
    'MEDICARE ADVANTAGE' for patient_age >= 65, 'MEDICAID' when
    income_individual_median < 31255, and 'COMMERCIAL' in every other case.
    """
    # Keep whatever is already recorded
    if not pd.isnull(row['payer_type']):
        return row['payer_type']

    if row['patient_age'] >= 65:
        return 'MEDICARE ADVANTAGE'
    if row['income_individual_median'] < 31255:
        return 'MEDICAID'

    # Every remaining case maps to COMMERCIAL
    return 'COMMERCIAL'

# Run the rule on every row; rows that already have a payer type are returned unchanged
Test['payer_type'] = Test.apply(determine_payer_type, axis=1)

In [8]:
#to fill the remaining columns

columns_to_fill = [
    'family_size', 'family_dual_income', 'income_household_median', 'income_household_under_5',
    'income_household_5_to_10', 'income_household_10_to_15', 'income_household_15_to_20',
    'income_household_20_to_25', 'income_household_25_to_35', 'income_household_35_to_50',
    'income_household_50_to_75', 'income_household_75_to_100', 'income_household_100_to_150',
    'income_household_150_over', 'income_household_six_figure', 'home_ownership', 'home_value',
    'rent_median', 'rent_burden', 'poverty', 'limited_english', 'Average of Jan-13', 'Average of Feb-13',
    'Average of May-13', 'Average of Jun-13', 'Average of Aug-13', 'Average of Sep-13', 'Average of Oct-13',
    'Average of Nov-13', 'Average of Dec-13', 'Average of Jan-14', 'Average of Feb-14', 'Average of Mar-14',
    'Average of Apr-14', 'Average of May-14', 'Average of Jun-14', 'Average of Nov-14', 'Average of Jan-15',
    'Average of Feb-15', 'Average of Mar-15', 'Average of Apr-15', 'Average of Aug-15', 'Average of Oct-15',
    'Average of Nov-15', 'Average of Dec-15', 'Average of Jan-16', 'Average of Feb-16', 'Average of May-16',
    'Average of Jul-16', 'Average of Nov-16', 'Average of Dec-16', 'Average of Jan-17', 'Average of Jun-17',
    'Average of Jul-17', 'Average of Sep-17', 'Average of Oct-17', 'Average of Nov-17', 'Average of Feb-18',
    'Average of Mar-18', 'Average of Jun-18', 'Average of Jul-18', 'Average of Aug-18', 'Average of Sep-18',
    'Average of Oct-18', 'Average of Nov-18', 'Average of Dec-18'
]

# Replace only the null values in the specified columns with the mean of each column.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on the
# intermediate Test[column] is chained assignment, which pandas has deprecated and
# which does not update the frame under Copy-on-Write.
for column in columns_to_fill:
    if Test[column].isnull().any():  # Skip columns that have nothing to fill
        Test[column] = Test[column].fillna(Test[column].mean())


In [9]:
# Display the frame after the null-filling cells above
Test

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
0,730681,White,COMMERCIAL,LA,713,South,West South Central,55,F,29.168808,...,62.21,62.23,78.34,81.96,83.58,82.22,80.20,69.73,53.14,51.34
1,334212,Black,MEDICAID,NC,283,South,South Atlantic,60,F,40.000000,...,48.63,58.14,77.26,80.05,82.88,82.09,78.85,64.60,50.57,48.10
2,571362,White,COMMERCIAL,TX,794,South,West South Central,54,F,32.330000,...,57.82,59.95,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13
3,907331,White,COMMERCIAL,TN,373,South,East South Central,63,F,27.070000,...,47.57,53.50,71.31,75.20,76.96,75.78,74.87,61.06,44.31,42.83
4,208382,Asian,COMMERCIAL,WA,980,West,Pacific,62,F,27.190000,...,41.02,46.25,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5641,649011,White,COMMERCIAL,CO,800,West,Mountain,63,F,29.168808,...,43.24,48.85,61.47,72.59,74.94,72.44,67.41,49.12,38.81,33.29
5642,869024,Black,COMMERCIAL,GA,300,South,South Atlantic,57,F,40.000000,...,50.20,56.76,72.53,77.56,78.82,77.39,78.71,64.14,47.25,44.86
5643,304800,Asian,COMMERCIAL,CA,914,West,Pacific,44,F,25.955000,...,59.25,62.98,63.26,67.57,76.04,75.15,71.05,68.75,65.27,59.42
5644,267406,White,MEDICARE ADVANTAGE,CA,906,West,Pacific,66,F,31.790000,...,60.26,64.84,64.40,69.37,78.29,77.42,73.15,70.26,65.92,59.39


### Converting categorical to Numerical

In [10]:
#patient_race

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on the race labels present in Test, then swap the
# labels for their integer codes
label_encoder = LabelEncoder()
label_encoder.fit(Test['patient_race'])
Test['patient_race'] = label_encoder.transform(Test['patient_race'])



In [11]:
#payer_type

# Integer code for each payer type
payer_type_mapping = {'COMMERCIAL': 1, 'MEDICAID': 2, 'MEDICARE ADVANTAGE': 3}

# Replace the labels with their codes. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
Test['payer_type'] = Test['payer_type'].replace(payer_type_mapping).infer_objects()

In [12]:
#patient_gender

# Integer code for each gender label
gender_mapping = {'M': 0, 'F': 1}

# Replace the labels with their codes. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
Test['patient_gender'] = Test['patient_gender'].replace(gender_mapping).infer_objects()

In [13]:
#Region

# Integer code for each Region label
Region_mapping = {'South': 1, 'Midwest': 2, 'West': 3, 'Northeast': 4}

# Replace values in the Region column. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
Test['Region'] = Test['Region'].replace(Region_mapping).infer_objects()

In [14]:
#Division

# Integer code for each Division label.
# NOTE(review): there is no entry for 'New England'; replace() leaves unmapped
# labels unchanged, so confirm the data contains no such rows.
Division_mapping = {'West South Central': 1, 'East North Central': 2, 'Pacific': 3, 'South Atlantic': 4, 'East South Central':5, 'Middle Atlantic':6, 'West North Central':7, 'Mountain':8}

# Replace values in the Division column. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
Test['Division'] = Test['Division'].replace(Division_mapping).infer_objects()

In [15]:
# Display the frame after the categorical columns above were recoded
Test

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
0,730681,4,1,LA,713,1,1,55,1,29.168808,...,62.21,62.23,78.34,81.96,83.58,82.22,80.20,69.73,53.14,51.34
1,334212,1,2,NC,283,1,4,60,1,40.000000,...,48.63,58.14,77.26,80.05,82.88,82.09,78.85,64.60,50.57,48.10
2,571362,4,1,TX,794,1,1,54,1,32.330000,...,57.82,59.95,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13
3,907331,4,1,TN,373,1,5,63,1,27.070000,...,47.57,53.50,71.31,75.20,76.96,75.78,74.87,61.06,44.31,42.83
4,208382,0,1,WA,980,3,3,62,1,27.190000,...,41.02,46.25,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5641,649011,4,1,CO,800,3,8,63,1,29.168808,...,43.24,48.85,61.47,72.59,74.94,72.44,67.41,49.12,38.81,33.29
5642,869024,1,1,GA,300,1,4,57,1,40.000000,...,50.20,56.76,72.53,77.56,78.82,77.39,78.71,64.14,47.25,44.86
5643,304800,0,1,CA,914,3,3,44,1,25.955000,...,59.25,62.98,63.26,67.57,76.04,75.15,71.05,68.75,65.27,59.42
5644,267406,4,3,CA,906,3,3,66,1,31.790000,...,60.26,64.84,64.40,69.37,78.29,77.42,73.15,70.26,65.92,59.39


In [16]:
# zip_state_mapping = {630: 'MO', 864: 'AZ'}
# Test['patient_state'] = Test['patient_state'].fillna(Test['patient_zip3'].map(zip_state_mapping))


In [17]:
# male_to_female_mapping = {
#     'C50122': 'C50112',
#     'C50221': 'C50211',
#     'C50421': 'C50411',
#     'C509': 'C5091',
#     'C50922': 'C50912'
# }
# Test['breast_cancer_diagnosis_code'] = Test['breast_cancer_diagnosis_code'].replace(male_to_female_mapping)


In [18]:
def replace_outliers_iqr(df, columns):
    """Clip outliers in the given columns to their 1.5 * IQR fences, in place.

    For every column the lower fence is Q1 - 1.5 * IQR and the upper fence is
    Q3 + 1.5 * IQR, with the quartiles computed from that column. Values below
    the lower fence are set to it, values above the upper fence are set to it,
    and everything else (including missing values) is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    columns : iterable of str
        Names of numeric columns to clip.

    Returns
    -------
    None
    """
    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_limit = q1 - 1.5 * iqr
        upper_limit = q3 + 1.5 * iqr
        # Vectorized clipping replaces the per-element Python lambda
        df[column] = df[column].clip(lower=lower_limit, upper=upper_limit)

# Columns whose outliers are clipped; the call below applies the clipping to Test.
# NOTE(review): no visible cell applies the same clipping to train, so the two
# frames may be preprocessed differently — confirm this is intended.
columns_with_outliers = [
    'patient_age', 'bmi', 'density', 'age_median', 'age_under_10', 'age_10_to_19',
    'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80',
    'male', 'female', 'married', 'divorced', 'widowed', 'family_size', 'family_dual_income',
    'income_household_median', 'income_household_under_5', 'income_household_5_to_10',
    'income_household_10_to_15', 'income_household_15_to_20', 'income_household_20_to_25',
    'income_household_25_to_35', 'income_household_35_to_50', 'income_household_50_to_75',
    'income_household_75_to_100', 'income_household_100_to_150', 'income_household_150_over',
    'income_household_six_figure', 'income_individual_median', 'home_ownership', 'home_value',
    'rent_median', 'rent_burden', 'education_less_highschool', 'education_highschool',
    'education_some_college', 'labor_force_participation', 'unemployment_rate', 'self_employed',
    'farmer', 'race_black', 'race_asian', 'race_native', 'race_pacific', 'race_other',
    'race_multiple', 'hispanic', 'disabled', 'poverty', 'limited_english', 'commute_time',
    'health_uninsured', 'veteran'
]

# Clips each listed column of Test in place
replace_outliers_iqr(Test, columns_with_outliers)

In [19]:
#Train data set
import pandas as pd
# Load the training-set CSV into a DataFrame
train = pd.read_csv('train.csv')

In [20]:
columns_to_fill = [
    'family_size', 'family_dual_income', 'income_household_median', 'income_household_under_5',
    'income_household_5_to_10', 'income_household_10_to_15', 'income_household_15_to_20',
    'income_household_20_to_25', 'income_household_25_to_35', 'income_household_35_to_50',
    'income_household_50_to_75', 'income_household_75_to_100', 'income_household_100_to_150',
    'income_household_150_over', 'income_household_six_figure', 'home_ownership', 'home_value',
    'rent_median', 'rent_burden', 'poverty', 'limited_english', 'Average of Jan-13', 'Average of Feb-13',
    'Average of May-13', 'Average of Jun-13', 'Average of Aug-13', 'Average of Sep-13', 'Average of Oct-13',
    'Average of Nov-13', 'Average of Dec-13', 'Average of Jan-14', 'Average of Feb-14', 'Average of Mar-14',
    'Average of Apr-14', 'Average of May-14', 'Average of Jun-14', 'Average of Nov-14', 'Average of Jan-15',
    'Average of Feb-15', 'Average of Mar-15', 'Average of Apr-15', 'Average of Aug-15', 'Average of Oct-15',
    'Average of Nov-15', 'Average of Dec-15', 'Average of Jan-16', 'Average of Feb-16', 'Average of May-16',
    'Average of Jul-16', 'Average of Nov-16', 'Average of Dec-16', 'Average of Jan-17', 'Average of Jun-17',
    'Average of Jul-17', 'Average of Sep-17', 'Average of Oct-17', 'Average of Nov-17', 'Average of Feb-18',
    'Average of Mar-18', 'Average of Jun-18', 'Average of Jul-18', 'Average of Aug-18', 'Average of Sep-18',
    'Average of Oct-18', 'Average of Nov-18', 'Average of Dec-18'
]

# Replace only the null values in the specified columns with the mean of each column.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on the
# intermediate train[column] is chained assignment, which pandas has deprecated and
# which does not update the frame under Copy-on-Write.
for column in columns_to_fill:
    if train[column].isnull().any():  # Skip columns that have nothing to fill
        train[column] = train[column].fillna(train[column].mean())


In [21]:
# Define a function to determine payer_type based on conditions
def determine_payer_type(row):
    """Return the row's payer_type, imputing a label when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'payer_type', 'patient_age' and
        'income_individual_median'.

    Returns
    -------
    The existing 'payer_type' when it is not null. Otherwise
    'MEDICARE ADVANTAGE' for patient_age >= 65, 'MEDICAID' when
    income_individual_median < 31255, and 'COMMERCIAL' in every other case.
    """
    # Keep whatever is already recorded
    if not pd.isnull(row['payer_type']):
        return row['payer_type']

    if row['patient_age'] >= 65:
        return 'MEDICARE ADVANTAGE'
    if row['income_individual_median'] < 31255:
        return 'MEDICAID'

    # Every remaining case maps to COMMERCIAL
    return 'COMMERCIAL'

# Run the rule on every row; rows that already have a payer type are returned unchanged
train['payer_type'] = train.apply(determine_payer_type, axis=1)


In [22]:
import numpy as np

# Define a function to determine patient_race based on race percentages
def determine_patient_race(row):
    """Return the row's patient_race, imputing a label when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'patient_race', 'race_black', 'race_asian' and
        'race_other'.

    Returns
    -------
    The existing 'patient_race' when it is not null. Otherwise the first
    matching rule wins: race_black > 30 -> 'Black', race_asian > 25 ->
    'Asian', race_other > 30 -> 'Other', and 'White' in every other case.
    """
    # Keep whatever is already recorded
    if not pd.isnull(row['patient_race']):
        return row['patient_race']

    # Rules are checked in priority order; the first match wins
    if row['race_black'] > 30:
        return 'Black'
    if row['race_asian'] > 25:
        return 'Asian'
    if row['race_other'] > 30:
        return 'Other'

    # Default when no threshold is met (a separate race_white check would
    # return the same label, so it is not needed)
    return 'White'

# Run the rule on every row; rows that already have a race are returned unchanged
train['patient_race'] = train.apply(determine_patient_race, axis=1)


In [23]:
import pandas as pd
import numpy as np

# Fill missing BMI values in train

# Fallback BMI used when no usable (ZIP3, age band) average exists.
DEFAULT_BMI = 29.168808

# Step 1: Group data by ZIP code and 10-year age band ([0, 10), [10, 20), ..., [100, 110))
age_groups = pd.cut(train['patient_age'], bins=range(0, 120, 10), right=False)
grouped = train.groupby(['patient_zip3', age_groups], observed=True)

# Step 2: Per-row group mean of the known BMI values, kept only for groups
# that have more than one non-null BMI (otherwise left as NaN)
bmi_in_group = grouped['bmi']
group_mean_bmi = bmi_in_group.transform('mean').where(bmi_in_group.transform('count') > 1)

# Step 3: Fill missing BMI with the group mean, then with the default for
# rows whose group has no usable mean
train['bmi'] = train['bmi'].fillna(group_mean_bmi).fillna(DEFAULT_BMI)


In [24]:
# Rebinding instead of dropping in place (and ignoring already-missing columns)
# keeps this cell safe to re-run without a KeyError.
train = train.drop(
    columns=['metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type'],
    errors='ignore',
)

In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Fit a fresh LabelEncoder on the race labels present in train, then swap the
# labels for their integer codes.
# NOTE(review): this encoder is fit independently of the one fit on Test, so
# the codes only agree if both frames contain the same set of labels — confirm.
label_encoder = LabelEncoder()
label_encoder.fit(train['patient_race'])
train['patient_race'] = label_encoder.transform(train['patient_race'])

print("Updated train DataFrame:\n", train)

Updated train DataFrame:
        patient_id  patient_race  payer_type patient_state  patient_zip3  \
0          268700             4  COMMERCIAL            AR           724   
1          484983             4    MEDICAID            IL           629   
2          277055             4  COMMERCIAL            CA           925   
3          320055             2    MEDICAID            CA           900   
4          190386             4  COMMERCIAL            CA           934   
...           ...           ...         ...           ...           ...   
13168      588544             2    MEDICAID            PA           191   
13169      393047             4  COMMERCIAL            TX           757   
13170      790904             4  COMMERCIAL            CA           928   
13171      455518             4  COMMERCIAL            MI           481   
13172      379418             2    MEDICAID            CA           900   

          Region            Division  patient_age patient_gender        b

In [26]:



# Integer code for each payer type
payer_type_mapping = {'COMMERCIAL': 1, 'MEDICAID': 2, 'MEDICARE ADVANTAGE': 3}

# Replace the labels with their codes. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
train['payer_type'] = train['payer_type'].replace(payer_type_mapping).infer_objects()

print("Updated train DataFrame:\n", train)

Updated train DataFrame:
        patient_id  patient_race  payer_type patient_state  patient_zip3  \
0          268700             4           1            AR           724   
1          484983             4           2            IL           629   
2          277055             4           1            CA           925   
3          320055             2           2            CA           900   
4          190386             4           1            CA           934   
...           ...           ...         ...           ...           ...   
13168      588544             2           2            PA           191   
13169      393047             4           1            TX           757   
13170      790904             4           1            CA           928   
13171      455518             4           1            MI           481   
13172      379418             2           2            CA           900   

          Region            Division  patient_age patient_gender        b

In [27]:
import pandas as pd



# Integer code for each gender label
gender_mapping = {'M': 0, 'F': 1}

# Replace the labels with their codes. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
train['patient_gender'] = train['patient_gender'].replace(gender_mapping).infer_objects()

print("Updated train DataFrame:\n", train)

Updated train DataFrame:
        patient_id  patient_race  payer_type patient_state  patient_zip3  \
0          268700             4           1            AR           724   
1          484983             4           2            IL           629   
2          277055             4           1            CA           925   
3          320055             2           2            CA           900   
4          190386             4           1            CA           934   
...           ...           ...         ...           ...           ...   
13168      588544             2           2            PA           191   
13169      393047             4           1            TX           757   
13170      790904             4           1            CA           928   
13171      455518             4           1            MI           481   
13172      379418             2           2            CA           900   

          Region            Division  patient_age  patient_gender        

In [28]:

# Integer code for each Region label
Region_mapping = {'South': 1, 'Midwest': 2, 'West': 3, 'Northeast': 4}

# Replace values in the Region column. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
train['Region'] = train['Region'].replace(Region_mapping).infer_objects()

print("Updated train DataFrame:\n", train)

Updated train DataFrame:
        patient_id  patient_race  payer_type patient_state  patient_zip3  \
0          268700             4           1            AR           724   
1          484983             4           2            IL           629   
2          277055             4           1            CA           925   
3          320055             2           2            CA           900   
4          190386             4           1            CA           934   
...           ...           ...         ...           ...           ...   
13168      588544             2           2            PA           191   
13169      393047             4           1            TX           757   
13170      790904             4           1            CA           928   
13171      455518             4           1            MI           481   
13172      379418             2           2            CA           900   

       Region            Division  patient_age  patient_gender        bmi

In [29]:
import pandas as pd



# Integer code for each Division label.
# NOTE(review): there is no entry for 'New England'; replace() leaves unmapped
# labels unchanged, so confirm the data contains no such rows.
Division_mapping = {'West South Central': 1, 'East North Central': 2, 'Pacific': 3, 'South Atlantic': 4, 'East South Central':5, 'Middle Atlantic':6, 'West North Central':7, 'Mountain':8}

# Replace values in the Division column. infer_objects() makes the numeric dtype
# explicit instead of relying on replace()'s deprecated implicit downcast of
# object columns.
train['Division'] = train['Division'].replace(Division_mapping).infer_objects()

print("Updated train DataFrame:\n", train)


Updated train DataFrame:
        patient_id  patient_race  payer_type patient_state  patient_zip3  \
0          268700             4           1            AR           724   
1          484983             4           2            IL           629   
2          277055             4           1            CA           925   
3          320055             2           2            CA           900   
4          190386             4           1            CA           934   
...           ...           ...         ...           ...           ...   
13168      588544             2           2            PA           191   
13169      393047             4           1            TX           757   
13170      790904             4           1            CA           928   
13171      455518             4           1            MI           481   
13172      379418             2           2            CA           900   

       Region  Division  patient_age  patient_gender        bmi  ...  \
0

In [30]:
# import pandas as pd
# from sklearn.ensemble import HistGradientBoostingRegressor
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform, randint
# import numpy as np

# # Drop 'Unnamed: 0' if it exists
# if 'Unnamed: 0' in train.columns:
#     train.drop(columns=['Unnamed: 0'], inplace=True)

# # Define features and target variable
# X_train = train.drop(columns=["metastatic_diagnosis_period"])
# y_train = train["metastatic_diagnosis_period"]

# # Define preprocessing steps
# categorical_cols = X_train.select_dtypes(include=["object"]).columns
# categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
# if 'Unnamed: 0' in numerical_cols:
#     numerical_cols = numerical_cols.drop('Unnamed: 0')

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', 'passthrough', numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define the model
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', HistGradientBoostingRegressor())
# ])

# # Define hyperparameters for random search
# param_dist = {
#     'regressor__max_depth': randint(3, 20),
#     'regressor__learning_rate': uniform(0.01, 0.5),
#     'regressor__max_iter': randint(50, 500),
#     'regressor__min_samples_leaf': randint(1, 20)
# }

# # Perform random search
# random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# random_search.fit(X_train, y_train)

# # Get the best model
# best_model = random_search.best_estimator_

# # Evaluate the best model on the training data
# train_predictions = best_model.predict(X_train)
# train_mse = mean_squared_error(y_train, train_predictions)
# train_rmse = np.sqrt(train_mse)
# print(f"Least training MSE: {train_mse}")
# print(f"Corresponding RMSE: {train_rmse}")

# # Drop 'Unnamed: 0' in test data if it exists
# if 'Unnamed: 0' in Test.columns:
#     Test.drop(columns=['Unnamed: 0'], inplace=True)

# # Make predictions using the best model
# predictions = best_model.predict(Test)

# # Save predictions to a CSV file
# solution_df = pd.DataFrame({'patient_id': Test['patient_id'], 'metastatic_diagnosis_period': predictions})
# solution_df.to_csv('solution.csv', index=False)


In [31]:
# import pandas as pd

# df = pd.read_csv('solution.csv')

# df['patient_id'] = df['patient_id'].astype(int)

# df.to_csv('solution.csv', index=False)

In [None]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import uniform, randint
import numpy as np

# Drop 'Unnamed: 0' if it exists
if 'Unnamed: 0' in train.columns:
    train = train.drop(columns=['Unnamed: 0'])

# Build the feature matrix and target.
# patient_id only identifies a row, so it is excluded from the features;
# otherwise the trees are free to split on an arbitrary identifier.
TARGET = "metastatic_diagnosis_period"
feature_cols = [col for col in train.columns if col not in (TARGET, "patient_id")]
X = train[feature_cols]
y = train[TARGET]

# Split the train set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps.
# Selecting every numeric dtype (not just int64/float64) matters because the
# ColumnTransformer drops any column that is not listed in a transformer.
categorical_cols = X_train.select_dtypes(include=["object"]).columns
numerical_cols = X_train.select_dtypes(include="number").columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', HistGradientBoostingRegressor(random_state=42))
])

# Define hyperparameters for random search
param_dist = {
    'regressor__max_depth': randint(3, 20),
    'regressor__learning_rate': uniform(0.01, 0.5),
    'regressor__max_iter': randint(50, 500),
    'regressor__min_samples_leaf': randint(1, 20)
}

# Perform random search (5-fold CV on the training split only)
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Get the best model (refit by RandomizedSearchCV on X_train)
best_model = random_search.best_estimator_

# Evaluate the best model on the validation data
val_predictions = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
print(f"Validation MSE: {val_mse}")
print(f"Validation RMSE: {val_rmse}")

# Drop 'Unnamed: 0' in test data if it exists
if 'Unnamed: 0' in Test.columns:
    Test = Test.drop(columns=['Unnamed: 0'])

# Make predictions using the best model, feeding exactly the training features
# (a missing column fails here with a KeyError instead of deeper in the pipeline)
predictions = best_model.predict(Test[feature_cols])

# Save predictions to a CSV file
solution_df = pd.DataFrame({'patient_id': Test['patient_id'], 'metastatic_diagnosis_period': predictions})
solution_df.to_csv('solution1.csv', index=False)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [8]:
import pandas as pd

# Toy frame with two categorical columns and one numeric column
data = dict(
    color=['red', 'green', 'blue', 'green', 'red', 'blue'],
    size=['S', 'M', 'L', 'M', 'S', 'L'],
    price=[10, 20, 30, 20, 10, 30],
)
df = pd.DataFrame(data)

# One-hot encode both categorical columns, then cast the indicator
# columns (and price) to int so they show as 0/1 rather than False/True
dummies = pd.get_dummies(df, columns=['color', 'size'])
df_encoded = dummies.astype(int)

# Show the encoded frame
print(df_encoded)


   price  color_blue  color_green  color_red  size_L  size_M  size_S
0     10           0            0          1       0       0       1
1     20           0            1          0       0       1       0
2     30           1            0          0       1       0       0
3     20           0            1          0       0       1       0
4     10           0            0          1       0       0       1
5     30           1            0          0       1       0       0


In [6]:
# df_coded was never defined in the notebook, so this cell failed on a fresh
# kernel. Encoding only `color` reproduces the frame shown in the saved output.
df_coded = pd.get_dummies(df, columns=['color'])
df_coded

Unnamed: 0,size,price,color_blue,color_green,color_red
0,S,10,False,False,True
1,M,20,False,True,False
2,L,30,True,False,False
3,M,20,False,True,False
4,S,10,False,False,True
5,L,30,True,False,False
