In [1]:
import numpy as np
import pandas as pd

# Multiple Linear Regression with clinical data (CHUM)
- y = overall survival (os_days) in days
- x = sex, age, weight, height, smoking_habit

**TODO:**
- [x] smoking_habit to 1-current, 0-never, 0.5-former
- [x] Create x and y
- [x] apply regression



In [2]:
clinical_chum = pd.read_csv("../../Data/clinical_CHUM.csv")
#clinical_iucpq = pd.read_csv("../../Data/clinical_IUCPQ.csv")
clinical_chum.drop(columns="Unnamed: 0", inplace=True)
#clinical_iucpq.drop(columns="Unnamed: 0", inplace=True)

In [4]:
x_pre = clinical_chum[["sex", "age", "weight", "height", "smoking_habit"]]
y = clinical_chum["os_days"]

In [5]:
# encode sex to numbers
conditions_sex = [(x_pre['sex'] == "Female"),
                    (x_pre['sex'] == "Male")]
values_sex = [0, 1]
x_pre['sex_binary'] = np.select(conditions_sex, values_sex)

# encode smoking habit
conditions_smoking = [(x_pre['smoking_habit'] == "Current"),
                        (x_pre['smoking_habit'] == "Former"),  
                        (x_pre['smoking_habit'] == "Never"),  
                        (x_pre['smoking_habit'] == "nan")]
values_smoking = [1, 0.5, 0, np.NAN]
x_pre['smoking_habit_encoded'] = np.select(conditions_sex, values_sex)

# drop useless columns
x_pre.drop(columns=["sex", "smoking_habit"], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_pre['sex_binary'] = np.select(conditions_sex, values_sex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_pre['smoking_habit_encoded'] = np.select(conditions_sex, values_sex)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_pre.drop(columns=["sex", "smoking_habit"], inplace= True)


In [6]:
x_pre.describe()

Unnamed: 0,age,weight,height,sex_binary,smoking_habit_encoded
count,309.0,309.0,309.0,309.0,309.0
mean,66.469715,69.590939,165.899676,0.482201,0.482201
std,8.79574,16.815582,9.597735,0.500494,0.500494
min,39.483,31.0,144.0,0.0,0.0
25%,61.979,57.6,158.0,0.0,0.0
50%,67.362,67.3,165.0,0.0,0.0
75%,72.225,80.0,174.0,1.0,1.0
max,88.999,165.0,190.0,1.0,1.0


# Multiple Linear Regression

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

def mult_lin_reg(x_df, y):
    x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size = 0.3, random_state = 100)

    #Fitting the Multiple Linear Regression model
    mlr = LinearRegression()  
    mlr.fit(x_train, y_train)
    print("Intercept: ", mlr.intercept_)
    print("Coefficients:")
    list(zip(x_df, mlr.coef_))

    #Prediction of test set
    y_pred_mlr = mlr.predict(x_test)
    #Actual value and the predicted value
    mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
    mlr_diff.head()

    # Errors
    meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
    meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
    rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
    print('R squared: {:.2f}'.format(mlr.score(x_df,y)*100))
    print('Mean Absolute Error:', meanAbErr)
    print('Mean Square Error:', meanSqErr)
    print('Root Mean Square Error:', rootMeanSqErr)


# One-Hot Encode Categorical data

In [11]:
categorical_cols = ["sex","smoking_habit", "smoking_first_io", "ecog_status", "histology_group", "stage_patho_dx", "stage_group_patho_dx","stage_1st_io", "stage_group_1st_io", "biospecimen", "biosite", "alk_status", "braf_status", "egfr_status", "kras_status", "met_status", "nras_status", "pdl1_tps", "pdl1_tps_num", "pdl1_group", "ntrk_status", "ret_status", "ros1_status", "tp53_status", "pre_chemo", "first_line_io", "io_type"]

# Which columns do I want to use?
# sex and smoking habit
clinical_chum_one_hot = pd.get_dummies(clinical_chum, columns=["sex", "smoking_habit"], prefix=["sex", "smoke"])
clinical_chum_one_hot.columns

Index(['oncotech_id', 'center', 'dob', 'age', 'weight', 'height', 'bmi',
       'smoking_1st_io', 'date_patho_dx', 'ecog_status', 'histology_group',
       'stage_patho_dx', 'stage_group_patho_dx', 'stage_1st_io',
       'stage_group_1st_io', 'biospecimen', 'biosite', 'alk_status',
       'braf_status', 'egfr_status', 'kras_status', 'met_status',
       'nras_status', 'pdl1_tps', 'pdl1_tps_num', 'pdl1_group', 'ntrk_status',
       'ret_status', 'ros1_status', 'tp53_status', 'pre_radiation',
       'pre_chemo', 'first_line_io', 'io_type', 'io_start_date',
       'io_finish_date', 'date_progression', 'date_death',
       'date_last_follow_up', 'progression', 'death', 'recist_9weeks',
       'recist_6months', 'recist_12months', 'best_clinical_response', 'orr',
       'pfs_days', 'pfs_months', 'os_days', 'os_months', 'pfs_6months',
       'pfs_1year', 'pfs_2year', 'os_1year', 'sex_Female', 'sex_Male',
       'smoke_Current', 'smoke_Former', 'smoke_Never'],
      dtype='object')

In [12]:
x_one_hot = clinical_chum_one_hot[['age', 'weight', 'height','sex_Female', 'sex_Male',
       'smoke_Current', 'smoke_Former', 'smoke_Never']]
y = clinical_chum_one_hot["os_days"]
mult_lin_reg(x_one_hot, y)

Intercept:  -686.3997724059951
Coefficients:
R squared: 4.48
Mean Absolute Error: 333.7234081060195
Mean Square Error: 170746.4836686787
Root Mean Square Error: 413.2148154031734
