# Lung cancer risk prediction

In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
X=pd.read_csv('x_train.csv')
Y=pd.read_csv('y_train.csv')

In [34]:
X1data=X.iloc[:1250,:]
X2data=X.iloc[1250:1400,:]

In [35]:
Y1data=Y.iloc[:1250,:]
Y2data=Y.iloc[1250:1400,:]

In [36]:
data=pd.merge(X,Y,on='Id')
data

Unnamed: 0,Id,Age,Smoking History,Genetic Predisposition Score,Exposure to Carcinogens,Air Pollution Index,Dietary Habits Score,Physical Activity Level,BMI,Family History of Lung Cancer,Income Level,Risk
0,1,63,25,0.32,3,26,4,4,22.94,1,105904,19.0066
1,2,76,48,0.85,42,77,5,1,37.23,1,40271,67.6121
2,3,53,13,0.73,21,39,4,1,23.80,1,85875,7.3692
3,4,39,9,0.10,35,92,2,3,22.52,1,139509,5.0267
4,5,67,53,0.43,41,58,1,4,23.81,1,138469,79.1462
...,...,...,...,...,...,...,...,...,...,...,...,...
1395,1396,50,41,0.25,38,99,8,4,37.70,0,29610,49.2532
1396,1397,35,59,0.31,4,26,4,3,25.80,0,48615,95.9330
1397,1398,62,14,0.61,42,60,2,4,38.93,1,96644,10.5895
1398,1399,26,30,0.90,18,18,9,4,32.02,1,124135,26.9020


# Data Augmentation

In [37]:
data = np.array(data, dtype=np.float32)
shift_range = 0.55
scaling_factor = 0.005  
noise_stddev = 0.055

augmented_data = []

for value in data:
    
    jittered_value = value + np.random.uniform(-shift_range, shift_range)
    scaled_value = value * (1 + np.random.uniform(-scaling_factor, scaling_factor))
    noisy_value = value + np.random.normal(0, noise_stddev)
    augmented_data.extend([jittered_value, scaled_value, noisy_value])
augmented_data = np.array(augmented_data, dtype=np.float32)
column_names=['Id','Age','Smoking History','Genetic Predisposition Score','Exposure to Carcinogens','Air Pollution Index','Dietary Habits Score','Physical Activity Level','BMI','Family History of Lung Cancer','Income Level','Risk']
np.savetxt('augmented_data.csv', augmented_data, delimiter=',', header=','.join(column_names), comments='')
# np.savetxt('augmented_data.csv', augmented_data, delimiter=',')
print("Original Data Shape:", data.shape)
print("Augmented Data Shape:", augmented_data.shape)


Original Data Shape: (1400, 12)
Augmented Data Shape: (4200, 12)


In [38]:
df1=pd.read_csv('augmented_data.csv')
df1.head()

Unnamed: 0,Id,Age,Smoking History,Genetic Predisposition Score,Exposure to Carcinogens,Air Pollution Index,Dietary Habits Score,Physical Activity Level,BMI,Family History of Lung Cancer,Income Level,Risk
0,0.935804,62.935802,24.935804,0.255804,2.935804,25.935804,3.935804,3.935804,22.875805,0.935804,105903.9375,18.942404
1,1.000101,63.00634,25.002516,0.320032,3.000302,26.002617,4.000402,4.000402,22.942308,1.000101,105914.65625,19.008512
2,0.991732,62.991734,24.991732,0.311732,2.991732,25.991732,3.991732,3.991732,22.931732,0.991732,105903.992188,18.998331
3,2.056016,76.056015,48.056015,0.906016,42.056015,77.056015,5.056016,1.056016,37.286015,1.056016,40271.054688,67.668114
4,2.008922,76.33905,48.214134,0.853792,42.18737,77.343513,5.022306,1.004461,37.396088,1.004461,40450.65625,67.913727


In [39]:
df1=df1.drop(columns = ['Income Level'])
df1.head()

Unnamed: 0,Id,Age,Smoking History,Genetic Predisposition Score,Exposure to Carcinogens,Air Pollution Index,Dietary Habits Score,Physical Activity Level,BMI,Family History of Lung Cancer,Risk
0,0.935804,62.935802,24.935804,0.255804,2.935804,25.935804,3.935804,3.935804,22.875805,0.935804,18.942404
1,1.000101,63.00634,25.002516,0.320032,3.000302,26.002617,4.000402,4.000402,22.942308,1.000101,19.008512
2,0.991732,62.991734,24.991732,0.311732,2.991732,25.991732,3.991732,3.991732,22.931732,0.991732,18.998331
3,2.056016,76.056015,48.056015,0.906016,42.056015,77.056015,5.056016,1.056016,37.286015,1.056016,67.668114
4,2.008922,76.33905,48.214134,0.853792,42.18737,77.343513,5.022306,1.004461,37.396088,1.004461,67.913727


In [40]:
X1new=df1.iloc[:,1:10]
Y1new=df1.iloc[:,10]
X2new=X2data.iloc[:,1:10]
X2new.head()

Unnamed: 0,Age,Smoking History,Genetic Predisposition Score,Exposure to Carcinogens,Air Pollution Index,Dietary Habits Score,Physical Activity Level,BMI,Family History of Lung Cancer
1250,49,22,0.68,4,40,2,2,22.05,0
1251,41,3,0.58,22,32,7,3,24.33,1
1252,57,12,0.43,38,22,4,3,23.46,1
1253,71,28,0.28,23,93,2,3,33.07,0
1254,79,30,0.77,26,66,7,2,22.11,1


In [41]:
# from sklearn.model_selection import train_test_split
# x_train1,x_test1,y_train1,y_test1=train_test_split(X1new,Y1new,test_size=.2,random_state=45)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform the training data
X1new = scaler.fit_transform(X1new)

# Transform the testing data using the scaler fitted on the training data
X2new=X2data.iloc[:,1:10]
Y2new=Y2data.iloc[:,1]
X2new = scaler.transform(X2new)
# X2new

# Using Random Forest

In [51]:
from sklearn.ensemble import RandomForestRegressor
model1 = RandomForestRegressor()
model1.fit(X1new,Y1new)

In [52]:
predictions1 = model1.predict(X2new)
rmse2 = mean_squared_error(Y2new, predictions1, squared=False)
rmse2

0.22130431647855128

In [53]:
from sklearn.ensemble import GradientBoostingRegressor
model2 = GradientBoostingRegressor()
model2.fit(X1new,Y1new)

In [54]:
predictions2 = model2.predict(X2new)
rmse3 = mean_squared_error(Y2new, predictions2, squared=False)
rmse3

0.6902871440704961

# Using LinearGAM

In [55]:
from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error

# Define and fit the GAM model
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5)+s(6)+s(7)+s(8)).fit(X1new, Y1new)

# Predict on the testing data
y_pred6 = gam.predict(X2new)

rmse4 = mean_squared_error(Y2new, y_pred6, squared=False)
rmse4


0.736020839374386

In [56]:
test=pd.read_csv('test.csv')
test1=test.iloc[:,1:10]
test1

Unnamed: 0,Age,Smoking History,Genetic Predisposition Score,Exposure to Carcinogens,Air Pollution Index,Dietary Habits Score,Physical Activity Level,BMI,Family History of Lung Cancer
0,42,4,0.58,48,89,9,4,33.39,0
1,51,59,0.73,25,56,3,1,22.52,0
2,58,4,0.36,7,31,7,4,19.57,1
3,51,28,0.64,38,59,7,2,33.98,0
4,41,8,0.47,28,2,4,3,29.24,1
...,...,...,...,...,...,...,...,...,...
144,26,41,0.16,10,49,8,3,31.16,1
145,27,39,0.18,6,96,4,2,31.62,0
146,47,13,0.20,13,73,8,2,18.74,1
147,79,11,0.39,1,29,4,4,26.73,0


In [57]:
test2 = scaler.transform(test1)
prediction=gam.predict(test2)
prediction

array([ 3.66931713, 96.89072118,  2.97699626, 24.46036014,  4.26515322,
        3.22559129, 48.45298084, 61.66081113, 16.70414394, 48.96786533,
       88.09023234, 84.0601963 , 10.42957275, 25.79695505, 68.39297923,
       65.68219687, 27.0692391 ,  7.63742611, 35.56295913,  6.68181327,
       59.21584841, 70.75995793, 15.18444679, 15.97678496,  2.75258366,
       65.74965873, 13.61521066, 38.89858489,  3.33041495, 56.19806942,
       33.98450363, 78.58698223,  6.01575719,  4.54507872,  3.47202881,
       36.69531248, 11.29629873, 33.92442079, 16.92235398, 30.24548074,
        2.37488601,  5.3500683 , 57.37416969, 17.40837272, 61.9353699 ,
       46.22675122,  6.54122291, 82.65738558,  7.04324563, 61.00810507,
       63.44379905,  8.43790483, 47.74301179, 13.03085966, 20.25368136,
       74.77535955, 24.9866876 , 15.93455257, 46.9349335 , 14.70743784,
       74.58848394, 40.38774712, 14.01823591, 90.02299358, 27.04241806,
       74.94175583,  3.79551732, 85.43914511,  5.21247974, 22.11

In [58]:
import pandas as pd
df = pd.read_csv('test.csv') 
id_column = df['Id']  
id_array = id_column.to_numpy()

In [59]:
import pandas as pd
result_df = pd.DataFrame({'Id': id_array, 'Risk': prediction})
csv_filename = 'predicted_values3.csv'
result_df.to_csv(csv_filename, index=False)
print(f"Predicted values saved to {csv_filename}.")

Predicted values saved to predicted_values3.csv.
