In [72]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [74]:
# Step 1: Load the dataset
url = "https://raw.githubusercontent.com/ADITHYASNAIR2021/Dataset-cart/main/Lung%20Cancer.csv"
data = pd.read_csv(url)
print("First 5 rows of the dataset:\n", data.head())


First 5 rows of the dataset:
   Patient Id  Age  Gender  AirPollution  Alcoholuse  DustAllergy  \
0         P1   33       1             2           4            5   
1        P10   17       1             3           1            5   
2       P100   35       1             4           5            6   
3      P1000   37       1             7           7            7   
4       P101   46       1             6           8            7   

   OccuPationalHazards  GeneticRisk  chronicLungDisease  BalancedDiet  ...  \
0                    4            3                   2             2  ...   
1                    3            4                   2             2  ...   
2                    5            5                   4             6  ...   
3                    7            6                   7             7  ...   
4                    7            7                   6             7  ...   

   Fatigue  WeightLoss  ShortnessofBreath  Wheezing  SwallowingDifficulty  \
0        3     

In [75]:
# Check unique values in the 'Level' column
print("\nUnique values in 'Level' before replacement:\n", data['Level'].unique())

# Replacing categorical levels with numeric values
data['Level'] = data['Level'].replace({'Medium': 'High', 'High': 1, 'Low': 0})

# Check again after replacement
print("\nUnique values in 'Level' after replacement:\n", data['Level'].unique())



Unique values in 'Level' before replacement:
 ['Low' 'High' 'Medium']

Unique values in 'Level' after replacement:
 [0 1 'High']


In [78]:
# Ensuring all values in 'Level' are properly converted
data['Level'] = pd.to_numeric(data['Level'], errors='coerce')

# Dropping any rows with NaN values (if replacement was incomplete)
data.dropna(subset=['Level'], inplace=True)

print("\nData after processing:\n", data.head())


Data after processing:
   Patient Id  Age  Gender  AirPollution  Alcoholuse  DustAllergy  \
0         P1   33       1             2           4            5   
1        P10   17       1             3           1            5   
2       P100   35       1             4           5            6   
3      P1000   37       1             7           7            7   
4       P101   46       1             6           8            7   

   OccuPationalHazards  GeneticRisk  chronicLungDisease  BalancedDiet  ...  \
0                    4            3                   2             2  ...   
1                    3            4                   2             2  ...   
2                    5            5                   4             6  ...   
3                    7            6                   7             7  ...   
4                    7            7                   6             7  ...   

   Fatigue  WeightLoss  ShortnessofBreath  Wheezing  SwallowingDifficulty  \
0        3          

In [80]:
X = data.drop('Level', axis=1)  
y = data['Level'] 


In [82]:
# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("\nTraining data size:", X_train.shape)
print("Testing data size:", X_test.shape)



Training data size: (501, 24)
Testing data size: (168, 24)


In [84]:
X_train = pd.get_dummies(X_train)

In [86]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

In [88]:
# Step 6: Initialize and train the Logistic Regression model
Log_reg = LogisticRegression(fit_intercept=True, C=1e15, solver='newton-cg', max_iter=300, penalty='l2', multi_class='ovr')
Log_reg.fit(X_train, y_train)

In [90]:
# Ensure both train and test sets have the same one-hot encoding
X_test_encoded = pd.get_dummies(X_test)

# Align columns of X_test with X_train
X_test_encoded = X_test_encoded.reindex(columns=X_train.columns, fill_value=0)

# Now make predictions on the test data
Log_pred = Log_reg.predict(X_test_encoded)


In [92]:
# Step 8: Evaluate the model
lor_accuracy = accuracy_score(y_test, Log_pred)
print("\nAccuracy of the Logistic Regression model on test data:", lor_accuracy)



Accuracy of the Logistic Regression model on test data: 0.9940476190476191


In [94]:
# Step 9: Perform cross-validation to validate model performance
logreg = LogisticRegression(max_iter=1500)
scores_Log = cross_val_score(logreg, X_train, y_train, cv=20)
print("Cross-validated score (mean of 20-fold CV):", scores_Log.mean())


Cross-validated score (mean of 20-fold CV): 0.998


In [95]:
# Step 10: Take input for prediction
Age = 50
Gender = 1
AirPollution = 3
Alcoholuse = 1
DustAllergy = 2
OccuPationalHazards = 3
GeneticRisk = 2
chronicLungDisease = 1
BalancedDiet = 2
Obesity = 3
Smoking = 3
PassiveSmoker = 2
ChestPain = 3
CoughingofBlood = 2
Fatigue = 1
WeightLoss = 2
ShortnessofBreath = 2
Wheezing = 2
SwallowingDifficulty = 3
ClubbingofFingerNails = 1
FrequentCold = 2
DryCough = 3
Snoring = 3


In [96]:
# Step 11: Prepare the input data for prediction
data_input = {
    'Age': Age, 'Gender': Gender, 'AirPollution': AirPollution, 'Alcoholuse': Alcoholuse,
    'DustAllergy': DustAllergy, 'OccuPationalHazards': OccuPationalHazards, 'GeneticRisk': GeneticRisk,
    'chronicLungDisease': chronicLungDisease, 'BalancedDiet': BalancedDiet, 'Obesity': Obesity,
    'Smoking': Smoking, 'PassiveSmoker': PassiveSmoker, 'ChestPain': ChestPain, 'CoughingofBlood': CoughingofBlood,
    'Fatigue': Fatigue, 'WeightLoss': WeightLoss, 'ShortnessofBreath': ShortnessofBreath, 'Wheezing': Wheezing,
    'SwallowingDifficulty': SwallowingDifficulty, 'ClubbingofFingerNails': ClubbingofFingerNails,
    'FrequentCold': FrequentCold, 'DryCough': DryCough, 'Snoring': Snoring
}

In [97]:
# Step 12: Convert input to a DataFrame for prediction
cust_df = pd.DataFrame(data_input, index=[0])
print("\nInput data for prediction:\n", cust_df)



Input data for prediction:
    Age  Gender  AirPollution  Alcoholuse  DustAllergy  OccuPationalHazards  \
0   50       1             3           1            2                    3   

   GeneticRisk  chronicLungDisease  BalancedDiet  Obesity  ...  \
0            2                   1             2        3  ...   

   CoughingofBlood  Fatigue  WeightLoss  ShortnessofBreath  Wheezing  \
0                2        1           2                  2         2   

   SwallowingDifficulty  ClubbingofFingerNails  FrequentCold  DryCough  \
0                     3                      1             2         3   

   Snoring  
0        3  

[1 rows x 23 columns]


In [98]:
# Step 13: Make the prediction
cust_df_encoded = pd.get_dummies(cust_df)

# Align cust_df columns with X_train columns
cust_df_encoded = cust_df_encoded.reindex(columns=X_train.columns, fill_value=0)

# Now make predictions using the trained model
costpredLog = Log_reg.predict(cust_df_encoded)

In [104]:
# Step 14: Generate the final output
if costpredLog == 0:
    print("\nPrediction: There is less chance for the patient to catch cancer.")
else:
    print("\nPrediction: There is more chance for the patient to catch cancer.")


Prediction: There is less chance for the patient to catch cancer.
