In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm

# Load the data (we know it works with ISO-8859-1 encoding)
df = pd.read_csv('cancer_reg.csv', encoding='ISO-8859-1')

# Remove rows with missing values
df_clean = df.dropna()

In [11]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('cancer_reg.csv', encoding='ISO-8859-1')

# Display data types
print("Data types of columns:")
print(df.dtypes)

# Display unique values in 'binnedInc' column
print("\
Unique values in 'binnedInc' column:")
print(df['binnedInc'].unique())

# Display information about non-numeric columns
non_numeric = df.select_dtypes(exclude=[np.number])
print("\
Non-numeric columns:")
for col in non_numeric.columns:
    print(f"\
{col}:")
    print(df[col].value_counts().head())

# Check for missing values
print("\
Missing values:")
print(df.isnull().sum())

Data types of columns:
avgAnnCount                float64
avgDeathsPerYear             int64
TARGET_deathRate           float64
incidenceRate              float64
medIncome                    int64
popEst2015                   int64
povertyPercent             float64
studyPerCap                float64
binnedInc                   object
MedianAge                  float64
MedianAgeMale              float64
MedianAgeFemale            float64
Geography                   object
AvgHouseholdSize           float64
PercentMarried             float64
PctNoHS18_24               float64
PctHS18_24                 float64
PctSomeCol18_24            float64
PctBachDeg18_24            float64
PctHS25_Over               float64
PctBachDeg25_Over          float64
PctEmployed16_Over         float64
PctUnemployed16_Over       float64
PctPrivateCoverage         float64
PctPrivateCoverageAlone    float64
PctEmpPrivCoverage         float64
PctPublicCoverage          float64
PctPublicCoverageAlone     float

In [None]:
# Separate features and target
X = df_clean.drop(['TARGET_deathRate', 'Geography'], axis=1)
y = df_clean['TARGET_deathRate']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [18]:
# Redefine the evaluate_model function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2
lr = LinearRegression()
lr_mse, lr_r2 = evaluate_model(lr, X_train_scaled, X_test_scaled, y_train, y_test)

# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge_mse, ridge_r2 = evaluate_model(ridge, X_train_scaled, X_test_scaled, y_train, y_test)

# Lasso Regression
lasso = Lasso(alpha=1.0)
lasso_mse, lasso_r2 = evaluate_model(lasso, X_train_scaled, X_test_scaled, y_train, y_test)

print("Model Evaluation Results:")
print(f"Linear Regression - MSE: {lr_mse:.2f}, R2: {lr_r2:.2f}")
print(f"Ridge Regression - MSE: {ridge_mse:.2f}, R2: {ridge_r2:.2f}")
print(f"Lasso Regression - MSE: {lasso_mse:.2f}, R2: {lasso_r2:.2f}")

# Feature selection using Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Get feature importances
importances = dt.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Select top 10 features
top_features = feature_importances.head(10)['feature'].tolist()

print("\
Top 10 features selected by Decision Tree:")
print(top_features)

# Fit Linear Regression with selected features
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

lr_selected = LinearRegression()
lr_selected_mse, lr_selected_r2 = evaluate_model(lr_selected, X_train_selected, X_test_selected, y_train, y_test)

print("\
Linear Regression with selected features:")
print(f"MSE: {lr_selected_mse:.2f}, R2: {lr_selected_r2:.2f}")

# Visualize feature importances
plt.figure(figsize=(12, 6))
plt.bar(top_features, feature_importances.head(10)['importance'])
plt.title('Top 10 Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Re-evaluate the models
lr_mse, lr_r2 = evaluate_model(lr, X_train_scaled, X_test_scaled, y_train, y_test)
ridge_mse, ridge_r2 = evaluate_model(ridge, X_train_scaled, X_test_scaled, y_train, y_test)
lasso_mse, lasso_r2 = evaluate_model(lasso, X_train_scaled, X_test_scaled, y_train, y_test)

print("\
Model Evaluation Results after encoding:")
print(f"Linear Regression - MSE: {lr_mse:.2f}, R2: {lr_r2:.2f}")
print(f"Ridge Regression - MSE: {ridge_mse:.2f}, R2: {ridge_r2:.2f}")
print(f"Lasso Regression - MSE: {lasso_mse:.2f}, R2: {lasso_r2:.2f}")

# Feature selection using Decision Tree
dt.fit(X_train, y_train)
importances = dt.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

top_features = feature_importances.head(10)['feature'].tolist()

print("\
Top 10 features selected by Decision Tree:")
print(top_features)

# Fit Linear Regression with selected features
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

lr_selected_mse, lr_selected_r2 = evaluate_model(lr_selected, X_train_selected, X_test_selected, y_train, y_test)

print("\
Linear Regression with selected features:")
print(f"MSE: {lr_selected_mse:.2f}, R2: {lr_selected_r2:.2f}")

# Visualize feature importances
plt.figure(figsize=(12, 6))
plt.bar(top_features, feature_importances.head(10)['importance'])
plt.title('Top 10 Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode 'binnedInc' column
label_encoder = LabelEncoder()
df['binnedInc_encoded'] = label_encoder.fit_transform(df['binnedInc'])

# Drop the original 'binnedInc' and 'Geography' columns
df_encoded = df.drop(['binnedInc', 'Geography'], axis=1)

# Check the first few rows of the updated dataframe
print("Updated DataFrame with encoded 'binnedInc':")
print(df_encoded.head())

# Proceed with the analysis using the updated dataframe
# Separate features and target
X = df_encoded.drop('TARGET_deathRate', axis=1)
y = df_encoded['TARGET_deathRate']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Evaluate models again
lr_mse, lr_r2 = evaluate_model(lr, X_train_scaled, X_test_scaled, y_train, y_test)
ridge_mse, ridge_r2 = evaluate_model(ridge, X_train_scaled, X_test_scaled, y_train, y_test)
lasso_mse, lasso_r2 = evaluate_model(lasso, X_train_scaled, X_test_scaled, y_train, y_test)

print("\
Model Evaluation Results after encoding:")
print(f"Linear Regression - MSE: {lr_mse:.2f}, R2: {lr_r2:.2f}")
print(f"Ridge Regression - MSE: {ridge_mse:.2f}, R2: {ridge_r2:.2f}")
print(f"Lasso Regression - MSE: {lasso_mse:.2f}, R2: {lasso_r2:.2f}")

# Feature selection using Decision Tree
dt.fit(X_train, y_train)
importances = dt.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

top_features = feature_importances.head(10)['feature'].tolist()

print("\
Top 10 features selected by Decision Tree:")
print(top_features)

# Fit Linear Regression with selected features
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

lr_selected_mse, lr_selected_r2 = evaluate_model(lr_selected, X_train_selected, X_test_selected, y_train, y_test)

print("\
Linear Regression with selected features:")
print(f"MSE: {lr_selected_mse:.2f}, R2: {lr_selected_r2:.2f}")

# Visualize feature importances
plt.figure(figsize=(12, 6))
plt.bar(top_features, feature_importances.head(10)['importance'])
plt.title('Top 10 Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Updated DataFrame with encoded 'binnedInc':
   avgAnnCount  avgDeathsPerYear  TARGET_deathRate  incidenceRate  medIncome  \
0       1397.0               469             164.9          489.8      61898   
1        173.0                70             161.3          411.6      48127   
2        102.0                50             174.7          349.7      49348   
3        427.0               202             194.8          430.4      44243   
4         57.0                26             144.4          350.1      49955   

   popEst2015  povertyPercent  studyPerCap  MedianAge  MedianAgeMale  ...  \
0      260131            11.2   499.748204       39.3           36.9  ...   
1       43269            18.6    23.111234       33.0           32.2  ...   
2       21026            14.6    47.560164       45.0           44.0  ...   
3       75882            17.1   342.637253       42.8           42.2  ...   
4       10321            12.5     0.000000       48.3           47.8  ...   

   PctEmpPri

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values