# Task-3 Performing different regressions and using a new loss function

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('cancer_reg.csv', encoding='iso-8859-1')
print(df.head())

   avgAnnCount  avgDeathsPerYear  TARGET_deathRate  incidenceRate  medIncome  \
0       1397.0               469             164.9          489.8      61898   
1        173.0                70             161.3          411.6      48127   
2        102.0                50             174.7          349.7      49348   
3        427.0               202             194.8          430.4      44243   
4         57.0                26             144.4          350.1      49955   

   popEst2015  povertyPercent  studyPerCap           binnedInc  MedianAge  \
0      260131            11.2   499.748204   (61494.5, 125635]       39.3   
1       43269            18.6    23.111234  (48021.6, 51046.4]       33.0   
2       21026            14.6    47.560164  (48021.6, 51046.4]       45.0   
3       75882            17.1   342.637253    (42724.4, 45201]       42.8   
4       10321            12.5     0.000000  (48021.6, 51046.4]       48.3   

   ...  PctPrivateCoverageAlone  PctEmpPrivCoverage PctP

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried          

In [4]:
df.shape

(3047, 34)

In [5]:
df.describe()

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,...,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
count,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,...,2438.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0
mean,606.338544,185.965868,178.664063,448.268586,47063.281917,102637.4,16.878175,155.399415,45.272333,39.570725,...,48.453774,41.196324,36.252642,19.240072,83.645286,9.107978,1.253965,1.983523,51.243872,5.640306
std,1416.356223,504.134286,27.751511,54.560733,12040.090836,329059.2,6.409087,529.628366,45.30448,5.226017,...,10.083006,9.447687,7.841741,6.113041,16.380025,14.534538,2.610276,3.51771,6.572814,1.985816
min,6.0,3.0,59.7,201.3,22640.0,827.0,3.2,0.0,22.3,22.4,...,15.7,13.5,11.2,2.6,10.199155,0.0,0.0,0.0,22.99249,0.0
25%,76.0,28.0,161.2,420.3,38882.5,11684.0,12.15,0.0,37.7,36.35,...,41.0,34.5,30.9,14.85,77.29618,0.620675,0.254199,0.295172,47.763063,4.521419
50%,171.0,61.0,178.1,453.549422,45207.0,26643.0,15.9,0.0,41.0,39.6,...,48.7,41.1,36.3,18.8,90.059774,2.247576,0.549812,0.826185,51.669941,5.381478
75%,518.0,149.0,195.2,480.85,52492.0,68671.0,20.4,83.650776,44.0,42.5,...,55.6,47.7,41.55,23.1,95.451693,10.509732,1.221037,2.17796,55.395132,6.493677
max,38150.0,14010.0,362.8,1206.9,125635.0,10170290.0,47.4,9762.308998,624.0,64.7,...,78.9,70.7,65.1,46.6,100.0,85.947799,42.619425,41.930251,78.075397,21.326165


In [6]:
df.dtypes

avgAnnCount                float64
avgDeathsPerYear             int64
TARGET_deathRate           float64
incidenceRate              float64
medIncome                    int64
popEst2015                   int64
povertyPercent             float64
studyPerCap                float64
binnedInc                   object
MedianAge                  float64
MedianAgeMale              float64
MedianAgeFemale            float64
Geography                   object
AvgHouseholdSize           float64
PercentMarried             float64
PctNoHS18_24               float64
PctHS18_24                 float64
PctSomeCol18_24            float64
PctBachDeg18_24            float64
PctHS25_Over               float64
PctBachDeg25_Over          float64
PctEmployed16_Over         float64
PctUnemployed16_Over       float64
PctPrivateCoverage         float64
PctPrivateCoverageAlone    float64
PctEmpPrivCoverage         float64
PctPublicCoverage          float64
PctPublicCoverageAlone     float64
PctWhite            

In [7]:
df.isnull().sum()

avgAnnCount                   0
avgDeathsPerYear              0
TARGET_deathRate              0
incidenceRate                 0
medIncome                     0
popEst2015                    0
povertyPercent                0
studyPerCap                   0
binnedInc                     0
MedianAge                     0
MedianAgeMale                 0
MedianAgeFemale               0
Geography                     0
AvgHouseholdSize              0
PercentMarried                0
PctNoHS18_24                  0
PctHS18_24                    0
PctSomeCol18_24            2285
PctBachDeg18_24               0
PctHS25_Over                  0
PctBachDeg25_Over             0
PctEmployed16_Over          152
PctUnemployed16_Over          0
PctPrivateCoverage            0
PctPrivateCoverageAlone     609
PctEmpPrivCoverage            0
PctPublicCoverage             0
PctPublicCoverageAlone        0
PctWhite                      0
PctBlack                      0
PctAsian                      0
PctOther

In [8]:
#df.drop(columns='PctSomeCol18_24',inplace=True)

In [9]:
numeric_df = df.select_dtypes(include=[np.number])
numeric_df.fillna(numeric_df.mean(), inplace=True)

In [10]:
y = numeric_df['TARGET_deathRate']
x = numeric_df.drop(columns=['TARGET_deathRate'])

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
linear_reg = LinearRegression()
linear_reg.fit(x_train_scaled, y_train)
linear_pred = linear_reg.predict(x_test_scaled)
linear_mse = mean_squared_error(y_test, linear_pred)
linear_r2 = r2_score(y_test, linear_pred)
print("Linear Regression MSE:", linear_mse)
print("Linear Regression R^2:", linear_r2)

Linear Regression MSE: 413.7612858444205
Linear Regression R^2: 0.49433098160932554


In [14]:
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(x_train_scaled, y_train)
lasso_pred = lasso_reg.predict(x_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)
print("Lasso Regression MSE:", lasso_mse)
print("Lasso Regression R^2:", lasso_r2)

Lasso Regression MSE: 411.74639749259114
Lasso Regression R^2: 0.49679343193973047


In [15]:
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(x_train_scaled, y_train)
ridge_pred = ridge_reg.predict(x_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)
print("Ridge Regression MSE:", ridge_mse)
print("Ridge Regression R^2:", ridge_r2)

Ridge Regression MSE: 413.6491527054773
Ridge Regression R^2: 0.4944680225946433


In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import HuberRegressor

In [17]:
huber_regressor = HuberRegressor()
huber_regressor.fit(x_train_scaled, y_train)

In [18]:
importances = np.abs(huber_regressor.coef_)

In [19]:
threshold = 0.01
important_features = x.columns[importances > threshold]

In [20]:
x_train_important = x_train_scaled[:, importances > threshold]
x_test_important = x_test_scaled[:, importances > threshold]

In [21]:
linear_reg_important = LinearRegression()
linear_reg_important.fit(x_train_important, y_train)

In [22]:
important_pred = linear_reg_important.predict(x_test_important)
important_mse = mean_squared_error(y_test, important_pred)
important_r2 = r2_score(y_test, important_pred)

In [23]:
print("Selected Features:", important_features)
print("\nLinear Regression with Selected Features MSE:", important_mse)
print("Linear Regression with Selected Features R^2:", important_r2)

Selected Features: Index(['avgAnnCount', 'avgDeathsPerYear', 'incidenceRate', 'medIncome',
       'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale', 'AvgHouseholdSize',
       'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctSomeCol18_24',
       'PctBachDeg18_24', 'PctHS25_Over', 'PctBachDeg25_Over',
       'PctEmployed16_Over', 'PctUnemployed16_Over', 'PctPrivateCoverage',
       'PctPrivateCoverageAlone', 'PctEmpPrivCoverage', 'PctPublicCoverage',
       'PctPublicCoverageAlone', 'PctWhite', 'PctBlack', 'PctAsian',
       'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate'],
      dtype='object')

Linear Regression with Selected Features MSE: 413.7612858444205
Linear Regression with Selected Features R^2: 0.49433098160932554
