# Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
 #Import the original dataset
df = pd.read_csv('dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [3]:
df.describe(include = 'all')

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


In [4]:
df_noRBP = pd.read_csv('dataset.csv')
df_noRBP = df.drop(df[df.RestingBP == 0].index)
df_noRBP.info()
#Drop the only row that has no RBP so now we try to predict the values of the missing cholesterol data.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 917 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             917 non-null    int64  
 1   Sex             917 non-null    object 
 2   ChestPainType   917 non-null    object 
 3   RestingBP       917 non-null    int64  
 4   Cholesterol     917 non-null    int64  
 5   FastingBS       917 non-null    int64  
 6   RestingECG      917 non-null    object 
 7   MaxHR           917 non-null    int64  
 8   ExerciseAngina  917 non-null    object 
 9   Oldpeak         917 non-null    float64
 10  ST_Slope        917 non-null    object 
 11  HeartDisease    917 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 93.1+ KB


In [5]:
#Transform the categorical data in binary features¶
df_dum = pd.get_dummies(df_noRBP)

Let's use a linear regression to predict the missing values of cholesterol.


In [6]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
df_dum.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


In [7]:
testdf_dum = df_dum[df_dum['Cholesterol']==0] #The ones we need to predict
traindf_dum = df_dum[df_dum['Cholesterol']!=0] #The ones used for training
y = traindf_dum['Cholesterol'] #cholesterol values
traindf_dum.drop("Cholesterol",axis=1,inplace=True)
lr.fit(traindf_dum,y) #Fit available cholesterol with it's data
#traindf_dum.info()
#testdf_dum.info()
testdf_dum.drop("Cholesterol",axis=1,inplace=True)
pred = lr.predict(testdf_dum) # Predict the missing values
testdf_dum.info()
testdf_dum['Cholesterol']= pred #Assign the new values creating again the column Cholesterol
traindf_dum['Cholesterol']= y # Assing again the old cholesterol values 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 293 to 536
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                171 non-null    int64  
 1   RestingBP          171 non-null    int64  
 2   FastingBS          171 non-null    int64  
 3   MaxHR              171 non-null    int64  
 4   Oldpeak            171 non-null    float64
 5   HeartDisease       171 non-null    int64  
 6   Sex_F              171 non-null    uint8  
 7   Sex_M              171 non-null    uint8  
 8   ChestPainType_ASY  171 non-null    uint8  
 9   ChestPainType_ATA  171 non-null    uint8  
 10  ChestPainType_NAP  171 non-null    uint8  
 11  ChestPainType_TA   171 non-null    uint8  
 12  RestingECG_LVH     171 non-null    uint8  
 13  RestingECG_Normal  171 non-null    uint8  
 14  RestingECG_ST      171 non-null    uint8  
 15  ExerciseAngina_N   171 non-null    uint8  
 16  ExerciseAngina_Y   171 n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf_dum['Cholesterol']= pred #Assign the new values creating again the column Cholesterol
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [8]:
#testdf_dum.info()
#traindf_dum.info() 
result = pd.concat([testdf_dum, traindf_dum], ignore_index=True, sort=False) # concatenate both charts, the one with predicted
                                                                             # values and the old known values
result.info()
result.head()
result.tail()
#df_dum.info() #initial data with 0's in cholesterol

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917 entries, 0 to 916
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                917 non-null    int64  
 1   RestingBP          917 non-null    int64  
 2   FastingBS          917 non-null    int64  
 3   MaxHR              917 non-null    int64  
 4   Oldpeak            917 non-null    float64
 5   HeartDisease       917 non-null    int64  
 6   Sex_F              917 non-null    uint8  
 7   Sex_M              917 non-null    uint8  
 8   ChestPainType_ASY  917 non-null    uint8  
 9   ChestPainType_ATA  917 non-null    uint8  
 10  ChestPainType_NAP  917 non-null    uint8  
 11  ChestPainType_TA   917 non-null    uint8  
 12  RestingECG_LVH     917 non-null    uint8  
 13  RestingECG_Normal  917 non-null    uint8  
 14  RestingECG_ST      917 non-null    uint8  
 15  ExerciseAngina_N   917 non-null    uint8  
 16  ExerciseAngina_Y   917 non

Unnamed: 0,Age,RestingBP,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,...,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Cholesterol
912,45,110,0,132,1.2,1,0,1,0,0,...,1,0,1,0,1,0,0,1,0,264.0
913,68,144,1,141,3.4,1,0,1,1,0,...,0,0,1,0,1,0,0,1,0,193.0
914,57,130,0,115,1.2,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,131.0
915,57,130,0,174,0.0,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,236.0
916,38,138,0,173,0.0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,1,175.0


Now let's find out the accuracy

In [9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,    
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression()

In [10]:
print(result)
result.info()
y = result['HeartDisease']
result.drop("HeartDisease",axis=1,inplace=True)
X_train, X_test,y_train,y_test = train_test_split(result,y,test_size=0.3)
lr = LogisticRegression(max_iter=50000,solver='lbfgs')
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(metrics.accuracy_score(pred,y_test)) 

     Age  RestingBP  FastingBS  MaxHR  Oldpeak  HeartDisease  Sex_F  Sex_M  \
0     65        115          0     93      0.0             1      0      1   
1     32         95          1    127      0.7             1      0      1   
2     61        105          1    110      1.5             1      0      1   
3     50        145          1    139      0.7             1      0      1   
4     57        110          1    131      1.4             1      0      1   
..   ...        ...        ...    ...      ...           ...    ...    ...   
912   45        110          0    132      1.2             1      0      1   
913   68        144          1    141      3.4             1      0      1   
914   57        130          0    115      1.2             1      0      1   
915   57        130          0    174      0.0             1      1      0   
916   38        138          0    173      0.0             0      0      1   

     ChestPainType_ASY  ChestPainType_ATA  ...  ChestPainType_T

Now we're going to predict de missing restingRBP value, it's only on one row. I'll do it without the rows that have 0 colesterol, for the moment

In [11]:
 #Import the original dataset
df = pd.read_csv('dataset.csv')
#df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [12]:
df_0c = pd.read_csv('dataset.csv')
df_dum0c = df.drop(df[df.Cholesterol == 0].index) # delete the ones that have 0 cholesterol
df_dum0c = pd.get_dummies(df_0c)
df_dum0c.info() #746 entries in df_0c 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   Cholesterol        918 non-null    int64  
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   HeartDisease       918 non-null    int64  
 7   Sex_F              918 non-null    uint8  
 8   Sex_M              918 non-null    uint8  
 9   ChestPainType_ASY  918 non-null    uint8  
 10  ChestPainType_ATA  918 non-null    uint8  
 11  ChestPainType_NAP  918 non-null    uint8  
 12  ChestPainType_TA   918 non-null    uint8  
 13  RestingECG_LVH     918 non-null    uint8  
 14  RestingECG_Normal  918 non-null    uint8  
 15  RestingECG_ST      918 non-null    uint8  
 16  ExerciseAngina_N   918 non

In [13]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [14]:
testdf_dum0c = df_dum0c[df_dum0c['Cholesterol']==0] #The ones we need to predict
#print(testdf_dum0c)
traindf_dum0c = df_dum0c[df_dum0c['Cholesterol']!=0] #The ones used for training
#print(traindf_dum0c)
y = traindf_dum0c['Cholesterol']
#print(y)
#print(traindf_dum0c)
traindf_dum0c.drop("Cholesterol",axis=1,inplace=True)
#print(traindf_dum0c)
lr.fit(traindf_dum0c,y) #Fit available restingBP with it's data
#traindf_dum0c.info()
#testdf_dum0c.info()
testdf_dum0c.drop("Cholesterol",axis=1,inplace=True)
pred = lr.predict(testdf_dum0c) # Predict the missing values
#print(pred)
#testdf_dum0c.info()
testdf_dum0c['Cholesterol']= pred #Assign the new values creating again the column restingBP
traindf_dum0c['Cholesterol']= y
testdf_dum0c.info()
traindf_dum0c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 293 to 536
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                172 non-null    int64  
 1   RestingBP          172 non-null    int64  
 2   FastingBS          172 non-null    int64  
 3   MaxHR              172 non-null    int64  
 4   Oldpeak            172 non-null    float64
 5   HeartDisease       172 non-null    int64  
 6   Sex_F              172 non-null    uint8  
 7   Sex_M              172 non-null    uint8  
 8   ChestPainType_ASY  172 non-null    uint8  
 9   ChestPainType_ATA  172 non-null    uint8  
 10  ChestPainType_NAP  172 non-null    uint8  
 11  ChestPainType_TA   172 non-null    uint8  
 12  RestingECG_LVH     172 non-null    uint8  
 13  RestingECG_Normal  172 non-null    uint8  
 14  RestingECG_ST      172 non-null    uint8  
 15  ExerciseAngina_N   172 non-null    uint8  
 16  ExerciseAngina_Y   172 n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf_dum0c['Cholesterol']= pred #Assign the new values creating again the column restingBP
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [15]:
#testdf_dum0c.info() #good data
#traindf_dum0c.info() #good data
result0c = pd.concat([testdf_dum0c, traindf_dum0c], ignore_index=True, sort=False)
result0c.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   FastingBS          918 non-null    int64  
 3   MaxHR              918 non-null    int64  
 4   Oldpeak            918 non-null    float64
 5   HeartDisease       918 non-null    int64  
 6   Sex_F              918 non-null    uint8  
 7   Sex_M              918 non-null    uint8  
 8   ChestPainType_ASY  918 non-null    uint8  
 9   ChestPainType_ATA  918 non-null    uint8  
 10  ChestPainType_NAP  918 non-null    uint8  
 11  ChestPainType_TA   918 non-null    uint8  
 12  RestingECG_LVH     918 non-null    uint8  
 13  RestingECG_Normal  918 non-null    uint8  
 14  RestingECG_ST      918 non-null    uint8  
 15  ExerciseAngina_N   918 non-null    uint8  
 16  ExerciseAngina_Y   918 non

In [16]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,    
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression()

In [17]:
print(result0c)
result0c.info()
y = result0c['HeartDisease']
result0c.drop("HeartDisease",axis=1,inplace=True)
X_train, X_test,y_train,y_test = train_test_split(result0c,y,test_size=0.3)
lr = LogisticRegression(max_iter=50000,solver='lbfgs')
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(metrics.accuracy_score(pred,y_test)) 

     Age  RestingBP  FastingBS  MaxHR  Oldpeak  HeartDisease  Sex_F  Sex_M  \
0     65        115          0     93      0.0             1      0      1   
1     32         95          1    127      0.7             1      0      1   
2     61        105          1    110      1.5             1      0      1   
3     50        145          1    139      0.7             1      0      1   
4     57        110          1    131      1.4             1      0      1   
..   ...        ...        ...    ...      ...           ...    ...    ...   
913   45        110          0    132      1.2             1      0      1   
914   68        144          1    141      3.4             1      0      1   
915   57        130          0    115      1.2             1      0      1   
916   57        130          0    174      0.0             1      1      0   
917   38        138          0    173      0.0             0      0      1   

     ChestPainType_ASY  ChestPainType_ATA  ...  ChestPainType_T

In [18]:
print(np.sum(np.asarray(result0c["Cholesterol"] == 0)))

0


In [19]:
result0c.describe(include = 'all')

Unnamed: 0,Age,RestingBP,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Cholesterol
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,0.233115,136.809368,0.887364,0.21024,0.78976,0.540305,0.188453,0.221133,0.050109,0.204793,0.601307,0.1939,0.595861,0.404139,0.068627,0.501089,0.430283,244.583359
std,9.432617,18.514154,0.423046,25.460334,1.06657,0.407701,0.407701,0.498645,0.391287,0.415236,0.218289,0.40377,0.489896,0.395567,0.490992,0.490992,0.252957,0.500271,0.495386,53.597055
min,28.0,0.0,0.0,60.0,-2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0
25%,47.0,120.0,0.0,120.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,214.0
50%,54.0,130.0,0.0,138.0,0.6,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,241.656063
75%,60.0,140.0,0.0,156.0,1.5,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,268.0
max,77.0,200.0,1.0,202.0,6.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,603.0
