In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import fetch_openml
X,y = fetch_openml('blood-transfusion-service-center', version=1,return_X_y=True)

print(X)

       V1    V2       V3    V4
0     2.0  50.0  12500.0  98.0
1     0.0  13.0   3250.0  28.0
2     1.0  16.0   4000.0  35.0
3     2.0  20.0   5000.0  45.0
4     1.0  24.0   6000.0  77.0
..    ...   ...      ...   ...
743  23.0   2.0    500.0  38.0
744  21.0   2.0    500.0  52.0
745  23.0   3.0    750.0  62.0
746  39.0   1.0    250.0  39.0
747  72.0   1.0    250.0  72.0

[748 rows x 4 columns]


In [3]:
#handling the missing values:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X=imputer.transform(X)

In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y)  #original version of dataset
model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7486631016042781


In [5]:
#in logistic regression, I want to catch a linear relation between features and my target.
#so I try to normalize? the values with square root function.
df=pd.DataFrame(X)

V1list = df[0].values
for i in range(len(V1list)): V1list[i] = math.sqrt(V1list[i])

V2list = df[1].values
for i in range(len(V2list)): V2list[i] = math.sqrt(V2list[i])
    
V3list = df[2].values
for i in range(len(V3list)): V3list[i] = math.sqrt(V3list[i])

V4list = df[3].values
for i in range(len(V4list)): V4list[i] = math.sqrt(V4list[i])

In [6]:
X_train,X_test,y_train,y_test=train_test_split(df,y) #after 'the square root' operation
model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7647058823529411


In [7]:
cor_matrix = df.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
print(to_drop)

[2]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


In [8]:
df1 = df.drop(df.columns[to_drop], axis=1)
print(df1)

            0         1         3
0    1.414214  7.071068  9.899495
1    0.000000  3.605551  5.291503
2    1.000000  4.000000  5.916080
3    1.414214  4.472136  6.708204
4    1.000000  4.898979  8.774964
..        ...       ...       ...
743  4.795832  1.414214  6.164414
744  4.582576  1.414214  7.211103
745  4.795832  1.732051  7.874008
746  6.244998  1.000000  6.244998
747  8.485281  1.000000  8.485281

[748 rows x 3 columns]


In [9]:
X_train,X_test,y_train,y_test=train_test_split(df1,y)  #after dropping correlated features
model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7700534759358288


In [10]:
#first I handled the missing values
#Then I tried to make a more linear relation between features and the target
#Last, I dropped the correlated features