<a href="https://colab.research.google.com/github/Hassan-293/Predict-Blood-Donation-for-Future-Expectancy/blob/main/Predict_Blood_Donation_for_Future_Expectancy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd 
from scipy import stats
from sklearn import preprocessing
# for min_max scaling
from mlxtend.preprocessing import minmax_scaling
# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

**-----Data Inspection----**

In [6]:
df = pd.read_csv("transfusion.data")
df.head(750)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


**Data Cleaning**

In [8]:
Null_Counts = df.isnull().sum()
print("The missing values in the each column are: ", Null_Counts)

The missing values in the each column are:  Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64


In [9]:
Recency = df['Recency (months)'].unique()
Recency.sort()
Recency

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 20, 21, 22, 23, 25, 26, 35, 38, 39, 40, 72, 74])

In [10]:
Monetary = df['Monetary (c.c. blood)'].unique()
Monetary.sort()
Monetary

array([  250,   500,   750,  1000,  1250,  1500,  1750,  2000,  2250,
        2500,  2750,  3000,  3250,  3500,  3750,  4000,  4250,  4500,
        4750,  5000,  5250,  5500,  5750,  6000,  6500,  8250,  8500,
        9500, 10250, 10750, 11000, 11500, 12500])

In [11]:
Frequency = df['Frequency (times)'].unique()
Frequency.sort()
Frequency

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 26, 33, 34, 38, 41, 43, 44, 46, 50])

In [12]:
Time = df['Time (months)'].unique()
Time.sort()
Time

array([ 2,  3,  4,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23,
       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59,
       60, 61, 62, 63, 64, 65, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
       81, 82, 83, 86, 87, 88, 89, 93, 95, 98])

In [16]:
Target = df['target'].unique()
Target.sort()
Target

array([0, 1])

  **Creating Target Column**

In [15]:
df.rename(
    columns={'whether he/she donated blood in March 2007':'target'},
    inplace=True
)
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


**Data Split for Training and Testing**

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train,Y_test= train_test_split(
    df.drop(columns='target'),
    df.target,
    test_size=0.25,
    random_state=42,
    stratify=df.target

)
X_train.head(5)


Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26
116,2,7,1750,46
661,16,2,500,16
154,2,1,250,2
