## Import Libraries

In [1]:
import pandas as pd

## Get Raw Data

In [18]:
rawdata=pd.read_csv("rawdata.csv")
rawdata.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [3]:
print(rawdata.size)
print(rawdata.shape)

160
(32, 5)


In [4]:
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


## Drop Empty Cells

In [5]:
rawdata.isna().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [6]:
rawdata.dropna(inplace=True)

In [7]:
print(rawdata.shape)
rawdata.isna().sum()

(29, 5)


Duration    0
Date        0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

## Replace Empty Values with Specific Values

Getting new data

In [9]:
rawdata.Calories.mean()

304.68

In [10]:
rawdata.Calories.fillna(rawdata.Calories.mean(),inplace=True)

In [11]:
rawdata.isna().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [12]:
rawdata.Date.fillna("2020/12/21",inplace=True)

In [13]:
rawdata.isna().sum()

Duration    0
Date        0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

## Dealing with wrong Data

Getting new Data

In [14]:
rawdata.Duration[rawdata["Duration"]>120].size

1

In [15]:
rawdata.Duration[rawdata["Duration"]>120]=120

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rawdata.Duration[rawdata["Duration"]>120]=120


In [16]:
rawdata.Duration[rawdata["Duration"]>120].size

0

In [17]:
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      32 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  32 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


## Removing Duplicated Values

Getting new Data

In [19]:
rawdata.duplicated().sum()

1

In [20]:
rawdata.drop_duplicates(inplace=True)

In [21]:
rawdata.duplicated().sum()

0

## Exporting Dataframe

In [None]:
rawdata.to_csv("updated_rawdata.csv")

# Post Lab Task

## Importing Libraries

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [76]:
diabetes=pd.read_csv("diabetes.csv")
diabetes.shape

(768, 9)

In [77]:
diabetes.isna().size

6912

In [78]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [79]:
diabetes.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [80]:
diabetes.duplicated().sum()

0

## Normalization

We are applying z-score standarderization

In [81]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
diabetes[["Pregnancies","Glucose","BloodPressure","SkinThickness"]]=scaler.fit_transform(diabetes[["Pregnancies","Glucose","BloodPressure","SkinThickness"]])
diabetes[["Insulin","BMI","DiabetesPedigreeFunction","Age"]]=scaler.fit_transform(diabetes[["Insulin","BMI","DiabetesPedigreeFunction","Age"]])
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1


## Applying ML Algorithm

### Spliting Data

In [82]:
y=diabetes.Outcome
diabetes.drop(columns=["Outcome"],axis=1,inplace=True)
# Split Data into a training set and test set
X_train,X_test,y_train,y_test = train_test_split(diabetes,y,test_size=0.3 ,random_state=26)
#Note: Here x includes the features or independent variables where y includes the dependent variable variables or label.

In [85]:
# Fitting with Logistic Regression
lr = LogisticRegression(max_iter=100)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [84]:
# Compute Model Accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 79.22077922077922
