# Train Test Split

In [41]:
# import library
import pandas as pd

In [42]:
# read data: load the diabetes CSV from the YBI Foundation dataset repository on GitHub
diabetes = pd.read_csv(
    'https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv'
)

In [43]:
# display first 5 rows
diabetes.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [44]:
# display columns
diabetes.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [45]:
# define target (y): the 'diabetes' column, selected as a single pandas Series
y = diabetes['diabetes']

In [46]:
# define features (X): every column of the frame except the target
X = diabetes.drop(columns=['diabetes'])

In [47]:
# define features (X) a second way, by naming the columns explicitly;
# this picks the same eight columns that remain after dropping 'diabetes'
X = diabetes[[
    'pregnancies', 'glucose', 'diastolic', 'triceps',
    'insulin', 'bmi', 'dpf', 'age',
]]

In [48]:
# import train test split function
from sklearn.model_selection import train_test_split

In [49]:
# 70% of the rows go to train, the rest to test;
# stratify=y keeps the class ratio of y the same in both parts,
# and random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7,
    random_state=2529,
    stratify=y,
)

In [50]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((537, 8), (231, 8), (537,), (231,))

In [51]:
X_train

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
377,1,87,60,37,75,37.2,0.509,22
393,4,116,72,12,87,22.1,0.463,37
662,8,167,106,46,231,37.6,0.165,43
367,0,101,64,17,0,21.0,0.252,21
233,4,122,68,0,0,35.0,0.394,29
...,...,...,...,...,...,...,...,...
689,1,144,82,46,180,46.1,0.335,46
730,3,130,78,23,79,28.4,0.323,34
464,10,115,98,0,0,24.0,1.022,34
223,7,142,60,33,190,28.8,0.687,61


# Simple Linear Regression


In [52]:
# Step 1 : import library
import pandas as pd

In [53]:
# Step 2 : import data (salary CSV read straight from GitHub)
salary = pd.read_csv(
    'https://github.com/ybifoundation/Dataset/raw/main/Salary%20Data.csv'
)

In [54]:
# Step 3 : define target (y) and features (X)
salary.columns

Index(['Experience Years', 'Salary'], dtype='object')

In [55]:
# target: the 'Salary' column as a Series
y = salary['Salary']
# features: double brackets keep X a one-column DataFrame (2-D), not a Series
X = salary[['Experience Years']]

In [56]:
# Step 4 : train test split (70% train / 30% test, fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=2529,
    train_size=0.7,
)

In [57]:
# check shape of train and test sample
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((28, 1), (12, 1), (28,), (12,))

In [58]:
# Step 5 : select model (scikit-learn LinearRegression with default settings)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [59]:
# Step 6 : fit the model on the training portion only
model.fit(X=X_train, y=y_train)

In [60]:
model.intercept_

26596.961311068262

In [61]:
model.coef_

array([9405.61663234])

In [62]:
# Step 7 : predict salaries for the held-out test features
y_pred = model.predict(X=X_test)
# show the predictions as the cell output
y_pred

array([ 90555.15441095,  59516.61952424, 106544.70268592,  64219.42784041,
        68922.23615658, 123474.81262412,  84911.78443155,  63278.86617718,
        65159.98950364,  61397.74285071,  37883.70126987,  50111.00289191])

In [63]:
# Step 8 : model evaluation (these are regression error metrics, not an accuracy score)
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [64]:
# mean absolute error on the test set, in the same units as the target
mean_absolute_error(y_true=y_test, y_pred=y_pred)

4005.9263101681768

In [65]:
# mean absolute percentage error, returned as a fraction rather than a percent
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

0.06384602996141632

In [66]:
# mean squared error on the test set, in squared target units
mean_squared_error(y_true=y_test, y_pred=y_pred)

24141421.671440993

# Multiple Linear Regression

In [67]:
# Step 1 : import library
import pandas as pd

In [68]:
# Step 2 : import data (Boston housing CSV read straight from GitHub)
house = pd.read_csv(
    'https://github.com/YBIFoundation/Dataset/raw/main/Boston.csv'
)

In [69]:
# Step 3 : define target (y) and features (X)
house.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [70]:
# features: all columns other than the target 'MEDV'
X = house.drop(columns=['MEDV'])
# target: the 'MEDV' column as a Series
y = house['MEDV']

In [71]:
# Step 4 : train test split (70% train / 30% test, fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=2529,
    train_size=0.7,
)

In [72]:
# check shape of train and test sample
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((354, 13), (152, 13), (354,), (152,))

In [73]:
# Step 5 : select model (scikit-learn LinearRegression with default settings)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [74]:
# Step 6 : fit the model on the training portion only
model.fit(X=X_train, y=y_train)

In [75]:
model.intercept_

34.21916368862993

In [76]:
model.coef_

array([-1.29412069e-01,  3.65184937e-02,  1.54418944e-02,  2.35486887e+00,
       -2.04171489e+01,  4.41356565e+00,  4.61075512e-03, -1.58626723e+00,
        2.51478665e-01, -9.59591213e-03, -9.64169204e-01,  1.00972679e-02,
       -5.43198745e-01])

In [77]:
# Step 7 : predict MEDV for the held-out test features
y_pred = model.predict(X=X_test)
# show the predictions as the cell output
y_pred

array([31.71733828, 22.02143302, 21.16613197, 39.77837246, 20.10258512,
       22.86056216, 18.35574643, 14.7902735 , 22.55778646, 21.34594953,
       18.38491085, 27.9664665 , 29.85929012,  6.44680773, 10.68297311,
       26.24809521, 21.89368671, 25.22692365,  3.62385942, 36.21920372,
       24.07812335, 22.94103934, 14.27095261, 20.79013279, 24.22725035,
       16.7379611 , 18.74856986, 20.96709658, 28.513571  , 20.86346628,
        9.23450577, 17.06754852, 22.06953886, 22.23121875, 39.25875323,
       26.16769924, 42.50354003, 19.34517962, 34.51869058, 14.07023676,
       13.81055358, 23.27727535, 11.79100403,  9.01040731, 21.64587594,
       25.55339317, 18.16941728, 16.81991401, 14.66170215, 14.86477172,
       33.78924259, 33.26959074, 15.49208778, 24.08269034, 27.63531226,
       19.58288727, 45.02488529, 20.96959671, 20.07202649, 27.67146866,
       34.59154418, 12.71353064, 23.66247812, 31.65792337, 28.97459925,
       32.45963484, 13.93494747, 35.491924  , 19.35871482, 19.60

In [78]:
# Step 8 : model evaluation with regression error metrics
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)

# mean absolute error on the test set, in the same units as the target
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.155030927602485

In [79]:
# mean absolute percentage error, returned as a fraction rather than a percent
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

0.1635593588221789

In [80]:
# mean squared error on the test set, in squared target units
mean_squared_error(y_true=y_test, y_pred=y_pred)

20.718012877838433

# Binary Classification

In [None]:
# Step 1 : import library
import pandas as pd

In [None]:
# Step 2 : import data (diabetes CSV read straight from GitHub)
diabetes = pd.read_csv(
    'https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv'
)

In [None]:
diabetes.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
diabetes.describe()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Step 3 : define target (y) and features (X)
diabetes.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [None]:
# NOTE(review): describe() reports a minimum of 0 for glucose, diastolic,
# triceps, insulin and bmi; those zeros are presumably placeholders for
# missing measurements — confirm before relying on the fitted coefficients.

# features: all columns other than the target 'diabetes'
X = diabetes.drop(columns=['diabetes'])
# target: the 'diabetes' column as a Series
y = diabetes['diabetes']

In [None]:
# Step 4 : train test split (70% train / 30% test, fixed seed for repeatability)
# NOTE(review): this split is not stratified, whereas the "Train Test Split"
# section splits the same dataset with stratify=y; consider whether the class
# ratio should be preserved here too (recorded outputs would need a re-run).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=2529,
    train_size=0.7,
)

In [None]:
# check shape of train and test sample
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((537, 8), (231, 8), (537,), (231,))

In [None]:
# Step 5 : select model (scikit-learn LogisticRegression classifier)
from sklearn.linear_model import LogisticRegression
# max_iter=500 sets the solver's iteration cap explicitly;
# presumably raised so the fit converges on these unscaled features — TODO confirm
model = LogisticRegression(max_iter=500)

In [None]:
# Step 6 : fit the classifier on the training portion only
model.fit(X=X_train, y=y_train)

In [None]:
model.intercept_

array([-8.13081966])

In [None]:
model.coef_

array([[ 1.01223082e-01,  3.60559605e-02, -2.09731012e-02,
        -2.57391062e-03, -2.04496125e-04,  8.24702415e-02,
         9.51115023e-01,  2.53544397e-02]])

In [None]:
# Step 7 : predict class labels for the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
y_pred

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1])

In [None]:
# Step 8 : model accuracy
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# confusion matrix: rows are the actual classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[133,  12],
       [ 41,  45]])

In [None]:
# fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7705627705627706

In [None]:
# per-class precision / recall / f1; printed so the text table keeps its alignment
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.92      0.83       145
           1       0.79      0.52      0.63        86

    accuracy                           0.77       231
   macro avg       0.78      0.72      0.73       231
weighted avg       0.77      0.77      0.76       231



# Multi Category Classification

In [22]:
# Step 1 : import library
import pandas as pd

In [23]:
# Step 2 : import data (iris CSV read straight from GitHub)
iris = pd.read_csv(
    'https://github.com/YBIFoundation/Dataset/raw/main/IRIS.csv'
)

In [24]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [26]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [27]:
# Step 3 : define target (y) and features (X)
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [28]:
# features: the four numeric measurement columns (everything except 'species')
X = iris.drop(columns=['species'])
# target: the 'species' label column as a Series
y = iris['species']


In [29]:
# Step 4 : train test split (70% train / 30% test, fixed seed for repeatability)
# NOTE(review): the split is not stratified, and the recorded classification
# report shows an uneven test set (14 / 9 / 22 rows per species); consider
# stratify=y if balanced evaluation matters (recorded outputs would need a re-run).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=2529,
    train_size=0.7,
)

In [30]:
# check shape of train and test sample
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((105, 4), (45, 4), (105,), (45,))

In [31]:
# Step 5 : select model (scikit-learn LogisticRegression classifier)
from sklearn.linear_model import LogisticRegression
# max_iter=500 sets the solver's iteration cap explicitly;
# presumably raised so the fit converges — TODO confirm
model = LogisticRegression(max_iter=500)

In [32]:
# Step 6 : fit the classifier on the training portion only
model.fit(X=X_train, y=y_train)

In [33]:
model.intercept_

array([  8.84567183,   1.90078065, -10.74645248])

In [34]:
model.coef_


array([[-0.40580785,  0.91752902, -2.30019191, -0.98597091],
       [ 0.40917695, -0.2375293 , -0.09285435, -0.75620424],
       [-0.0033691 , -0.67999971,  2.39304626,  1.74217515]])

In [35]:
# Step 7 : predict species labels for the held-out test features
y_pred = model.predict(X=X_test)

In [36]:
y_pred

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica'], dtype=object)

In [37]:
# Step 8 : model accuracy
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [38]:
# confusion matrix: rows are the actual species, columns the predicted species
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[14,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 21]])

In [39]:
# fraction of test rows whose predicted species matches the true species
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9777777777777777

In [40]:
# per-class precision / recall / f1; printed so the text table keeps its alignment
print(classification_report(y_true=y_test, y_pred=y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.90      1.00      0.95         9
 Iris-virginica       1.00      0.95      0.98        22

       accuracy                           0.98        45
      macro avg       0.97      0.98      0.97        45
   weighted avg       0.98      0.98      0.98        45

