In [5]:
import pandas as pd 
kyphosis = pd.read_csv('../Cases/Kyphosis/Kyphosis.csv')
kyphosis.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [6]:
X, y = kyphosis.drop('Kyphosis', axis=1), kyphosis['Kyphosis']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.76


### Optimization (solver, penalties, l1_ratio)

In [22]:
from tqdm import tqdm
import numpy as np
penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
        lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        scores.append([penalties[i], r, accuracy_score(y_test, y_pred)])

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.98it/s]


In [26]:
df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'accuracy'])
df_scores.sort_values('accuracy').iloc[0]

penalties       l2
l1_ratio     0.001
accuracy       0.8
Name: 0, dtype: object

### Human Resources dataset

In [28]:
hr = pd.read_csv('../Cases/human-resources-analytics/HR_comma_sep.csv')
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [29]:
X, y = kyphosis.drop('Kyphosis', axis=1), kyphosis['Kyphosis']
X = pd.get_dummies(X)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

### Standard Scaling

In [36]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
        lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r)
        pipe = Pipeline([('scl', scaler), ('lr', lr)])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        scores.append([penalties[i], r, accuracy_score(y_test, y_pred)])

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 43.12it/s]


In [32]:
df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'accuracy'])
df_scores.sort_values('accuracy').iloc[0]

penalties       l2
l1_ratio     0.001
accuracy      0.76
Name: 0, dtype: object

### MinMax Scaling

In [35]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
        lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r)
        pipe = Pipeline([('scl', scaler), ('lr', lr)])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        scores.append([penalties[i], r, accuracy_score(y_test, y_pred)])

df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'accuracy'])
df_scores.sort_values('accuracy').iloc[0]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 43.99it/s]


penalties      l1
l1_ratio      1.0
accuracy     0.72
Name: 9, dtype: object

#### Glass Identification dataset

In [43]:
glass = pd.read_csv('../Cases/Glass Identification/Glass.csv')
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [47]:
X, y= glass.drop('Type', axis=1), glass['Type']
# y = pd.get_dummies(y)

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [55]:
import warnings
warnings.filterwarnings('ignore')

#### MinMax Scaler

In [56]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
multi_class = ['ovr', 'multinomial']
penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        for m in multi_class:
            # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
            
            lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r, multi_class=m, random_state=24)
            pipe = Pipeline([('scl', scaler), ('lr', lr)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            scores.append([penalties[i], r, m, accuracy_score(y_test, y_pred)])

df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'multi-class', 'accuracy'])
df_scores.sort_values('accuracy', ascending=False).iloc[0]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  6.59it/s]


penalties             None
l1_ratio               1.0
multi-class    multinomial
accuracy               0.6
Name: 39, dtype: object

In [57]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
multi_class = ['ovr', 'multinomial']
penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        for m in multi_class:
            # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
            
            lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r, multi_class=m, random_state=24)
            pipe = Pipeline([('scl', scaler), ('lr', lr)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            scores.append([penalties[i], r, m, accuracy_score(y_test, y_pred)])

df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'multi-class', 'accuracy'])
df_scores.sort_values('accuracy', ascending=False).iloc[0]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.70it/s]


penalties             None
l1_ratio               1.0
multi-class    multinomial
accuracy               0.6
Name: 39, dtype: object

### Image Segmentation

In [64]:
image = pd.read_csv('../Cases/Image Segmentation/Image_Segmentation.csv')
image.head()

Unnamed: 0,Class,region.centroid.col,region.centroid.row,region.pixel.count,short.line.density.5,short.line.density.2,vedge.mean,vegde.sd,hedge.mean,hedge.sd,intensity.mean,rawred.mean,rawblue.mean,rawgreen.mean,exred.mean,exblue.mean,exgreen.mean,value.mean,saturation.mean,hue-mean
0,BRICKFACE,188,133,9,0.0,0.0,0.333333,0.266667,0.5,0.077778,6.666666,8.333334,7.777778,3.888889,5.0,3.333333,-8.333333,8.444445,0.53858,-0.924817
1,BRICKFACE,105,139,9,0.0,0.0,0.277778,0.107407,0.833333,0.522222,6.111111,7.555555,7.222222,3.555556,4.333334,3.333333,-7.666666,7.555555,0.532628,-0.965946
2,BRICKFACE,34,137,9,0.0,0.0,0.5,0.166667,1.111111,0.474074,5.851852,7.777778,6.444445,3.333333,5.777778,1.777778,-7.555555,7.777778,0.573633,-0.744272
3,BRICKFACE,39,111,9,0.0,0.0,0.722222,0.374074,0.888889,0.429629,6.037037,7.0,7.666666,3.444444,2.888889,4.888889,-7.777778,7.888889,0.562919,-1.175773
4,BRICKFACE,16,128,9,0.0,0.0,0.5,0.077778,0.666667,0.311111,5.555555,6.888889,6.666666,3.111111,4.0,3.333333,-7.333334,7.111111,0.561508,-0.985811


In [66]:
X, y= image.drop('Class', axis=1), image['Class']

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

### MinMaxScaler

In [68]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
multi_class = ['ovr', 'multinomial']
penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        for m in multi_class:
            # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
            
            lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r, multi_class=m, random_state=24)
            pipe = Pipeline([('scl', scaler), ('lr', lr)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            scores.append([penalties[i], r, m, accuracy_score(y_test, y_pred)])

df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'multi-class', 'accuracy'])
df_scores.sort_values('accuracy', ascending=False).iloc[0]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.24it/s]


penalties          None
l1_ratio            1.0
multi-class         ovr
accuracy       0.904762
Name: 38, dtype: object

### Standard Scaler

In [69]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
multi_class = ['ovr', 'multinomial']
penalties = ['l2', 'l1', 'elasticnet', None]
scores = []
l1_ratios = np.linspace(0.001, 1, 5)
for i in tqdm(range(len(penalties))):
    for r in l1_ratios:
        for m in multi_class:
            # lr = LogisticRegression(penalty=penalties[i]) Solver lbfgs (is the default solver) supports only 'l2' or None penalties, got l1 penalty.
            
            lr = LogisticRegression(solver='saga', penalty=penalties[i], l1_ratio=r, multi_class=m, random_state=24)
            pipe = Pipeline([('scl', scaler), ('lr', lr)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            scores.append([penalties[i], r, m, accuracy_score(y_test, y_pred)])

df_scores = pd.DataFrame(scores, columns=['penalties', 'l1_ratio', 'multi-class', 'accuracy'])
df_scores.sort_values('accuracy', ascending=False).iloc[0]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.95it/s]


penalties             None
l1_ratio               1.0
multi-class    multinomial
accuracy          0.904762
Name: 39, dtype: object

### Inferencing

In [72]:
test_image = pd.read_csv('../Cases/Image Segmentation/tst_img.csv')
train = pd.read_csv('../Cases/Image Segmentation/Image_Segmentation.csv')
# test_image.head()

In [92]:
X, y= train.drop('Class', axis=1), train['Class']

In [85]:
lr = LogisticRegression(solver='saga', penalty=None, multi_class='multinomial', random_state=24)
pipe = Pipeline([('scl', scaler), ('lr', lr)])
pipe.fit(X, y)
y_pred = pipe.predict(test_image)

In [86]:
y_pred

array(['SKY', 'PATH', 'GRASS', 'GRASS'], dtype=object)

In [94]:
pred_probs = pipe.predict_proba(test_image)
pd.DataFrame(pred_probs, columns=le.classes_)

Unnamed: 0,BRICKFACE,CEMENT,FOLIAGE,GRASS,PATH,SKY,WINDOW
0,9.1e-05,0.032998,3e-06,4.1e-05,0.018735,0.948132,6.086422e-07
1,0.000875,0.045693,8.7e-05,0.000706,0.948799,0.003232,0.0006088371
2,0.001968,8.5e-05,0.001289,0.973715,0.000479,0.000114,0.02234966
3,0.003356,3.8e-05,0.005753,0.97879,0.000429,6.4e-05,0.01156902


In [82]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le_y = le.fit_transform(y)

In [83]:
le_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [84]:
le.classes_

array(['BRICKFACE', 'CEMENT', 'FOLIAGE', 'GRASS', 'PATH', 'SKY', 'WINDOW'],
      dtype=object)

In [87]:
scaler = StandardScaler()
lr = LogisticRegression(solver='saga', penalty=None, multi_class='multinomial', random_state=24)
pipe = Pipeline([('scl', scaler), ('lr', lr)])
pipe.fit(X, le_y)
y_pred = pipe.predict(test_image)

In [88]:
y_pred

array([5, 4, 3, 3])

In [91]:
le.inverse_transform(y_pred)

array(['SKY', 'PATH', 'GRASS', 'GRASS'], dtype=object)

In [89]:
pd.DataFrame(pred_probs, columns=le.classes_)

Unnamed: 0,BRICKFACE,CEMENT,FOLIAGE,GRASS,PATH,SKY,WINDOW
0,5.4e-05,0.055696,3e-06,3.6e-05,0.006084,0.938125,1.706288e-07
1,0.001005,0.049734,0.000153,0.000867,0.944674,0.002952,0.0006146178
2,0.001428,2.6e-05,0.000983,0.965559,0.000478,3.2e-05,0.03149396
3,0.002905,9e-06,0.004172,0.976141,0.000375,1.6e-05,0.01638095
