# Logistic Regression

In [18]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import  SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

## Kyphosis Dataset

In [2]:
kyp = pd.read_csv("./Cases/Kyphosis/Kyphosis.csv")
kyp.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [3]:
X = kyp.drop('Kyphosis', axis = 1)
y = kyp['Kyphosis']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=24, stratify=y)

In [4]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.76


### Optimization

In [8]:
# penalities = ['l2','l1','elasticnet', None]
penalities = ['l2', None]
scores = []
for i in range(len(penalities)):
    lr = LogisticRegression(penalty=penalities[i])
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

i_max = np.argmax(scores)
print("Best k : ", penalities[i_max])
print("Best scores : ", scores[i_max])

Best k :  l2
Best scores :  0.76


In [10]:
penalities = ['l2','l1','elasticnet', None]
l1_ratios = np.linspace(0.001,1,5)
scores = []
for i in range(len(penalities)):
    for r in l1_ratios:
        lr = LogisticRegression(penalty=penalities[i], solver='saga', l1_ratio=r)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        scores.append([penalities[i],r,accuracy_score(y_test, y_pred)])

df = pd.DataFrame(scores, columns=['penalities', 'l1_ratio', 'accuracy'])
df.sort_values('accuracy', ascending=False)
df.head()



Unnamed: 0,penalities,l1_ratio,accuracy
0,l2,0.001,0.8
1,l2,0.25075,0.8
2,l2,0.5005,0.8
3,l2,0.75025,0.8
4,l2,1.0,0.8


## HR Dataset

In [11]:
hr = pd.read_csv("./Cases/human-resources-analytics/HR_comma_sep.csv")
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [12]:
X = hr.drop("left", axis=1)
y = hr['left']
X_dum = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X_dum,y, test_size=0.3, random_state=24, stratify=y)

In [14]:
penalities = ['l2','l1','elasticnet', None]
l1_ratios = np.linspace(0.001,1,5)
scores = []
for i in range(len(penalities)):
    for r in l1_ratios:
        lr = LogisticRegression(penalty=penalities[i], solver='saga', l1_ratio=r)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        scores.append([penalities[i],r,accuracy_score(y_test, y_pred)])

df = pd.DataFrame(scores, columns=['penalities', 'l1_ratio', 'accuracy'])
df.sort_values('accuracy', ascending=False)
df



Unnamed: 0,penalities,l1_ratio,accuracy
0,l2,0.001,0.758391
1,l2,0.25075,0.758391
2,l2,0.5005,0.758391
3,l2,0.75025,0.758391
4,l2,1.0,0.758391
5,l1,0.001,0.758391
6,l1,0.25075,0.758391
7,l1,0.5005,0.758391
8,l1,0.75025,0.758391
9,l1,1.0,0.758391


## Glass Dataset

In [24]:
glass = pd.read_csv("./Cases/Glass Identification/Glass.csv")
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [25]:
X = glass.drop('Type', axis=1)
y = glass['Type']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=24, stratify=y)

In [31]:
scaler = StandardScaler()
penalities = ['l2',None,'l1','elasticnet' ]
l1_ratios = np.linspace(0.001,1,5)
multi = ['ovr','multinomial']
scores = []
for i in range(len(penalities)):
    for j in l1_ratios:
        for m in multi:
            lr = LogisticRegression(penalty=penalities[i], random_state=24, solver='saga', l1_ratio=j, multi_class=m)
            lr.fit(X_train, y_train)
            scores.append([penalities[i],j, m, accuracy_score(y_test, y_pred)])

scores_df = pd.DataFrame(scores, columns=['penalities','l1_ratios','multi_class','accuracy'])
scores_df = scores_df.sort_values('accuracy', ascending=False)
scores_df.head()



Unnamed: 0,penalities,l1_ratios,multi_class,accuracy
0,l2,0.001,ovr,0.630769
1,l2,0.001,multinomial,0.630769
22,l1,0.25075,ovr,0.630769
23,l1,0.25075,multinomial,0.630769
24,l1,0.5005,ovr,0.630769


In [32]:
scaler = MinMaxScaler()
penalities = ['l2',None,'l1','elasticnet' ]
l1_ratios = np.linspace(0.001,1,5)
multi = ['ovr','multinomial']
scores = []
for i in range(len(penalities)):
    for j in l1_ratios:
        for m in multi:
            lr = LogisticRegression(penalty=penalities[i], random_state=24, solver='saga', l1_ratio=j, multi_class=m)
            lr.fit(X_train, y_train)
            scores.append([penalities[i],j, m, accuracy_score(y_test, y_pred)])

scores_df = pd.DataFrame(scores, columns=['penalities','l1_ratios','multi_class','accuracy'])
scores_df = scores_df.sort_values('accuracy', ascending=False)
scores_df.head()



Unnamed: 0,penalities,l1_ratios,multi_class,accuracy
0,l2,0.001,ovr,0.630769
1,l2,0.001,multinomial,0.630769
22,l1,0.25075,ovr,0.630769
23,l1,0.25075,multinomial,0.630769
24,l1,0.5005,ovr,0.630769


## Image Segmentation

In [40]:
image_df = pd.read_csv("./Cases/Image Segmentation/Image_Segmentation.csv")
image_df.head()

Unnamed: 0,Class,region.centroid.col,region.centroid.row,region.pixel.count,short.line.density.5,short.line.density.2,vedge.mean,vegde.sd,hedge.mean,hedge.sd,intensity.mean,rawred.mean,rawblue.mean,rawgreen.mean,exred.mean,exblue.mean,exgreen.mean,value.mean,saturation.mean,hue-mean
0,BRICKFACE,188,133,9,0.0,0.0,0.333333,0.266667,0.5,0.077778,6.666666,8.333334,7.777778,3.888889,5.0,3.333333,-8.333333,8.444445,0.53858,-0.924817
1,BRICKFACE,105,139,9,0.0,0.0,0.277778,0.107407,0.833333,0.522222,6.111111,7.555555,7.222222,3.555556,4.333334,3.333333,-7.666666,7.555555,0.532628,-0.965946
2,BRICKFACE,34,137,9,0.0,0.0,0.5,0.166667,1.111111,0.474074,5.851852,7.777778,6.444445,3.333333,5.777778,1.777778,-7.555555,7.777778,0.573633,-0.744272
3,BRICKFACE,39,111,9,0.0,0.0,0.722222,0.374074,0.888889,0.429629,6.037037,7.0,7.666666,3.444444,2.888889,4.888889,-7.777778,7.888889,0.562919,-1.175773
4,BRICKFACE,16,128,9,0.0,0.0,0.5,0.077778,0.666667,0.311111,5.555555,6.888889,6.666666,3.111111,4.0,3.333333,-7.333334,7.111111,0.561508,-0.985811


In [41]:
X = image_df.drop('Class', axis=1)
y = image_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

In [42]:
scaler = StandardScaler()
penalities = ['l2',None,'l1','elasticnet' ]
l1_ratios = np.linspace(0.001,1,5)
multi = ['ovr','multinomial']
scores = []
for i in range(len(penalities)):
    for j in l1_ratios:
        for m in multi:
            lr = LogisticRegression(penalty=penalities[i], random_state=24, solver='saga', l1_ratio=j, multi_class=m)
            pipe = Pipeline([('SCL', scaler),('LR',lr)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            scores.append([penalities[i],j, m, accuracy_score(y_test, y_pred)])
            

scores_df = pd.DataFrame(scores, columns=['penalities','l1_ratios','multi_class','accuracy'])
scores_df = scores_df.sort_values('accuracy', ascending=False)
scores_df.head()



Unnamed: 0,penalities,l1_ratios,multi_class,accuracy
16,,0.75025,ovr,0.873016
10,,0.001,ovr,0.873016
12,,0.25075,ovr,0.873016
18,,1.0,ovr,0.873016
14,,0.5005,ovr,0.873016


In [43]:
tst =  pd.read_csv("./Cases/Image Segmentation/tst_img.csv")
tst.head()

Unnamed: 0,region.centroid.col,region.centroid.row,region.pixel.count,short.line.density.5,short.line.density.2,vedge.mean,vegde.sd,hedge.mean,hedge.sd,intensity.mean,rawred.mean,rawblue.mean,rawgreen.mean,exred.mean,exblue.mean,exgreen.mean,value.mean,saturation.mean,hue-mean
0,22,90,10,0,0,0.666668,0.044444,0.88,0.562963,112.0,105.888885,128.55556,106.0,-22.777779,45.22222,-22.444445,128.55556,0.179697,-2.097815
1,210,200,9,0,0,1.3,0.998145,1.611111,1.123816,49.48148,45.0,60.666668,43.0,-14.111111,35.0,-19.444445,60.666668,0.290788,-1.987599
2,240,184,9,0,0,0.5,0.077778,0.777778,0.785185,11.851851,9.777778,9.888889,15.888889,-5.0,-5.888889,13.0,15.888889,0.5,2.128646
3,130,191,9,0,0,1.0,0.4,1.5,1.011111,7.333334,5.333334,5.0,11.222222,-7.0,-5.666666,11.666667,11.222222,0.53582,2.122422


In [59]:
scaler = StandardScaler()

lr = LogisticRegression(penalty=None, random_state=24, solver='saga', multi_class='ovr')
pipe = Pipeline([('SCL', scaler),('LR',lr)])
pipe.fit(X, y)
pred_probs = pipe.predict_proba(X_test)
pred_probs = pd.DataFrame(pred_probs, columns=['BRICKFACE','CEMENT','FOLIAGE','GRASS','PATH','SKY','WINDOW'])




In [60]:
pred_probs

Unnamed: 0,BRICKFACE,CEMENT,FOLIAGE,GRASS,PATH,SKY,WINDOW
0,0.012837,0.489786,0.004592,0.004331,0.228582,0.050696,0.209176
1,0.005731,0.159877,0.002541,0.010171,0.776089,0.015913,0.029679
2,0.005177,0.000223,0.030993,0.915122,0.041071,0.006545,0.000868
3,0.000057,0.051811,0.031936,0.012606,0.003624,0.896067,0.003900
4,0.143994,0.004709,0.712012,0.007496,0.000910,0.005929,0.124950
...,...,...,...,...,...,...,...
58,0.003068,0.112414,0.005723,0.004674,0.015378,0.857943,0.000800
59,0.008970,0.010615,0.080632,0.008857,0.883571,0.007021,0.000334
60,0.036733,0.311324,0.028220,0.009686,0.042226,0.022551,0.549262
61,0.007514,0.000936,0.005849,0.905988,0.005908,0.004972,0.068834


In [53]:
pipe.predict(tst)

array(['SKY', 'PATH', 'GRASS', 'GRASS'], dtype=object)

### Label Encoder on y

In [47]:
from sklearn.preprocessing import LabelEncoder

In [48]:
le = LabelEncoder()
le_y = le.fit_transform(y)


In [49]:
scaler = StandardScaler()

lr = LogisticRegression(penalty=None, random_state=24, solver='saga', multi_class='ovr')
pipe = Pipeline([('SCL', scaler),('LR',lr)])
pipe.fit(X, le_y)



In [50]:
pred_probs = pipe.predict_proba(tst)
pd.DataFrame(pred_probs, columns=list(le.classes_))

Unnamed: 0,BRICKFACE,CEMENT,FOLIAGE,GRASS,PATH,SKY,WINDOW
0,0.031771,0.069481,0.002474,0.00537,0.096824,0.793955,0.000125
1,0.01671,0.150273,0.00187,0.008592,0.792422,0.015743,0.014389
2,0.007717,0.000943,0.003503,0.904994,0.004034,0.008488,0.07032
3,0.015412,0.00045,0.020215,0.927095,0.004215,0.005753,0.026861
