In [113]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [114]:
# Rather than loading a ready-made dataset, we build our own toy data: a single
# feature column suitable for binary classification. Logistic regression is a
# natural fit for binary and multi-class classification problems, whereas
# linear regression is better at predicting numerical values over a broader,
# continuous range.

# np.arange(10) yields the values 0 through 9 (10 values).
#
# Reshaping to (-1, 1) turns that one-dimensional array into a two-dimensional
# column: exactly one column (1), while -1 tells NumPy to infer the number of
# rows from the number of elements in the original array (10 here).
x = np.reshape(np.arange(10), (-1, 1))

In [115]:
# Labels: one 0/1 value per row of x (so 10 values in total).
#
# They could be typed in by hand in an arbitrary order:
# y = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 1])
#
# or generated at random:
# y = np.random.randint(2, size=10)
#
# Because the data set is tiny, which can hurt the accuracy of the model, we
# instead assign labels by a simple, easy-to-follow rule: every even value in
# x gets label 0 and every odd value gets label 1.
#
# Note: this alternating pattern is not linearly separable in x, so a logistic
# regression on the raw feature can only approximate it.
y = np.arange(10) % 2

In [116]:
y

# The x and y arrays correspond to each other: y[i] is the label for row i of x.

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

In [117]:
# Configure the classifier; each argument is explained next to it.
model = LogisticRegression(
    # solver: the optimization algorithm used to fit the logistic regression.
    # "liblinear" is a good choice for small data sets. Other available
    # options include lbfgs, newton-cg, sag and saga.
    solver="liblinear",
    # C: inverse of the regularization strength. Regularization is used to
    # prevent the model from overfitting. Large values of C mean weak
    # regularization, small values mean strong regularization; 10.0 is
    # relatively weak regularization.
    C=10.0,
    # random_state: seed for the pseudo-random number generator. With the
    # liblinear solver it controls how the data is shuffled during
    # optimization. Fixing it to a specific number (0 here) makes the results
    # reproducible when the code is run again.
    random_state=0,
)

In [118]:
# Train the classifier on the 10 samples in x and their labels in y.
model.fit(x, y)

In [119]:
# Mean accuracy on the same data the model was trained on
# (a training-set score, not a held-out estimate).
model.score(x, y)

0.6

In [120]:
# Predicted class label for every row of x, printed directly above the true
# labels so the two arrays can be compared position by position.
pred = model.predict(x)
print(pred)
print(y)

[0 0 0 0 0 1 1 1 1 1]
[0 1 0 1 0 1 0 1 0 1]


In [121]:
# predict_proba returns, for every object in x, the estimated probability of
# belonging to each class: column 0 is the probability of class 0 and column 1
# is the probability of class 1.
# For example, for the first object of x (the value 0) the recorded output gives
# a probability of about 0.619 (61.9%) for class 0 and about 0.381 (38.1%) for
# class 1. Class 0 is therefore predicted, which is correct, since the first
# label in y is indeed 0.

# How reliable these probabilities are depends on how accurate the model is.

# In this case there are only two classes in y (0 and 1)
# and 10 objects in x, so the result has 10 rows and 2 columns.


prob_pred = model.predict_proba(x)
print(prob_pred)

[[0.61852578 0.38147422]
 [0.59181906 0.40818094]
 [0.56455889 0.43544111]
 [0.53690276 0.46309724]
 [0.5090177  0.4909823 ]
 [0.48107642 0.51892358]
 [0.45325299 0.54674701]
 [0.4257186  0.5742814 ]
 [0.39863732 0.60136268]
 [0.37216236 0.62783764]]


In [122]:
# Confusion matrix: shows how many samples of each class were classified
# correctly or incorrectly.
conf = confusion_matrix(y, pred)
print(conf)

# How to read it (rows are the true classes, columns are the predicted classes).
# For example, if the printed matrix were
#
# [[3 2]
#  [1 4]]
#
# then:
#
# 3 - samples were correctly classified as negative (True Negative).
# 4 - samples were correctly classified as positive (True Positive).
# 2 - samples were incorrectly classified as positive (False Positive).
# 1 - sample was incorrectly classified as negative (False Negative).
#
# If your matrix is different, read your own values from the same positions.


[[3 2]
 [2 3]]


In [123]:
# classification_report computes several metrics for assessing classification
# quality and summarizes them in a readable text table: precision, recall,
# F1-score and support for each class.

report = classification_report(y, pred)

In [124]:
report

# precision: the proportion of correctly classified positive samples among all
# samples predicted as positive. Here the precision is 0.60 for class 0 and
# 0.60 for class 1.

# recall: the proportion of correctly classified positive samples among all
# samples that are actually positive. Here the recall is 0.60 for class 0 and
# 0.60 for class 1.

# f1-score: the harmonic mean of precision and recall, i.e. a balance between
# the two. Here the F1-score is 0.60 for class 0 and 0.60 for class 1.

# support: the number of actual samples of each class (5 and 5 here).

'              precision    recall  f1-score   support\n\n           0       0.60      0.60      0.60         5\n           1       0.60      0.60      0.60         5\n\n    accuracy                           0.60        10\n   macro avg       0.60      0.60      0.60        10\nweighted avg       0.60      0.60      0.60        10\n'

In [125]:
# print() renders the newlines in the report string, producing a readable table.
print(report)

              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.60      0.60      0.60         5

    accuracy                           0.60        10
   macro avg       0.60      0.60      0.60        10
weighted avg       0.60      0.60      0.60        10

