In [3]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import requests
from typing import List
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
EXPERIMENTS = 100

In [13]:
def One_Hot_Encoding(df, column_names):
    for column_name in column_names:
        column = df[column_name]
        unique_values = column.unique()
        n_values = len(unique_values)
        one_hot_encoded = np.zeros((len(column), n_values))
        
        for i, value in enumerate(unique_values):
            one_hot_encoded[:, i] = column == value
        
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=[f"{column_name}_{value}" for value in unique_values])
        
        df = pd.concat([df, one_hot_encoded_df], axis=1).drop(column_name, axis=1)
        df.drop_duplicates()
    return df


def Softmax(data):
  return np.exp(data) / np.sum(np.exp(data), axis=1, keepdims=True)

In [18]:
def data_split(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.7, 
                                                    shuffle=True)
  return X_train, X_test, y_train, y_test

In [7]:
data = pd.read_csv('titanic_train.csv')
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
without_data = data.columns[data.isnull().any()].tolist()
print(f'Столбцы, в которых нет значений: {without_data}')
for column in without_data:
  mean_value = data[column].mean() if data[column].dtype != object \
                                   else data[column].value_counts().idxmax()
  data[column] = data[column].fillna(mean_value)

data

Столбцы, в которых нет значений: ['Age', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [9]:
Names = set()
for name in data['Name']:
    Names.add(name.split(',')[1].split('.')[0].strip())
print(Names)

{'Col', 'Master', 'Miss', 'Mlle', 'Capt', 'Mr', 'Ms', 'Rev', 'Major', 'Don', 'Jonkheer', 'Lady', 'Dr', 'Mme', 'the Countess', 'Mrs', 'Sir'}


In [10]:
data['Title'] = data['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,Mr


In [11]:
categorical = data[['Pclass', 'Sex', 'Embarked', 'Title']]
categorical

Unnamed: 0,Pclass,Sex,Embarked,Title
0,3,male,S,Mr
1,1,female,C,Mrs
2,3,female,S,Miss
3,1,female,S,Mrs
4,3,male,S,Mr
...,...,...,...,...
886,2,male,S,Rev
887,1,female,S,Miss
888,3,female,S,Miss
889,1,male,C,Mr


In [12]:
numerical = data[['Age', 'Fare', 'SibSp', 'Parch']]
numerical

Unnamed: 0,Age,Fare,SibSp,Parch
0,22.000000,7.2500,1,0
1,38.000000,71.2833,1,0
2,26.000000,7.9250,0,0
3,35.000000,53.1000,1,0
4,35.000000,8.0500,0,0
...,...,...,...,...
886,27.000000,13.0000,0,0
887,19.000000,30.0000,0,0
888,29.699118,23.4500,1,2
889,26.000000,30.0000,0,0


In [14]:
data_encoded = One_Hot_Encoding(
                  df = data, 
                  column_names = ['Sex', 'Embarked', 'Title']
               )
data_encoded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Mme,Title_Ms,Title_Major,Title_Lady,Title_Sir,Title_Mlle,Title_Col,Title_Capt,Title_the Countess,Title_Jonkheer
0,1,0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.000000,0,0,211536,13.0000,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.000000,0,0,112053,30.0000,B42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,W./C. 6607,23.4500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,890,1,1,"Behr, Mr. Karl Howell",26.000000,0,0,111369,30.0000,C148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
exclude_columns = ['Survived', 'Name', 'Ticket', 'Cabin']
X_columns = data_encoded.columns.difference(exclude_columns)
print(f'Столбцы для обучения: {X_columns.tolist()}') 

X, y = data_encoded[X_columns], data['Survived']

Столбцы для обучения: ['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex_female', 'Sex_male', 'SibSp', 'Title_Capt', 'Title_Col', 'Title_Don', 'Title_Dr', 'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master', 'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs', 'Title_Ms', 'Title_Rev', 'Title_Sir', 'Title_the Countess']


In [19]:
non_scaled_accuracy = []
for i in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model1 = LogisticRegression()
  model1.fit(X_train, y_train)
  y_logits = model1.predict_log_proba(X_test)
  predicts = np.round(Softmax(y_logits), 2).argmax(axis=1)
  accuracy = accuracy_score(predicts, y_test)
  non_scaled_accuracy.append(accuracy)

In [20]:
pipeline = Pipeline([
  ('scaling', StandardScaler()),
  ('normalizing', MinMaxScaler())
])

X = pipeline.fit_transform(X)

In [22]:
scaled_accuracy = []
for i in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model_2 = LogisticRegression()
  model_2.fit(X_train, y_train)
  y_logits = model_2.predict_log_proba(X_test)
  predicts = np.round(Softmax(y_logits), 2).argmax(axis=1)
  accuracy = accuracy_score(predicts, y_test)
  scaled_accuracy.append(accuracy)

In [23]:
X = numerical.values
numeric_accuracy = []
for i in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model_3 = LogisticRegression()
  model_3.fit(X_train, y_train)
  y_logits = model_3.predict_log_proba(X_test)
  predicts = np.round(Softmax(y_logits), 2).argmax(axis=1)
  accuracy = accuracy_score(predicts, y_test)
  numeric_accuracy.append(accuracy)

In [24]:
data = {
    'Не предобработанные': non_scaled_accuracy,
    'Обработанные': scaled_accuracy,
    'Числовые': numeric_accuracy,
    'x': np.arange(100),
}

results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Не предобработанные,Обработанные,Числовые,x
0,0.817164,0.843284,0.679104,0
1,0.843284,0.798507,0.664179,1
2,0.828358,0.779851,0.694030,2
3,0.809701,0.794776,0.667910,3
4,0.832090,0.809701,0.723881,4
...,...,...,...,...
95,0.809701,0.813433,0.720149,95
96,0.783582,0.798507,0.679104,96
97,0.809701,0.835821,0.682836,97
98,0.861940,0.835821,0.712687,98


In [25]:
fig = go.Figure()

for col in results_df.columns:
    if col != 'x':
        fig.add_trace(go.Scatter(x=results_df['x'], y=results_df[col], name=col))

fig.update_layout(
    xaxis_title='x',
    yaxis_title='accuracy'
)

fig.show()