Обработка данных Titanic dataset с помощью Pandas

In [None]:
!pip install scikit-learn

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Source CSV with the Titanic passenger records
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

data = (
    pd.read_csv(url)
    # Replace missing ages with the median age
    .pipe(lambda frame: frame.assign(Age=frame['Age'].fillna(frame['Age'].median())))
    # Turn 'Sex' and 'Embarked' into indicator columns
    .pipe(pd.get_dummies, columns=['Sex', 'Embarked'])
)

# Standardize 'Age' and 'Fare'
numeric_columns = ['Age', 'Fare']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

print(data.head())

   PassengerId  Survived  Pclass                                                 Name       Age  SibSp  Parch            Ticket      Fare Cabin  Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S
0            1         0       3                              Braund, Mr. Owen Harris -0.565736      1      0         A/5 21171 -0.502445   NaN       False      True       False       False        True
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)  0.663861      1      0          PC 17599  0.786845   C85        True     False        True       False       False
2            3         1       3                               Heikkinen, Miss. Laina -0.258337      0      0  STON/O2. 3101282 -0.488854   NaN        True     False       False       False        True
3            4         1       1         Futrelle, Mrs. Jacques Heath (Lily May Peel)  0.433312      1      0            113803  0.420730  C123        True     False       False       False   

One-Hot Encoding

In [19]:
import pandas as pd 

# Toy frame with a single categorical column for the one-hot encoding demo.
# The raw dict gets its own name so the notebook-wide `data` variable
# (a DataFrame in the Titanic cells) is not rebound to a different kind of object.
color_records = {'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red']}
df = pd.DataFrame(color_records)

print(df)

   Color
0    Red
1   Blue
2  Green
3   Blue
4    Red


In [20]:
# One indicator column per distinct value of 'Color'
df_one_hot = pd.get_dummies(data=df, columns=['Color'])

print(df_one_hot)

   Color_Blue  Color_Green  Color_Red
0       False        False       True
1        True        False      False
2       False         True      False
3        True        False      False
4       False        False       True


Label Encoding

In [31]:
import pandas as pd

# Toy frame with a single ordinal column for the label encoding demo.
# The raw dict gets its own name so the notebook-wide `data` variable
# (a DataFrame in the Titanic cells) is not rebound to a different kind of object.
priority_records = {'Priority': ['Низкий', 'Средний', 'Высокий', 'Средний', 'Высокий']}
df = pd.DataFrame(priority_records)

print(df)

  Priority
0   Низкий
1  Средний
2  Высокий
3  Средний
4  Высокий


In [32]:
# Categories are listed in the order the integer codes should follow
priority_order = ['Низкий', 'Средний', 'Высокий']
priority_dtype = pd.CategoricalDtype(categories=priority_order, ordered=True)
df['Priority'] = df['Priority'].astype(priority_dtype)

# Each value's code is its position in priority_order
df['Priority_Encoded'] = df['Priority'].cat.codes

print(df)

  Priority  Priority_Encoded
0   Низкий                 0
1  Средний                 1
2  Высокий                 2
3  Средний                 1
4  Высокий                 2


Пример использования MLflow

In [None]:
!pip install MLflow

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris dataset and hold out 20% of it for evaluation
data = load_iris()

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The context manager ends the run even if training or logging raises,
# so a failed execution does not leave an active run behind.
with mlflow.start_run():
    # Fixed seed so the logged accuracy is reproducible between executions
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)

    # Log the value actually used by the model instead of repeating the literal
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_metric("accuracy", accuracy)

    mlflow.sklearn.log_model(model, "model")

print(f"\nModel accuracy: {accuracy}")




Model accuracy: 1.0


In [2]:
# Launch the MLflow tracking UI from the notebook.
# NOTE(review): the command appears to run in the foreground until it is
# interrupted (the recorded output is ^C), so "Run All" would stop at this cell.
!mlflow ui

^C


http://127.0.0.1:5000

Версионирование моделей с MLflow

In [3]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset and hold out 20% of it for evaluation
data = load_iris()

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=200, random_state=42)
}

# Hyperparameter recorded for each model
logged_params = {
    "RandomForestClassifier": "n_estimators",
    "LogisticRegression": "max_iter",
}

# Run id of the run created in this cell for each model. Keeping the id from
# the run object avoids looking runs up by name afterwards, which could match
# an older run that happens to share the same name.
run_ids = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name) as run:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        param_name = logged_params[model_name]
        mlflow.log_param(param_name, getattr(model, param_name))

        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model")

        run_ids[model_name] = run.info.run_id

        print(f"{model_name}: accuracy = {accuracy:.4f}")

# Register the model logged by each run under a lower-case name
# with the "Classifier" suffix removed.
for model_name, run_id in run_ids.items():
    model_uri = f"runs:/{run_id}/model"
    registered_model_name = model_name.replace("Classifier", "").lower()
    mlflow.register_model(model_uri, registered_model_name)

print("\nМодели зарегистрированы в MLflow Registry.")



RandomForestClassifier: accuracy = 1.0000




LogisticRegression: accuracy = 1.0000


Successfully registered model 'randomforest'.
Created version '1' of model 'randomforest'.
Successfully registered model 'logisticregression'.



Модели зарегистрированы в MLflow Registry.


Created version '1' of model 'logisticregression'.


Тестирование с новыми данными

In [6]:
import mlflow.pyfunc
import numpy as np

# Registered model to load; the registration cell's output reports
# "Created version '1' of model 'randomforest'".
model_name = "randomforest"
model_version = 1
loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

# One sample with four feature values
new_data = np.array([[5.1, 3.5, 1.4, 0.2]])
prediction = loaded_model.predict(new_data)
print(f"Предсказание: {prediction}")

# `prediction` is an array with one label per input row. Looking each label up
# works for any number of rows, whereas `if prediction == 0` raises as soon as
# more than one row is predicted.
iris_species = ['Setosa', 'Versicolor', 'Virginica']
for class_index in prediction:
    print(iris_species[int(class_index)])

Предсказание: [0]
Setosa


Практика с Feature Store

In [11]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Source CSV with the Titanic passenger records
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [12]:
# Fill missing values in "Age" with the median age.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data['Age'] is chained assignment and does not update `data` when pandas
# Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].median())
# Drop the Cabin column, which has a large number of missing values
data = data.drop(columns=['Cabin'])
# Drop rows where Embarked is missing
data = data.dropna(subset=['Embarked'])

In [13]:
# Turn 'Sex' and 'Embarked' into indicator columns, then re-check the schema
data = data.pipe(pd.get_dummies, columns=['Sex', 'Embarked'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Ticket       889 non-null    object 
 8   Fare         889 non-null    float64
 9   Sex_female   889 non-null    bool   
 10  Sex_male     889 non-null    bool   
 11  Embarked_C   889 non-null    bool   
 12  Embarked_Q   889 non-null    bool   
 13  Embarked_S   889 non-null    bool   
dtypes: bool(5), float64(2), int64(5), object(2)
memory usage: 73.8+ KB


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,29.315152,0.524184,0.382452,32.096681
std,256.998173,0.48626,0.8347,12.984932,1.103705,0.806761,49.697504
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.0,0.0,2.0,22.0,0.0,0.0,7.8958
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on 'Age' and 'Fare', then overwrite both columns with the scaled values
numeric_columns = ['Age', 'Fare']
scaler = StandardScaler().fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

In [16]:
# Model inputs and the target column
feature_columns = ['Pclass', 'Sex_female', 'Sex_male', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_S', 'Embarked_Q']
features = data[feature_columns]
labels = data['Survived']

# Feature table: the selected feature columns preceded by an id column.
# NOTE(review): the id is taken from the DataFrame index, not from the
# 'PassengerId' column — confirm that this is the intended key.
feature_df = features.copy()
feature_df.insert(0, 'passenger_id', data.index)

feature_df.to_csv("titanic_features.csv", index=False)