<a href="https://colab.research.google.com/github/Jolanta27/AI-course/blob/master/predict_personality_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import files
uploaded = files.upload()

Saving data.csv to data (1).csv


In [6]:
!pip install mlflow pandas numpy scikit-learn



In [7]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

df = pd.read_csv("data.csv")

df.head(5)

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,19.0,Male,0,9.4708,7.141434,6.03696,4.360278,Unknown,ENFP
1,27.0,Female,0,5.85392,6.160195,0.80552,4.221421,Sports,ESFP
2,21.0,Female,0,7.08615,3.388433,2.66188,5.12732,Unknown,ENFP
3,28.0,Male,0,2.01892,4.823624,7.30625,5.98655,Others,INTP
4,36.0,Female,1,9.91703,4.75508,5.31469,4.677213,Technology,ENFP


In [8]:
features = ["Age", "Gender", "Education", "Introversion Score", "Sensing Score", "Thinking Score", "Judging Score", "Interest", "Personality"]

In [9]:
def preprocess_data(df):
    # Podział na kolumny numeryczne i kategoryczne
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns

    # Imputacja dla kolumn numerycznych
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

    # Imputacja dla kolumn kategorycznych
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

    # Skalowanie kolumn numerycznych
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    return df, numeric_imputer, categorical_imputer, scaler

In [10]:
mlflow.set_experiment("Personality Test Prediction")

2024/09/20 08:14:31 INFO mlflow.tracking.fluent: Experiment with name 'Personality Test Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/666557054462622957', creation_time=1726820071179, experiment_id='666557054462622957', last_update_time=1726820071179, lifecycle_stage='active', name='Personality Test Prediction', tags={}>

In [13]:
df_processed = df

mlflow.set_experiment("Personality_Data_Preprocessing")


with mlflow.start_run():

    mlflow.log_param("numeric_columns", list(df_processed.select_dtypes(include=['float64']).columns))
    mlflow.log_param("categorical_columns", list(df_processed.select_dtypes(include=['object']).columns))


    mlflow.log_param("imputation_method_numeric", "mean")
    mlflow.log_param("imputation_method_categorical", "most_frequent")
    mlflow.log_param("scaling_method", "StandardScaler")

    mlflow.log_metric("num_samples", len(df_processed))
    mlflow.log_metric("num_features", df_processed.shape[1])

    df_processed.to_csv("processed_personality_data.csv", index=False)
    mlflow.log_artifact("processed_personality_data.csv", "processed_data")

    print("Preprocessing zalogowany w MLflow.")

print(df_processed.head())

Preprocessing zalogowany w MLflow.
    Age  Gender  Education  Introversion Score  Sensing Score  Thinking Score  \
0  19.0    Male          0             9.47080       7.141434         6.03696   
1  27.0  Female          0             5.85392       6.160195         0.80552   
2  21.0  Female          0             7.08615       3.388433         2.66188   
3  28.0    Male          0             2.01892       4.823624         7.30625   
4  36.0  Female          1             9.91703       4.755080         5.31469   

   Judging Score    Interest Personality  
0       4.360278     Unknown        ENFP  
1       4.221421      Sports        ESFP  
2       5.127320     Unknown        ENFP  
3       5.986550      Others        INTP  
4       4.677213  Technology        ENFP  


In [14]:
!pip install mlflow
!pip install pyngrok

import os
from pyngrok import ngrok

os.environ["NGROK_AUTH_TOKEN"] = "2mIYfqd3jwnV9ebGKzDPZiRAFhX_2QEpCr322T4jbwBxZ3Snn"

!ngrok authtoken $NGROK_AUTH_TOKEN

get_ipython().system_raw("mlflow ui --port 5000 &")

public_url = ngrok.connect(5000, "http")
print(f"MLflow UI is running at: {public_url}")

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
MLflow UI is running at: NgrokTunnel: "https://53f0-34-125-163-135.ngrok-free.app" -> "http://localhost:5000"


In [15]:
!pip install tensorflow torch torchvision



In [16]:
import mlflow.keras
import mlflow.pytorch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

X = df.drop(columns=["Personality"])
y = df["Personality"]

X = pd.get_dummies(X, columns=['Gender', 'Education', 'Interest'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print("Kształt X_train:", X_train.shape)
print("Kształt y_train:", y_train.shape)
print("Unikalne etykiety:", np.unique(y_train))




Kształt X_train: (102448, 14)
Kształt y_train: (102448,)
Unikalne etykiety: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [17]:
def create_model(input_shape, num_classes):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

num_classes = len(np.unique(y_encoded))
model = create_model(X_train.shape[1], num_classes)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
!pip install --upgrade mlflow



In [32]:
from mlflow.tracking import MlflowClient

mlflow.set_experiment("Personality Type Prediction")

with mlflow.start_run() as run:
  history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=1)

  test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

  mlflow.log_param("model_architecture", "3 layers")
  mlflow.log_param("dropout_rate", 0.3)
  mlflow.log_metric("test_accuracy", test_accuracy)

  mlflow.keras.log_model(model, "model")

  run_id = run.info.run_id

model_uri = f"runs:/{run_id}/model"
model_name = "PersonalityTypePredictionModel"
registered_model = mlflow.register_model(model_uri, model_name)

print(f"Model registered with name: {model_name} and version: {registered_model.version}")

client = MlflowClient()

try:
    registered_models = client.list_registered_models()
except AttributeError:
    registered_models = client.search_registered_models()

for model in registered_models:
    print(f"Model Name: {model.name}")
    for version in model.latest_versions:
        print(f" - Version: {version.version}, Stage: {version.current_stage}, Run ID: {version.run_id}")

model_versions = client.get_latest_versions(model_name)
for version in model_versions:
    print(f"Version: {version.version}, Stage: {version.current_stage}, Run ID: {version.run_id}")

model_version_details = client.get_model_version(name=model_name, version=registered_model.version)
print(f"Model version details: {model_version_details}")

print(f"Test accuracy: {test_accuracy:.4f}")

Epoch 1/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8745 - loss: 0.2735 - val_accuracy: 0.8908 - val_loss: 0.2375
Epoch 2/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8750 - loss: 0.2720 - val_accuracy: 0.8904 - val_loss: 0.2289
Epoch 3/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.8770 - loss: 0.2714 - val_accuracy: 0.8911 - val_loss: 0.2276
Epoch 4/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.8765 - loss: 0.2706 - val_accuracy: 0.8968 - val_loss: 0.2257
Epoch 5/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.8739 - loss: 0.2774 - val_accuracy: 0.8911 - val_loss: 0.2338
Epoch 6/10
[1m3202/3202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8765 - loss: 0.2728 - val_accuracy: 0.8704 - val_loss: 0.2619
Epoch 7/10
[



Model registered with name: PersonalityTypePredictionModel and version: 5
Model Name: PersonalityTypePredictionModel
 - Version: 5, Stage: None, Run ID: 884bd82f4db24c07a41010247c8d4e3f
Version: 5, Stage: None, Run ID: 884bd82f4db24c07a41010247c8d4e3f
Model version details: <ModelVersion: aliases=[], creation_timestamp=1726824500899, current_stage='None', description=None, last_updated_timestamp=1726824500899, name='PersonalityTypePredictionModel', run_id='884bd82f4db24c07a41010247c8d4e3f', run_link=None, source='file:///content/mlruns/367984304429846360/884bd82f4db24c07a41010247c8d4e3f/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>
Test accuracy: 0.8880


Registered model 'PersonalityTypePredictionModel' already exists. Creating a new version of this model...
Created version '5' of model 'PersonalityTypePredictionModel'.
  model_versions = client.get_latest_versions(model_name)


Model osiągnął bardzo dobrą dokładność zarówno na zbiorze treningowym jak i walidacyjnym. Końcowa dokładność na zbiorze testowym wynosi 90%, co jest bardzo dobrym wynikiem. Dokładność na zbiorze walidacyjnym (val_accuracy) jest konsekwentnie nieco wyższa niż na zbiorze treningowym (accuracy) co jest pozytywnym znakiem. Model dobrze generalizuje. Model dobrze radzi sobie z zadaniem klasyfikacji typów osobowości.

In [33]:
os.environ["NGROK_AUTH_TOKEN"] = "2mIYfqd3jwnV9ebGKzDPZiRAFhX_2QEpCr322T4jbwBxZ3Snn"

!ngrok authtoken $NGROK_AUTH_TOKEN

get_ipython().system_raw("mlflow ui --port 5000 &")

public_url = ngrok.connect(5000, "http")
print(f"MLflow UI is running at: {public_url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
MLflow UI is running at: NgrokTunnel: "https://0248-34-125-163-135.ngrok-free.app" -> "http://localhost:5000"
