In [None]:
!pip install pandas
!pip install git+https://github.com/modAL-python/modAL.git
!pip install sklearn
!pip install plotly
!pip install huggingface_hub

In [307]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
import fasttext
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [2]:
# Read the names table from disk and preview its first five rows.
DATA_CSV = 'data.csv'
data = pd.read_csv(DATA_CSV)
data.head(n=5)

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567


In [3]:
# Encode the target as integers: 'M' -> 1, anything else -> 0.
# Accepting an already-encoded 1 as well keeps the cell idempotent: re-running
# it no longer collapses every label to 0 (which `x == 'M'` did on a second run).
data['Gender'] = data['Gender'].isin(['M', 1]).astype(int)

In [13]:
# Download the pretrained fastText binary from the Hugging Face Hub and load it.
# NOTE(review): this repo holds Russian vectors, while the names previewed by
# data.head() are English - confirm this is the intended language model.
model_path = hf_hub_download(
    filename="model.bin",
    repo_id="facebook/fasttext-ru-vectors",
)
model_fasttext = fasttext.load_model(model_path)

In [14]:
# Alternative to the Hub download above: load model.bin from a local path.
#model_fasttext = fasttext.load_model('D:\\Katerina\\NLP\\lab1\\model.bin')



In [15]:
# One fastText word vector per name, stored as a column of arrays.
name_vectors = list(map(model_fasttext.get_word_vector, data['Name']))
data['embeddings'] = name_vectors

In [244]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['embeddings'],
    data['Gender'],
    test_size=0.2,
    random_state=42,
)

In [270]:
# Build the active learner: a small labelled seed set plus an unlabelled pool.
SEED = 42
INITIAL_LABELLED = 100

classifier = LogisticRegression(random_state=SEED)

# Draw the seed set with a seeded generator so the initial training data (and
# therefore every later query and accuracy figure) is the same on each run.
init_rng = np.random.default_rng(SEED)
random_indices = init_rng.choice(
    X_train.index.to_numpy(), size=INITIAL_LABELLED, replace=False
)
X_init = X_train.loc[random_indices].copy()
y_init = y_train.loc[random_indices].copy()

# Everything not in the seed set forms the pool. The original index labels are
# kept so that pool rows can still be traced back to rows of `data`.
X_pool = X_train.drop(index=random_indices).copy()
y_pool = y_train.drop(index=random_indices).copy()

learner = ActiveLearner(
    estimator=classifier,
    query_strategy=uncertainty_sampling,
    X_training=X_init.values.tolist(),
    y_training=y_init.values.tolist()
)

In [305]:
# Human-in-the-loop active learning: for ten rounds, query one pool instance,
# ask the annotator for its label, teach the learner, and plot the test accuracy.
accuracy_list = []
for _ in range(10):
    # The pool is passed as a plain list, so the returned indices are positions
    # within that list, not index labels of X_pool or of `data`.
    query_idx, query_instance = learner.query(X_pool.values.tolist())

    # Translate positions into X_pool's index labels. The pool is never
    # re-indexed, so these labels still identify the matching rows of `data`.
    query_labels = X_pool.index[np.atleast_1d(query_idx)]
    current_name = data.loc[query_labels, 'Name'].values[0]

    # Re-prompt until the answer is a valid class; any other integer would be
    # taught to the classifier as an extra class.
    prompt = f"Current name: {current_name}. Male - 1, Female - 0: "
    answer = input(prompt).strip()
    while answer not in ('0', '1'):
        answer = input(prompt).strip()
    label = int(answer)

    learner.teach(np.array(query_instance).reshape(1, -1), np.array([label]))

    # Remove the instance that was just labelled, by label, keeping the index intact.
    X_pool = X_pool.drop(index=query_labels)
    y_pool = y_pool.drop(index=query_labels)
    accuracy_list.append(learner.score(X_test.tolist(), y_test))

    # Redraw the accuracy curve after every round.
    fig = make_subplots(rows=1, cols=1)
    iterations = list(range(1, len(accuracy_list) + 1))

    fig.add_trace(go.Scatter(x=iterations, y=accuracy_list, mode='lines+markers', name='Accuracy'))

    fig.update_layout(title='Accuracy Over Iterations',
                      xaxis_title='Iteration',
                      yaxis_title='Accuracy',
                      showlegend=True)

    fig.show()

accuracy = learner.score(X_test.tolist(), y_test)
print(f"Final accuracy on the test set: {accuracy:.2f}")

Final accuracy on the test set: 0.59
