In [None]:
import pandas as pd
import numpy as np

# Read the Oxford vocabulary table and discard the 'link' column in one pass.
df = pd.read_csv("oxford_vocabulary.csv").drop(columns=['link'])  # Path to dataset
df.head()


Unnamed: 0,word,topic,subtopic,subsubtopic,CEFR_level,word_class
0,aardvark,Animals,Animals,wild_mammals,c2,noun
1,adder,Animals,Animals,amphibians_and_reptiles,c2,noun
2,Afghan hound,Animals,Animals,dogs,c2,noun
3,alligator,Animals,Animals,amphibians_and_reptiles,c1,noun
4,alpaca,Animals,Animals,farm_animals,c2,noun


In [None]:
# Ordinal rank of every CEFR band, from 'a1' (1) up to 'c2' (6).
CEFR_LEVELS = {
    band: rank
    for rank, band in enumerate(('a1', 'a2', 'b1', 'b2', 'c1', 'c2'), start=1)
}

In [None]:
class VocabularyRecommender:
    """Recommend unseen words that are thematically close to words already known.

    Candidates are scored by how many of their topic / subtopic / subsubtopic
    labels also occur among the known words, plus a bonus for being close to
    the highest CEFR level found among the known words.
    """

    # Columns returned to the caller for every recommendation.
    _OUTPUT_COLUMNS = ['word', 'topic', 'subtopic', 'CEFR_level']

    def __init__(self, dataset):
        """Store a private copy of ``dataset`` with a numeric ``cefr_num`` column.

        Args:
            dataset: DataFrame with at least the columns ``word``, ``topic``,
                ``subtopic``, ``subsubtopic`` and ``CEFR_level``.
        """
        # Copy first so the caller's frame does not silently gain a column.
        self.dataset = dataset.copy()
        self.dataset['cefr_num'] = self.dataset['CEFR_level'].str.lower().map(CEFR_LEVELS)

    def get_recommendations(self, known_words, num_recommendations=10, max_per_subtopic=2):
        """Return up to ``num_recommendations`` words the user does not know yet.

        Args:
            known_words: List-like of words the user already knows.
            num_recommendations: Maximum number of rows to return.
            max_per_subtopic: Preferred cap on rows sharing one subtopic. The
                cap is relaxed only when it would leave the result short.

        Returns:
            DataFrame with the columns ``word``, ``topic``, ``subtopic`` and
            ``CEFR_level``, indexed by the original dataset labels. If none of
            ``known_words`` occurs in the dataset, a random sample is returned.
        """
        columns = self._OUTPUT_COLUMNS
        # Rows without a word can be neither known nor recommended.
        vocabulary = self.dataset.dropna(subset=['word'])

        if num_recommendations <= 0:
            return vocabulary.iloc[0:0][columns]

        is_known = vocabulary['word'].isin(known_words)
        known_data = vocabulary[is_known]

        if known_data.empty:
            # Cold start: nothing to personalise on, so fall back to a random sample.
            sample_size = min(num_recommendations, len(vocabulary))
            return vocabulary.sample(sample_size)[columns]

        max_level = known_data['cefr_num'].max()
        known_topics = known_data['topic'].dropna().unique()
        known_subtopics = known_data['subtopic'].dropna().unique()
        known_subsubtopics = known_data['subsubtopic'].dropna().unique()

        candidates = vocabulary[~is_known].copy()

        # Topical relevance: a finer-grained match is worth more (1 / 2 / 3 points).
        candidates['relevance'] = (
            candidates['topic'].isin(known_topics).astype(int)
            + 2 * candidates['subtopic'].isin(known_subtopics)
            + 3 * candidates['subsubtopic'].isin(known_subsubtopics)
        )

        # Level bonus: 3 points at the user's top level, fading to 0 at three or more levels away.
        candidates['level_diff'] = (candidates['cefr_num'] - max_level).abs()
        candidates['relevance'] += 3 - candidates['level_diff'].clip(0, 3)

        # Most relevant first, easier levels first among ties. A word can appear
        # in several rows (one per topic); keep only its best-ranked row.
        ranked = (
            candidates
            .sort_values(['relevance', 'cefr_num'], ascending=[False, True])
            .drop_duplicates(subset='word')
        )

        # cumcount() numbers each row within its subtopic in ranked order, so
        # "< max_per_subtopic" keeps the best rows of every subtopic.
        within_cap = ranked.groupby('subtopic', dropna=False).cumcount() < max_per_subtopic
        picked = ranked[within_cap].head(num_recommendations)

        # If the cap left the result short, top up with the best rows that exceeded it.
        shortfall = num_recommendations - len(picked)
        if shortfall > 0:
            picked = pd.concat([picked, ranked[~within_cap].head(shortfall)])

        return picked[columns]

In [None]:
# Build the recommender on top of the loaded vocabulary table.
rec = VocabularyRecommender(dataset=df)

In [None]:
# Ask for five suggestions for a learner who knows three food-related words.
rec.get_recommendations(
    known_words=['apple', 'bakery', 'pie'],
    num_recommendations=5,
)

Unnamed: 0,word,topic,subtopic,CEFR_level
6708,cherry,Food and drink,Food,b2
6714,chicken rice,Food and drink,Food,b2
12395,antique,Leisure,Shopping,b2
12404,bakeshop,Leisure,Shopping,b2
5888,afternoon tea,Food and drink,Cooking and eating,b2


In [None]:
# Sample learner vocabulary used to walk through the recommendation steps by hand.
knowledge = ['apple', 'pie', 'cook']
# Lower-case the labels before mapping, as VocabularyRecommender.__init__ does,
# so differently-cased level labels do not end up as NaN.
df['cefr_num'] = df['CEFR_level'].str.lower().map(CEFR_LEVELS)
# Every dataset row whose word the learner already knows (a word may occur in several rows).
known = df[df['word'].isin(knowledge)]
known

Unnamed: 0,word,topic,subtopic,subsubtopic,CEFR_level,word_class,cefr_num
5973,cook,Food and drink,Cooking and eating,preparing_food,a1,verb,1
5974,cook,Food and drink,Cooking and eating,restaurant_people,a2,noun,2
6570,apple,Food and drink,Food,fruit_and_nuts,a1,noun,1
7111,pie,Food and drink,Food,savoury_dishes,a2,noun,2
28719,cook,Work and business,Jobs,jobs_and_professions,a2,noun,2


In [None]:
# Spot-check a few rows at numeric level 2; the fixed seed keeps the output
# identical across "Restart & Run All".
df[df['cefr_num'] == 2].sample(5, random_state=0)

Unnamed: 0,word,topic,subtopic,subsubtopic,CEFR_level,word_class,cefr_num
21820,upload,Science and technology,Computers,using_a_computer,a2,verb,2
24941,climate change,The natural world,The environment,climate_change,a2,noun,2
18447,background,Politics and society,People in society,race,a2,noun,2
3984,scene,Culture,Art,describing_art,a2,noun,2
1482,smile,Appearance,Appearance,facial_expressions,a2,verb,2


In [None]:
# Highest numeric CEFR level among the known rows, and every level label up to one step above it.
max_level = known['cefr_num'].max()
user_levels = [label for label, rank in CEFR_LEVELS.items() if rank <= max_level + 1]

# Topic labels seen among the known rows, most frequent first, at each granularity.
top_themes, top_subthemes, top_subsubthemes = (
    list(known[column].value_counts().index)
    for column in ('topic', 'subtopic', 'subsubtopic')
)
top_subsubthemes

['preparing_food',
 'restaurant_people',
 'fruit_and_nuts',
 'savoury_dishes',
 'jobs_and_professions']

In [None]:
# Candidate pool: rows whose word the learner does not know yet.
# Filter on the 'word' column. Passing a DataFrame to df.isin() produces an
# element-wise mask, which blanks the known rows to NaN instead of removing them.
# .copy() makes new_words an independent frame that is safe to add columns to.
new_words = df[~df['word'].isin(known['word'])].dropna(subset=['word']).copy()
new_words['relevance'] = 0

In [None]:
# Topical relevance, computed from scratch so re-running this cell cannot
# double-count: 1 point for a shared topic, 2 for a shared subtopic and
# 3 for a shared subsubtopic.
new_words['relevance'] = (
    new_words['topic'].isin(top_themes).astype(int)
    + 2 * new_words['subtopic'].isin(top_subthemes)
    + 3 * new_words['subsubtopic'].isin(top_subsubthemes)
)
# Distance, in CEFR steps, from the learner's highest known level.
new_words['level_diff'] = abs(new_words['cefr_num'] - max_level)

In [None]:
# Level bonus: 3 points at the learner's top level, fading to 0 at three or more steps away.
new_words['relevance'] = new_words['relevance'] + (
    3 - new_words['level_diff'].clip(lower=0, upper=3)
)

In [None]:
# Show the scored candidate table with its relevance and level_diff columns.
new_words

Unnamed: 0,word,topic,subtopic,subsubtopic,CEFR_level,word_class,cefr_num,relevance,level_diff
0,aardvark,Animals,Animals,wild_mammals,c2,noun,6.0,0.0,4.0
1,adder,Animals,Animals,amphibians_and_reptiles,c2,noun,6.0,0.0,4.0
2,Afghan hound,Animals,Animals,dogs,c2,noun,6.0,0.0,4.0
3,alligator,Animals,Animals,amphibians_and_reptiles,c1,noun,5.0,0.0,3.0
4,alpaca,Animals,Animals,farm_animals,c2,noun,6.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...
29921,workforce,Work and business,Working life,office_life,b2,noun,4.0,2.0,2.0
29922,work from home,Work and business,Working life,describing_work,b1,idiom,3.0,3.0,1.0
29923,workload,Work and business,Working life,describing_work,c1,noun,5.0,1.0,3.0
29924,workplace,Work and business,Working life,office_life,b2,noun,4.0,2.0,2.0


In [None]:
# Rank candidates: most relevant first, easier CEFR levels first among ties.
sorted_words = new_words.sort_values(['relevance', 'cefr_num'], ascending=[False, True])
display(sorted_words.head(4))

# Diversity filter: take at most `max_per_subtopic` rows from any one subtopic.
max_per_subtopic = 3
num_recommendations = 10

# cumcount() numbers each row within its subtopic in ranked order, so
# "< max_per_subtopic" keeps the best-ranked rows of every subtopic.
within_cap = sorted_words.groupby('subtopic', dropna=False).cumcount() < max_per_subtopic
selected = sorted_words[within_cap].head(num_recommendations)

# Keep the picks as a list of row Series (the name of each Series is its index label).
final_recommendations = [row for _, row in selected.iterrows()]
# Number of picks taken from each subtopic.
subtopic_counts = selected['subtopic'].value_counts(dropna=False).to_dict()

final_recommendations

      word           topic            subtopic        subsubtopic CEFR_level  \
5925  boil  Food and drink  Cooking and eating     preparing_food         a2   
5956  chef  Food and drink  Cooking and eating  restaurant_people         a2   
6091  heat  Food and drink  Cooking and eating     preparing_food         a2   
6111  kilo  Food and drink  Cooking and eating     preparing_food         a2   

     word_class  cefr_num  relevance  level_diff  
5925       verb       2.0        9.0         0.0  
5956       noun       2.0        9.0         0.0  
6091       verb       2.0        9.0         0.0  
6111       noun       2.0        9.0         0.0  


[word                         boil
 topic              Food and drink
 subtopic       Cooking and eating
 subsubtopic        preparing_food
 CEFR_level                     a2
 word_class                   verb
 cefr_num                      2.0
 relevance                     9.0
 level_diff                    0.0
 Name: 5925, dtype: object,
 word                         chef
 topic              Food and drink
 subtopic       Cooking and eating
 subsubtopic     restaurant_people
 CEFR_level                     a2
 word_class                   noun
 cefr_num                      2.0
 relevance                     9.0
 level_diff                    0.0
 Name: 5956, dtype: object,
 word                         heat
 topic              Food and drink
 subtopic       Cooking and eating
 subsubtopic        preparing_food
 CEFR_level                     a2
 word_class                   verb
 cefr_num                      2.0
 relevance                     9.0
 level_diff                    0.0

In [None]:
# If the per-subtopic cap left the list short, top it up with the best-ranked rows not picked yet.
if len(final_recommendations) < num_recommendations:
    remaining = num_recommendations - len(final_recommendations)
    picked_labels = [r.name for r in final_recommendations]
    extra = sorted_words[~sorted_words.index.isin(picked_labels)]
    # Append row Series rather than plain dicts, so every list entry has the
    # same type and keeps its original index label.
    final_recommendations.extend(row for _, row in extra.head(remaining).iterrows())

pd.DataFrame(final_recommendations)[['word', 'topic', 'subtopic', 'CEFR_level']]

Unnamed: 0,word,topic,subtopic,CEFR_level
5925,boil,Food and drink,Cooking and eating,a2
5956,chef,Food and drink,Cooking and eating,a2
6091,heat,Food and drink,Cooking and eating,a2
6847,fish and chips,Food and drink,Food,a2
6931,hot dog,Food and drink,Food,a2
6974,lemon,Food and drink,Food,a2
28628,architect,Work and business,Jobs,a2
28666,builder,Work and business,Jobs,a2
28668,businessman,Work and business,Jobs,a2
6409,bar,Food and drink,Drinks,a2
