In [31]:
import copy

import pandas
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
filename = "result.csv"
df = pandas.read_csv(filename, sep=";")

In [6]:
__target_columns: list[str] = [
    "stress_level",
    "sleep_well",
    "average_sleep_time",
]

In [7]:
target_df = df[__target_columns]

In [8]:
filtered_df = target_df
filtered_df.loc[df['sleep_well'] == 'Да', 'sleep_well'] = 1
filtered_df.loc[filtered_df['sleep_well'] == 'Нет', 'sleep_well'] = 0

In [192]:
filtered_df = filtered_df.astype(float)
filtered_df

Unnamed: 0,stress_level,sleep_well,average_sleep_time
0,55.0,1.0,8.0
1,70.0,1.0,8.0
2,70.0,1.0,7.0
3,60.0,1.0,6.0
4,90.0,0.0,6.0
5,1.0,1.0,8.0
6,70.0,1.0,6.0
7,45.0,1.0,7.0
8,70.0,0.0,8.0
9,100.0,0.0,6.0


In [10]:
filtered_df.all

<bound method DataFrame.all of     stress_level  sleep_well  average_sleep_time
0           55.0         1.0                 8.0
1           70.0         1.0                 8.0
2           70.0         1.0                 7.0
3           60.0         1.0                 6.0
4           90.0         0.0                 6.0
5            1.0         1.0                 8.0
6           70.0         1.0                 6.0
7           45.0         1.0                 7.0
8           70.0         0.0                 8.0
9          100.0         0.0                 6.0
10          70.0         1.0                 8.0
11          10.0         1.0                 7.0
12          40.0         1.0                 9.0
13          70.0         0.0                 7.0
14          90.0         1.0                 8.0
15          66.0         1.0                 7.0
16          99.0         0.0                 5.0
17          70.0         0.0                 7.0
18          10.0         1.0          

In [11]:
scaler = preprocessing.Normalizer()
scaled_x = scaler.fit_transform(filtered_df.values)
scaled_df = pandas.DataFrame(scaled_x)
scaled_df

Unnamed: 0,0,1,2
0,0.989426,0.01799,0.143917
1,0.993433,0.014192,0.113535
2,0.994937,0.014213,0.099494
3,0.9949,0.016582,0.09949
4,0.997785,0.0,0.066519
5,0.123091,0.123091,0.984732
6,0.996246,0.014232,0.085392
7,0.987878,0.021953,0.15367
8,0.993533,0.0,0.113547
9,0.998205,0.0,0.059892


In [41]:
user_vector = [0.4, 0.2, 0.7]

In [13]:
diffs = cosine_similarity(scaled_x, [user_vector])

In [18]:
result_df = pandas.DataFrame(diffs, columns=['value'])

In [40]:
result_df

Unnamed: 0,value
0,0.602062
1,0.577474
2,0.566371
3,0.56692
4,0.536533
5,0.918745
6,0.555123
7,0.61049
8,0.574115
9,0.53115


In [48]:
joined_result_df = copy.copy(df[["morning_drink"]])
joined_result_df[['similarity_rate']] = result_df

In [49]:
sorted_list = joined_result_df.sort_values(by="similarity_rate", ascending=False)
sorted_list

Unnamed: 0,morning_drink,similarity_rate
5,Чай,0.918745
18,Чай,0.918459
11,Чай,0.894481
29,Кофе,0.770421
30,Чай,0.70039
25,Кофе,0.667897
31,Чай,0.66366
12,Чай,0.660458
19,Чай,0.660433
28,Чай,0.643169


In [50]:
head_neighbours = sorted_list.head(5)
head_neighbours

Unnamed: 0,morning_drink,similarity_rate
5,Чай,0.918745
18,Чай,0.918459
11,Чай,0.894481
29,Кофе,0.770421
30,Чай,0.70039


In [174]:
target_values = head_neighbours['morning_drink'].values
target_values

array(['Чай', 'Чай', 'Чай', 'Кофе', 'Чай'], dtype=object)

In [61]:
m = {}
for val in target_values:
    m[val] = m.get(val, 0) + 1

m

{'Чай': 4, 'Кофе': 1}

In [64]:
predicted_answer = max(m, key=m.get)
predicted_answer

'Чай'

In [66]:
df

Unnamed: 0,gender,morning_drink,age,healthy_life_style,smoking,left_eye_color,stress_level,sleep_well,chronotype,wake_up_time,average_sleep_time,coffee_nearby,gourmet,office_worker,homebody,chronic_diseases?,write_hand,zodiac_sign
0,Женщина,Кофе,22,65,Да,Серо-зеленый,55,Да,Жаворонок,8:00,8.0,Да,Да,Да,Нет,Нет,Правой,Козерог
1,Мужчина,Чай,21,85,Нет,Зеленый,70,Да,Жаворонок,6:00,8.0,Да,Нет,Нет,Нет,Нет,Левой,Стрелец
2,Мужчина,Кофе,22,50,Да,Голубой,70,Да,Сова,9:00,7.0,Да,Да,Нет,Да,Нет,Правой,Рак
3,Мужчина,Кофе,22,80,Да,Карий,60,Да,Сова,9:00,6.0,Да,Да,Нет,Нет,Да,Правой,Скорпион
4,Женщина,Кофе,23,50,Нет,Голубой,90,Нет,Сова,14:00,6.0,Нет,Нет,Нет,Да,Да,Левой,Скорпион
5,Мужчина,Чай,21,99,Нет,Зелёный,1,Да,Жаворонок,6:30,8.0,Нет,Да,Нет,Нет,Нет,Правой,Овен
6,Мужчина,Чай,22,50,Нет,Голубой,70,Да,Сова,10:00,6.0,Нет,Нет,Нет,Нет,Нет,Правой,Рыбы
7,Мужчина,Чай,22,80,Нет,Голубой,45,Да,Сова,7:00,7.0,Да,Нет,Да,Нет,Да,Правой,Водолей
8,Мужчина,Кофе,22,50,Нет,Коричневый,70,Нет,Сова,0:00,8.0,Да,Да,Нет,Да,Нет,Левой,Дева
9,Мужчина,Чай,21,70,Да,Зеленый,100,Нет,Сова,0:30,6.0,Нет,Нет,Нет,Да,Нет,Правой,Водолей


In [81]:
tf = copy.copy(df)
tf

Unnamed: 0,gender,morning_drink,age,healthy_life_style,smoking,left_eye_color,stress_level,sleep_well,chronotype,wake_up_time,average_sleep_time,coffee_nearby,gourmet,office_worker,homebody,chronic_diseases?,write_hand,zodiac_sign
0,Женщина,Кофе,22,65,Да,Серо-зеленый,55,Да,Жаворонок,8:00,8.0,Да,Да,Да,Нет,Нет,Правой,Козерог
1,Мужчина,Чай,21,85,Нет,Зеленый,70,Да,Жаворонок,6:00,8.0,Да,Нет,Нет,Нет,Нет,Левой,Стрелец
2,Мужчина,Кофе,22,50,Да,Голубой,70,Да,Сова,9:00,7.0,Да,Да,Нет,Да,Нет,Правой,Рак
3,Мужчина,Кофе,22,80,Да,Карий,60,Да,Сова,9:00,6.0,Да,Да,Нет,Нет,Да,Правой,Скорпион
4,Женщина,Кофе,23,50,Нет,Голубой,90,Нет,Сова,14:00,6.0,Нет,Нет,Нет,Да,Да,Левой,Скорпион
5,Мужчина,Чай,21,99,Нет,Зелёный,1,Да,Жаворонок,6:30,8.0,Нет,Да,Нет,Нет,Нет,Правой,Овен
6,Мужчина,Чай,22,50,Нет,Голубой,70,Да,Сова,10:00,6.0,Нет,Нет,Нет,Нет,Нет,Правой,Рыбы
7,Мужчина,Чай,22,80,Нет,Голубой,45,Да,Сова,7:00,7.0,Да,Нет,Да,Нет,Да,Правой,Водолей
8,Мужчина,Кофе,22,50,Нет,Коричневый,70,Нет,Сова,0:00,8.0,Да,Да,Нет,Да,Нет,Левой,Дева
9,Мужчина,Чай,21,70,Да,Зеленый,100,Нет,Сова,0:30,6.0,Нет,Нет,Нет,Да,Нет,Правой,Водолей


In [143]:
joined_result_df = pandas.DataFrame(
    {
        "morning_drink": df["morning_drink"].values.tolist(),
        "similarity_rate": diffs.tolist()
    }
)
joined_result_df

Unnamed: 0,morning_drink,similarity_rate
0,Кофе,[0.6020617142550093]
1,Чай,[0.5774741236888619]
2,Кофе,[0.5663707925810778]
3,Кофе,[0.5669204773766123]
4,Кофе,[0.5365325044978276]
5,Чай,[0.9187453972470666]
6,Чай,[0.5551225906151427]
7,Чай,[0.6104897603660412]
8,Кофе,[0.5741149361583625]
9,Чай,[0.5311502590907982]


In [171]:
head_neighbours.value_counts()

morning_drink  similarity_rate
Кофе           0.770421           1
Чай            0.700390           1
               0.894481           1
               0.918459           1
               0.918745           1
Name: count, dtype: int64

In [168]:
head_neighbours

Unnamed: 0,morning_drink,similarity_rate
5,Чай,0.918745
18,Чай,0.918459
11,Чай,0.894481
29,Кофе,0.770421
30,Чай,0.70039


In [177]:
from collections import Counter

In [180]:
c = Counter(target_values)


Counter({'Чай': 4, 'Кофе': 1})

In [191]:
dict(c)

{'Чай': 4, 'Кофе': 1}