<a href="https://colab.research.google.com/github/IlyaGenShubin/VSR/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import time
import pandas as pd
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,classification_report

In [5]:
events = pd.read_csv('/content/drive/MyDrive/VSR/train_events.csv')
events.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,video_362960,10245341
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,video_96775,10894333
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,video_161610,10029092
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,video_116245,10452976


In [6]:
targets = pd.read_csv('/content/drive/MyDrive/VSR/train_targets.csv')
targets.head()

Unnamed: 0,viewer_uid,age,sex,age_class
0,10087154,30,male,1
1,10908708,25,female,1
2,10190464,34,male,2
3,10939673,25,male,1
4,10288257,48,male,3


In [7]:
video = pd.read_csv('/content/drive/MyDrive/VSR/video_info_v2.csv')
video.head()

Unnamed: 0,rutube_video_id,title,category,duration,author_id
0,video_185549,Как собрать букет из мыльных тюльпанов - Силик...,Хобби,1559160,1015054
1,video_111035,"Осторожно, Киберземляне!, 1 сезон, 12 серия",Сериалы,1320007,1002180
2,video_476517,ПОПУЛЯРНЫЕ ВИДЕОИГРЫ в LEGO... перевод - TD BR...,Хобби,606145,1095337
3,video_157198,"Хороший лжец (фильм, 2019)",Фильмы,6577440,1043618
4,video_289824,Нашего старого гнобят по-всякому,Развлечения,859493,1009535


In [8]:
# объединение DataFrame
data = pd.merge(events, targets, on='viewer_uid', how='inner')
data = pd.merge(data, video, on='rutube_video_id', how='inner')
data.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,age,sex,age_class,title,category,duration,author_id
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243,20,female,0,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219
1,2024-06-01 19:01:56+03:00,Tver Oblast,tablet,mobile app,Android,Rutube,2325,video_133074,10069705,29,female,1,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219
2,2024-06-01 16:33:40+03:00,Irkutsk Oblast,tablet,mobile app,Android,Rutube,830,video_133074,10050175,29,female,1,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219
3,2024-06-01 21:18:14+03:00,Irkutsk Oblast,smartphone,mobile app,Android,Rutube,2411,video_133074,10337828,22,female,1,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219
4,2024-06-01 01:19:06+03:00,St.-Petersburg,tablet,mobile app,Android,Rutube,2450,video_133074,10034659,26,female,1,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456534,1009219


In [9]:
train_df = data[['title', 'viewer_uid', 'sex', 'age_class']]
train_df.head()

Unnamed: 0,title,viewer_uid,sex,age_class
0,Папа с особенностями. Мужское / Женское. Выпус...,10067243,female,0
1,Папа с особенностями. Мужское / Женское. Выпус...,10069705,female,1
2,Папа с особенностями. Мужское / Женское. Выпус...,10050175,female,1
3,Папа с особенностями. Мужское / Женское. Выпус...,10337828,female,1
4,Папа с особенностями. Мужское / Женское. Выпус...,10034659,female,1


In [10]:
train, test = train_test_split(train_df, train_size=0.8, shuffle=True, random_state=11)

vec = TfidfVectorizer(stop_words = 'english')

train_x = vec.fit_transform(train['title'])
test_x= vec.transform(test['title'])

In [13]:
# Start time
start_time = time.time()

sex = LogisticRegression(C=2, random_state=42, class_weight='balanced', max_iter=100000)
# Training the model
sex.fit(train_x, train['sex'])
pred_sex = sex.predict(train_x)
print('\nConfusion matrix\n', confusion_matrix(train['sex'], pred_sex))
print(classification_report(train['sex'], pred_sex))

# Сохранение модели Пол
dump(sex, 'sex_model.joblib')

# End time
end_time = time.time()

# Calculate and display the execution time
execution_time = end_time - start_time
print(f"Время обучения: {execution_time} секунд.")


Confusion matrix
 [[692940 174509]
 [138843 401400]]
              precision    recall  f1-score   support

      female       0.83      0.80      0.82    867449
        male       0.70      0.74      0.72    540243

    accuracy                           0.78   1407692
   macro avg       0.77      0.77      0.77   1407692
weighted avg       0.78      0.78      0.78   1407692

Время обучения: 154.58004450798035 секунд.


In [14]:
# Creating a logistic regression model
age = LogisticRegression(C=2, random_state=42, class_weight='balanced', max_iter=100000)
# Training the model
age.fit(train_x, train['age_class'])
pred_age = age.predict(train_x)
print('\nConfusion matrix\n', confusion_matrix(train['age_class'], pred_age))
print(classification_report(train['age_class'], pred_age))

# Сохранение модели Возраст
dump(age, 'age_model.joblib')


Confusion matrix
 [[ 31485  13652   4193   3730]
 [142300 229304  85157  66229]
 [107493 165287 133680 132431]
 [ 39627  48809  39876 164439]]
              precision    recall  f1-score   support

           0       0.10      0.59      0.17     53060
           1       0.50      0.44      0.47    522990
           2       0.51      0.25      0.33    538891
           3       0.45      0.56      0.50    292751

    accuracy                           0.40   1407692
   macro avg       0.39      0.46      0.37   1407692
weighted avg       0.48      0.40      0.41   1407692



['age_model.joblib']

In [15]:
# Загрузка моделей
age = load('age_model.joblib')
sex = load('sex_model.joblib')

In [21]:
pred_age = age.predict(test_x)
print('\nConfusion matrix\n', confusion_matrix(test['age_class'], pred_age))
print(classification_report(test['age_class'], pred_age))


Confusion matrix
 [[ 6684  3817  1324  1412]
 [35900 55781 21942 17144]
 [27222 42051 30894 34739]
 [10194 12559 11258 39003]]
              precision    recall  f1-score   support

           0       0.08      0.50      0.14     13237
           1       0.49      0.43      0.46    130767
           2       0.47      0.23      0.31    134906
           3       0.42      0.53      0.47     73014

    accuracy                           0.38    351924
   macro avg       0.37      0.42      0.34    351924
weighted avg       0.45      0.38      0.39    351924



In [18]:
pred_sex = sex.predict(test_x)
print('\nConfusion matrix\n', confusion_matrix(test['sex'], pred_sex))
print(classification_report(test['sex'], pred_sex))


Confusion matrix
 [[172976  44090]
 [ 35124  99734]]
              precision    recall  f1-score   support

      female       0.83      0.80      0.81    217066
        male       0.69      0.74      0.72    134858

    accuracy                           0.77    351924
   macro avg       0.76      0.77      0.76    351924
weighted avg       0.78      0.77      0.78    351924



In [20]:
f1_weighted = f1_score(test['age_class'], pred_age, average='weighted', zero_division = 0)
accuracy = accuracy_score(test['sex'], pred_sex)
final_score = 0.7 * f1_weighted + 0.3 * accuracy
print(f'Weighted F1 = {f1_weighted:.4f} \nAccuracy = {accuracy:.4f} \nFinal Score = {final_score:.4f}')

Weighted F1 = 0.3907 
Accuracy = 0.7749 
Final Score = 0.5060


In [22]:
submission = pd.DataFrame(columns=['viewer_uid', 'sex', 'age_class'])
submission['viewer_uid'] = test['viewer_uid'].values
submission['age_class'] = age.predict(test_x)
submission['sex'] = sex.predict(test_x)
submission.head()

Unnamed: 0,viewer_uid,sex,age_class
0,10133205,male,3
1,10164734,female,0
2,10096444,female,2
3,10020303,female,0
4,10024684,female,0


In [23]:
submission.to_csv('/content/submission.csv', index=False)