In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the config file that holds the data paths.
# NOTE(review): full_load uses yaml.FullLoader; if the config only contains
# plain scalars/mappings, yaml.safe_load would be the more conservative choice.

CONFIG_PATH = "config.yaml"
with open(CONFIG_PATH, mode="r", encoding="utf-8") as config_file:
    CONFIG = yaml.full_load(config_file)

In [3]:
# Load the required datasets.

datasets_dir = CONFIG['datasets_folder']
data_dir = CONFIG['data_folder']

df = pd.read_csv(datasets_dir + '/data.csv')
post_df = pd.read_csv(data_dir + '/post_data.csv')
user_df = pd.read_csv(data_dir + '/user_data.csv')

In [4]:
# Preview the first rows of the dataset loaded from data.csv.
df.head()

Unnamed: 0,timestamp,user_id,post_id,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-12-29 15:24:59,200,1773,0,1,34,Russia,Degtyarsk,3,Android,ads,Hearts 2-1 Livingston\n\nHearts wrapped up the...,sport
1,2021-12-15 17:47:13,271,1773,0,0,36,Turkey,Gaziantep,2,Android,ads,Hearts 2-1 Livingston\n\nHearts wrapped up the...,sport
2,2021-12-25 15:41:30,279,1773,0,0,30,Russia,Vladimir,3,Android,ads,Hearts 2-1 Livingston\n\nHearts wrapped up the...,sport
3,2021-12-28 20:24:32,324,1773,0,0,35,Russia,Neman,1,Android,ads,Hearts 2-1 Livingston\n\nHearts wrapped up the...,sport
4,2021-12-25 14:11:12,363,1773,0,0,18,Russia,Belgorod,2,Android,ads,Hearts 2-1 Livingston\n\nHearts wrapped up the...,sport


##### Обработаем признаки

In [5]:
# Time features: from the timestamp of the action extract the month,
# the hour, the day of month and the weekday as integer columns.

df["timestamp"] = pd.to_datetime(df["timestamp"])

timestamp_parts = df["timestamp"].dt
for part_name in ("month", "hour", "day", "weekday"):
    df[part_name] = getattr(timestamp_parts, part_name).astype(int)

In [6]:
# Convert some categorical features to a binary (0/1) encoding.
# The same mapping is applied to both the interactions frame and the user frame.

binary_codes = {
    'os': {'Android': 0, 'iOS': 1},
    'source': {'ads': 0, 'organic': 1},
}

for frame in (df, user_df):
    for column, codes in binary_codes.items():
        frame[column] = frame[column].map(codes)

In [7]:
# Drop the post text column: text processing is done via post_data.csv instead.

df = df.drop(columns=['text'])

##### Выделим текстовые признаки

In [8]:
# Preview the first rows of the posts table loaded from post_data.csv.
post_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [9]:
# Replace newline characters in the post text with spaces.
# regex=False makes the literal match explicit, so the result does not depend
# on the pandas-version-specific default of the `regex` argument.

post_df['text'] = post_df['text'].str.replace('\n', ' ', regex=False)

In [10]:
# Compute the TF-IDF matrix of the post corpus (English stop words removed,
# n-grams of length 1 to 3); features are derived from it in the next cells.

tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
)
tfidf_df_ = tfidf.fit_transform(post_df['text'])

In [11]:
# Summarise each row of the TF-IDF matrix with three scalars: sum, mean and max.
tfidf_df = [
    [doc_row.sum(), doc_row.mean(), doc_row.max()]
    for doc_row in tfidf_df_
]

In [12]:
# Turn the per-row summaries into a DataFrame indexed by post_id, then move
# that index into a regular column with reset_index().
tfidf_df = pd.DataFrame(
    tfidf_df,
    index=post_df['post_id'],
    columns=['tfidf_sum', 'tfidf_mean', 'tfidf_max'],
)
tfidf_df = tfidf_df.reset_index()

In [13]:
# Preview the first rows of the TF-IDF summary features.
tfidf_df.head()

Unnamed: 0,post_id,tfidf_sum,tfidf_mean,tfidf_max
0,1,20.580054,1.5e-05,0.249396
1,2,25.240271,1.9e-05,0.146967
2,3,28.351763,2.1e-05,0.143002
3,4,16.025785,1.2e-05,0.287929
4,5,14.818161,1.1e-05,0.231884


In [14]:
# Build the final dataset the model will be trained on: attach the TF-IDF
# features to every row by post_id, order rows by timestamp, and save it
# together with the TF-IDF features.

df = df.merge(tfidf_df, on='post_id')
df = df.sort_values(by='timestamp')

df.to_csv(CONFIG['datasets_folder'] + '/processed_df.csv', index=False)
tfidf_df.to_csv(CONFIG['data_folder'] + '/tfidf_df.csv', index=False)

# The frames loaded at the top of the notebook are named post_df / user_df;
# the previous post_data / user_data names were never defined (NameError).
# NOTE(review): these two writes overwrite the very files this notebook reads
# as input (post_data.csv, user_data.csv) with the processed versions. A later
# Restart & Run All would then apply the os/source mapping to already-encoded
# values - consider writing to separate output files.
post_df.to_csv(CONFIG['data_folder'] + '/post_data.csv', index=False)
user_df.to_csv(CONFIG['data_folder'] + '/user_data.csv', index=False)