In [51]:
import pandas as pd

In [52]:
df_balanced = pd.read_csv('balanced_df.csv')

In [53]:
df_balanced['timestamp'] = pd.to_datetime(df_balanced['timestamp'])

In [54]:
df_balanced = df_balanced.sort_values(by='timestamp')

In [55]:
X = df_balanced.drop(['timestamp','Unnamed: 0','target'], axis=1)
y = df_balanced['target']

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [57]:
from sklearn.compose import ColumnTransformer
from category_encoders.one_hot import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
text_column = 'text'
one_hot_colum = 'topic'
categorical_preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), one_hot_colum),
        ('tfidf', TfidfVectorizer(), text_column)
    ],
    remainder='passthrough'
)

In [59]:
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

In [60]:
# Create a pipeline that first applies the categorical preprocessor and then the scaler
pipe = Pipeline(steps=[
    ('categorical', categorical_preprocessor),
    ('model', CatBoostClassifier(eta=0.01,max_depth=8,n_estimators=200))
])

In [61]:
pipe.fit(X,y)

0:	learn: 0.6929324	total: 7.76s	remaining: 25m 45s
1:	learn: 0.6927059	total: 18.4s	remaining: 30m 21s
2:	learn: 0.6925118	total: 25.1s	remaining: 27m 28s
3:	learn: 0.6922733	total: 34.6s	remaining: 28m 17s
4:	learn: 0.6920321	total: 42s	remaining: 27m 19s
5:	learn: 0.6918769	total: 49.6s	remaining: 26m 43s
6:	learn: 0.6916710	total: 57.7s	remaining: 26m 31s
7:	learn: 0.6914653	total: 1m 5s	remaining: 26m 10s
8:	learn: 0.6912686	total: 1m 12s	remaining: 25m 29s
9:	learn: 0.6911185	total: 1m 17s	remaining: 24m 25s
10:	learn: 0.6909036	total: 1m 23s	remaining: 23m 52s
11:	learn: 0.6907484	total: 1m 32s	remaining: 24m 16s
12:	learn: 0.6905491	total: 1m 39s	remaining: 23m 45s
13:	learn: 0.6903852	total: 1m 45s	remaining: 23m 19s
14:	learn: 0.6902019	total: 1m 51s	remaining: 22m 53s
15:	learn: 0.6900392	total: 2m 4s	remaining: 23m 48s
16:	learn: 0.6898805	total: 2m 13s	remaining: 23m 56s
17:	learn: 0.6897234	total: 2m 19s	remaining: 23m 34s
18:	learn: 0.6895610	total: 2m 25s	remaining: 23m

In [62]:
preds = pipe.predict(X_test)

In [63]:
import pickle

In [64]:
with open('model_pipline.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [65]:
with open('model_pipline.pkl', 'rb') as f:
    model = pickle.load(f)

In [66]:
model

In [67]:
from sklearn.metrics import classification_report, accuracy_score

In [68]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.60      0.48      0.53    106150
           1       0.57      0.67      0.61    105755

    accuracy                           0.58    211905
   macro avg       0.58      0.58      0.57    211905
weighted avg       0.58      0.58      0.57    211905



In [70]:
print(accuracy_score(y_test, preds))

0.5786885632712773
