In [19]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, precision_score, f1_score
from sklearn.model_selection import train_test_split

# Load the preprocessed corpus; rows are re-indexed by the 'ID' column.
data = pd.read_excel('preprocessed_data_2018.xlsx', index_col=0).set_index('ID')

# Bag-of-words over unigrams and bigrams, keeping only terms that occur in
# at least 5% of the documents. fit_transform learns the vocabulary and
# builds the count matrix in a single pass over the texts.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test splits made in later cells, so held-out documents influence
# which terms survive the min_df filter — confirm this is acceptable.
cvect = CountVectorizer(min_df=0.05, ngram_range=(1, 2))
matrix = cvect.fit_transform(data['text_prep'])

# Persist the fitted vectorizer so the same vocabulary can be reapplied later.
pkl_filename = "cvect.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(cvect, file)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old accessor only on installations that predate it.
if hasattr(cvect, 'get_feature_names_out'):
    feature_names = cvect.get_feature_names_out()
else:
    feature_names = cvect.get_feature_names()

# Dense term-document frame: one row per document ID, one column per term.
td_matrix = pd.DataFrame(matrix.toarray(), index=data.index, columns=feature_names)
td_matrix.head()

Unnamed: 0_level_0,абонентский,абонентский номер,август,автомашина,автомобиль,агрессивный,агрессия,адвокат,адвокатский,адекватно,...,являться причина,являться родной,явный,ягодица,язык,якобы,январь,яремный,яремный вена,ящик
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
82060183bdf7472e85540c4370d3693c,0,0,0,0,0,1,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0
377054bb5594776bb30ff64c6ebcffb8,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
6df614cdfd7d029cde776d52ed7932d2,0,0,1,0,0,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
50593bc68988c7aae8f9f2f2bf243f8a,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3,0,0,0
54e009a196b97005a1b4d828670345e7,0,0,0,0,5,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Attach both label columns to the term-document frame (aligned on its index).
td_matrix['DV'] = data['DV']
td_matrix['IPV'] = data['IPV']

# DV task: every row is used; features are all columns except the two labels.
X_dv = td_matrix.drop(columns=["DV", 'IPV'])
y_dv = td_matrix['DV'].astype(int)

# IPV task: only rows that carry an IPV label.
has_ipv = td_matrix['IPV'].notna()
X_ipv = X_dv.loc[has_ipv]
# Code 2 is folded into 0, leaving a binary target: 0 - relative, 1 - partner
y_ipv = td_matrix.loc[has_ipv, 'IPV'].astype(int).replace({2: 0})

print(y_dv.shape, X_dv.shape, y_ipv.shape, X_ipv.shape)

(1632,) (1632, 5522) (1128,) (1128, 5522)


In [21]:
# Hold out 10% of the DV rows for evaluation; the fixed seed makes the split
# repeatable. train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_dv,
    y_dv,
    test_size=0.1,
    random_state=42,
)

In [22]:
# Fit the DV classifier on the current training split.
# NOTE(review): X_train / y_train are rebound by the IPV split further down;
# re-running this cell after that one would fit on IPV data and overwrite
# gbc_dv.pkl. Run the notebook top to bottom.
gbc = GradientBoostingClassifier(max_depth=3, n_estimators=500, random_state=42)
gbc.fit(X_train, y_train)

# Persist the fitted model.
pkl_filename = "gbc_dv.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(gbc, file)

# Predict once per split and reuse the result for every metric instead of
# calling predict() again for each score/report.
pred_train = gbc.predict(X_train)
pred_test = gbc.predict(X_test)

prec_train = precision_score(y_train, pred_train)
prec_test = precision_score(y_test, pred_test)
print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       457
           1       1.00      1.00      1.00      1011

    accuracy                           1.00      1468
   macro avg       1.00      1.00      1.00      1468
weighted avg       1.00      1.00      1.00      1468

              precision    recall  f1-score   support

           0       0.88      0.62      0.73        47
           1       0.86      0.97      0.91       117

    accuracy                           0.87       164
   macro avg       0.87      0.79      0.82       164
weighted avg       0.87      0.87      0.86       164



In [23]:
# Same 90/10 seeded split, now over the IPV-labelled rows.
# This rebinds X_train / X_test / y_train / y_test used by the DV cells above.
X_train, X_test, y_train, y_test = train_test_split(
    X_ipv,
    y_ipv,
    test_size=0.1,
    random_state=42,
)

In [24]:
# Fit the IPV classifier on the current training split.
# This rebinds `gbc`, which previously held the DV model.
gbc = GradientBoostingClassifier(max_depth=3, n_estimators=50, random_state=42)
gbc.fit(X_train, y_train)

# Persist the fitted model.
pkl_filename = "gbc_ipv.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(gbc, file)

# Predict once per split and reuse the result for every metric instead of
# calling predict() again for each score/report.
pred_train = gbc.predict(X_train)
pred_test = gbc.predict(X_test)

prec_train = precision_score(y_train, pred_train)
prec_test = precision_score(y_test, pred_test)
print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.99      0.78      0.87       203
           1       0.95      1.00      0.97       812

    accuracy                           0.95      1015
   macro avg       0.97      0.89      0.92      1015
weighted avg       0.96      0.95      0.95      1015

              precision    recall  f1-score   support

           0       1.00      0.70      0.82        20
           1       0.94      1.00      0.97        93

    accuracy                           0.95       113
   macro avg       0.97      0.85      0.90       113
weighted avg       0.95      0.95      0.94       113

