In [25]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from nltk.corpus import stopwords

import sklearn
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid, train_test_split, cross_val_score

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

import warnings
warnings.filterwarnings("ignore")

In [1]:
!pip freeze > requirements.txt

In [26]:
data = pd.read_csv('../data/CUAD_data/translated_data.csv', index_col=0)

In [27]:
stop_words = stopwords.words('russian')
stop_words.extend(["это", 'все', 'весь', 'включая', 'отношения', 'цель', "имя", "свой", "время", "любой", "соглашение", 
                   "сторона", "настоящий", "право", "компания", "соответствие", "который", "должный", "мочь", "также"])

In [28]:
filtered_sentence = []

for text in data.processed_text_ru.to_numpy():
    sent = []
    for word in text.split(' ')[0:200]:
        if word not in stop_words:
            sent.append(word)
    filtered_sentence.append(' '.join(sent))

In [29]:
codes, uniques = pd.factorize(data.target)
Y = codes
data['codes'] = codes
uniques

Index(['Distributor', 'Promotion', 'Supply', 'Hosting', 'Joint Venture',
       'Endorsement', 'Consulting', 'Marketing', 'Strategic Alliance',
       'Sponsorship', 'Franchise', 'IP', 'Service', 'Agency', 'Maintenance',
       'License', 'Collaboration', 'Reseller', 'Outsourcing', 'Co_Branding',
       'Affiliate', 'Development', 'Manufacturing', 'Transportation',
       'Non-Compete'],
      dtype='object')

In [30]:
rand_state = 21

In [31]:
pipe = Pipeline(steps=[('vect', CountVectorizer()),
                        ('est', LogisticRegression())])

# Подбор модели

In [32]:
param_grid = [
  {'vect': [CountVectorizer(), TfidfVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [LogisticRegression()],
   'est__C': np.logspace(-3, 2, 6),
   'est__class_weight': [None, 'balanced'],
   'est__random_state': [rand_state]},
  {'vect': [CountVectorizer(), TfidfVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [RandomForestClassifier(), XGBClassifier()],
   'est__n_estimators': [100, 250, 500],
   'est__class_weight': [None, 'balanced'],
   'est__max_depth': [3, 5, 7]},
  {'vect': [CountVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [GaussianNB()]}
 ]


In [33]:
skf = StratifiedKFold(n_splits=3,
                      shuffle=True,
                      random_state=rand_state)

In [34]:
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_macro', n_jobs=-1, cv=skf, verbose=1, return_train_score=True)


In [36]:
gs.fit(filtered_sentence, Y)

In [127]:
gs.best_estimator_

In [210]:
gs.best_params_

{'est': XGBClassifier(base_score=None, booster=None, callbacks=None, class_weight=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=7, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=None, num_parallel_tree=None,
               predictor=None, ...),
 'est__class_weight': None,
 'est__max_depth': 7,
 'est__n_estimators': 100,
 'vect': CountVectorizer(max_df=0.9, min_df=2, ngram_range=(1, 2)),
 'vect__max_df': 0.9,
 'vect__min_df': 2,
 'vect__ngram_range': (1, 2)}

In [129]:
gs.best_score_

0.7779524256522506

In [613]:
df = pd.DataFrame(gs.cv_results_).sort_values('mean_test_score', ascending=False)

# Подбор параметров модели для снижения переобучения

In [37]:

param_grid_RF = [
  {'vect': [CountVectorizer()],
   'vect__min_df': [1],
   'vect__max_df': [0.85],
   'vect__ngram_range': [(1, 2), (1, 1)],
   'est': [RandomForestClassifier()],
   'est__n_estimators': [400, 500, 600],
   'est__class_weight': ['balanced'],
   'est__max_depth': [3, 4],
   'est__random_state': [rand_state],
   'est__min_samples_leaf': [2, 3],
   }
 ]


In [38]:
gs_RF = GridSearchCV(estimator=pipe, param_grid=param_grid_RF, scoring='f1_macro', n_jobs=-1, cv=skf, verbose=2, return_train_score=True)


In [39]:
gs_RF.fit(filtered_sentence, Y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [41]:
df_RF = pd.DataFrame(gs_RF.cv_results_).sort_values('mean_test_score', ascending=False)

In [53]:
overfitting_rate = (df_RF.iloc[0]['mean_train_score'] - df_RF.iloc[0]['mean_test_score']) / df_RF.iloc[0]['mean_train_score']
overfitting_rate

0.21578778999708897

In [46]:
gs_RF.best_score_

0.7292695377539906

In [47]:
gs_RF.best_params_

{'est': RandomForestClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=2,
                        n_estimators=600, random_state=21),
 'est__class_weight': 'balanced',
 'est__max_depth': 3,
 'est__min_samples_leaf': 2,
 'est__n_estimators': 600,
 'est__random_state': 21,
 'vect': CountVectorizer(max_df=0.85, ngram_range=(1, 2)),
 'vect__max_df': 0.85,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2)}

In [49]:
fi = gs_RF.best_estimator_['est'].feature_importances_
f_v = gs_RF.best_estimator_['vect'].get_feature_names_out()
f_v[np.argsort(fi)[::-1]][0:50]

array(['транспортный услуга', 'хостинг', 'сотрудничество',
       'договор поставка', 'альянс', 'собственность', 'торговый',
       'неконкуренция', 'транспортировка', 'дистрибьюторский',
       'торговый посредник', 'спонсорский', 'спонсорский спонсорский',
       'посредник', 'перевозчик', 'аутсорсинг', 'дистрибьютор',
       'интеллектуальный', 'консультант консультант',
       'альянс стратегический', 'стратегический альянс', 'подтверждение',
       'услуга', 'обслуживание', 'спонсор', 'консультант предоставлять',
       'франчайзинговый', 'стратегический', 'приложение совместный',
       'альянс заключить', 'внимание грузоотправитель', 'поправка иметь',
       'агентский', 'приложение спонсорский', 'остров', 'спонсорство',
       'дистрибьюторский заключить', 'консультационный', 'дочерний',
       'коммерческий предприятие', 'партнёрский', 'франшиза',
       'консультант нести', 'консультант соглашаться',
       'поставка заключить', 'договор франшиза', 'кобрендинг',
       'консу

In [51]:
print(metrics.classification_report(Y, gs_RF.predict(filtered_sentence)))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90        32
           1       1.00      0.75      0.86        12
           2       0.95      1.00      0.97        18
           3       0.91      1.00      0.95        20
           4       0.91      0.91      0.91        23
           5       0.85      0.96      0.90        24
           6       0.69      1.00      0.81        11
           7       1.00      0.53      0.69        17
           8       0.97      1.00      0.98        32
           9       0.89      1.00      0.94        31
          10       0.88      1.00      0.94        15
          11       0.81      1.00      0.89        17
          12       0.89      0.57      0.70        28
          13       0.93      1.00      0.96        13
          14       0.88      0.85      0.87        34
          15       0.90      0.82      0.86        33
          16       0.93      0.96      0.94        26
          17       0.92    

In [52]:
for train_index, test_index in skf.split(filtered_sentence, Y):
    X_train = np.array(filtered_sentence)[train_index]
    y_train = Y[train_index]
    
    X_test = np.array(filtered_sentence)[test_index]
    y_test = Y[test_index]
    
    model = Pipeline([('vect', CountVectorizer(max_df=0.85, ngram_range=(1, 2))), 
                      ('est', RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                        n_estimators=600, random_state=21))])
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86        11
           1       1.00      0.50      0.67         4
           2       0.62      0.83      0.71         6
           3       0.75      0.86      0.80         7
           4       0.78      1.00      0.88         7
           5       0.80      1.00      0.89         8
           6       0.80      1.00      0.89         4
           7       1.00      0.17      0.29         6
           8       0.71      1.00      0.83        10
           9       0.85      1.00      0.92        11
          10       0.83      1.00      0.91         5
          11       0.56      1.00      0.71         5
          12       0.50      0.20      0.29        10
          13       0.80      1.00      0.89         4
          14       0.75      0.55      0.63        11
          15       0.89      0.73      0.80        11
          16       0.73      0.89      0.80         9
          17       0.40    

# Обучение и выгрузка итоговой модели

In [16]:
X_train = filtered_sentence
y_train = Y

model = Pipeline([('vect', CountVectorizer(max_df=0.85, ngram_range=(1, 2))), 
                  ('est', RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                    n_estimators=600, random_state=21))])

model.fit(X_train, y_train)
y_pred = model.predict(X_train)

print(metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90        32
           1       0.90      0.75      0.82        12
           2       1.00      1.00      1.00        18
           3       0.91      1.00      0.95        20
           4       0.96      0.96      0.96        23
           5       0.81      0.92      0.86        24
           6       0.73      1.00      0.85        11
           7       1.00      0.71      0.83        17
           8       0.94      1.00      0.97        32
           9       0.91      1.00      0.95        31
          10       0.94      1.00      0.97        15
          11       0.77      1.00      0.87        17
          12       0.90      0.68      0.78        28
          13       0.93      1.00      0.96        13
          14       0.94      0.91      0.93        34
          15       0.93      0.85      0.89        33
          16       1.00      0.96      0.98        26
          17       0.92    

In [54]:
with open('../models/CUAD_baseline_RF_14_12.pkl', 'wb') as f:
    pickle.dump(model, f)

In [553]:
from sklearn.feature_selection import chi2

category_to_id = {}
for idx, i in enumerate(uniques):
    category_to_id[i] = idx
    
    
N = 10
for Product, category_id in sorted(category_to_id.items()):
    with open(f'{Product}.txt', 'w') as f:
        features_chi2 = chi2(X, Y == category_id)
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(vectorizer.get_feature_names_out())[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
        for v in unigrams[-N:]:
            f.write(f'{v}\n')
        for v in bigrams[-N:]:
            f.write(f'{v}\n')
            
        print("\n==> %s:" %(Product))
        print(unigrams[-N:])
print(bigrams[-N:])