In [1]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from nltk.corpus import stopwords

import sklearn
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid, train_test_split, cross_val_score

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('../data/CUAD_data/translated_data.csv', index_col=0)

In [4]:
stop_words = stopwords.words('russian')
stop_words.extend(["это", 'все', 'весь', 'включая', 'отношения', 'цель', "имя", "свой", "время", "любой", "соглашение", 
                   "сторона", "настоящий", "право", "компания", "соответствие", "который", "должный", "мочь", "также"])

In [5]:
filtered_sentence = []

for text in data.processed_text_ru.to_numpy():
    sent = []
    for word in text.split(' ')[0:200]:
        if word not in stop_words:
            sent.append(word)
    filtered_sentence.append(' '.join(sent))

In [6]:
codes, uniques = pd.factorize(data.target)
Y = codes
data['codes'] = codes
uniques

Index(['Distributor', 'Promotion', 'Supply', 'Hosting', 'Joint Venture',
       'Endorsement', 'Consulting', 'Marketing', 'Strategic Alliance',
       'Sponsorship', 'Franchise', 'IP', 'Service', 'Agency', 'Maintenance',
       'License', 'Collaboration', 'Reseller', 'Outsourcing', 'Co_Branding',
       'Affiliate', 'Development', 'Manufacturing', 'Transportation',
       'Non-Compete'],
      dtype='object')

In [7]:
rand_state = 21

In [8]:
pipe = Pipeline(steps=[('vect', CountVectorizer()),
                        ('est', LogisticRegression())])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
                                    filtered_sentence, Y, shuffle=True, stratify=Y, random_state=42, test_size=0.2)

# Подбор модели

In [20]:
param_grid = [
  {'vect': [CountVectorizer(), TfidfVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [LogisticRegression()],
   'est__C': np.logspace(-3, 2, 6),
   'est__class_weight': [None, 'balanced'],
   'est__random_state': [rand_state]},
  {'vect': [CountVectorizer(), TfidfVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [RandomForestClassifier(), XGBClassifier()],
   'est__n_estimators': [100, 250, 500],
   'est__class_weight': [None, 'balanced'],
   'est__max_depth': [3, 5, 7]},
  {'vect': [CountVectorizer()],
   'vect__min_df': [2, 4, 6],
   'vect__max_df': [0.9, 0.95],
   'vect__ngram_range': [(1, 1), (1, 2)],
   'est': [GaussianNB()]}
 ]


In [21]:
skf = StratifiedKFold(n_splits=3,
                      shuffle=True,
                      random_state=rand_state)

In [22]:
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_macro', n_jobs=-1, cv=skf, verbose=1, return_train_score=True)


In [23]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 1164 candidates, totalling 3492 fits


In [24]:
gs.best_estimator_

In [25]:
gs.best_params_

{'est': RandomForestClassifier(class_weight='balanced', max_depth=7, n_estimators=250),
 'est__class_weight': 'balanced',
 'est__max_depth': 7,
 'est__n_estimators': 250,
 'vect': TfidfVectorizer(max_df=0.95, min_df=4, ngram_range=(1, 2)),
 'vect__max_df': 0.95,
 'vect__min_df': 4,
 'vect__ngram_range': (1, 2)}

In [27]:
gs.best_score_

0.7755805025605188

In [28]:
df = pd.DataFrame(gs.cv_results_).sort_values('mean_test_score', ascending=False)

# Подбор параметров модели для снижения переобучения

In [33]:

param_grid_RF = [
  {'vect': [CountVectorizer()],
   'vect__min_df': [1],
   'vect__max_df': [0.85],
   'vect__ngram_range': [(1, 2), (1, 1)],
   'est': [RandomForestClassifier()],
   'est__n_estimators': [400, 500, 600],
   'est__class_weight': ['balanced'],
   'est__max_depth': [3, 4],
   'est__random_state': [rand_state],
   'est__min_samples_leaf': [2, 3],
   }
 ]


In [34]:
gs_RF = GridSearchCV(estimator=pipe, param_grid=param_grid_RF, scoring='f1_macro', n_jobs=-1, cv=skf, verbose=2, return_train_score=True)


In [35]:
gs_RF.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [36]:
df_RF = pd.DataFrame(gs_RF.cv_results_).sort_values('mean_test_score', ascending=False)

In [37]:
overfitting_rate = (df_RF.iloc[0]['mean_train_score'] - df_RF.iloc[0]['mean_test_score']) / df_RF.iloc[0]['mean_train_score']
overfitting_rate

0.24004719139510847

In [38]:
gs_RF.best_score_

0.7323699586065427

In [39]:
gs_RF.best_params_

{'est': RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                        n_estimators=600, random_state=21),
 'est__class_weight': 'balanced',
 'est__max_depth': 4,
 'est__min_samples_leaf': 2,
 'est__n_estimators': 600,
 'est__random_state': 21,
 'vect': CountVectorizer(max_df=0.85, ngram_range=(1, 2)),
 'vect__max_df': 0.85,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2)}

In [40]:
fi = gs_RF.best_estimator_['est'].feature_importances_
f_v = gs_RF.best_estimator_['vect'].get_feature_names_out()
f_v[np.argsort(fi)[::-1]][0:50]

array(['стратегический', 'хостинг', 'аутсорсинг', 'стратегический альянс',
       'интеллектуальный', 'господин', 'спонсорский', 'посредник',
       'поставка', 'интеллектуальный собственность', 'брендинг', 'акция',
       'франшиза', 'торговый посредник', 'торговый', 'партнёрский',
       'сотрудничество', 'услуга консультант', 'собственность', 'альянс',
       'консультант', 'дама господин', 'грузоотправитель', 'привлечение',
       'договор соглашаться', 'транспортный', 'консультант предоставлять',
       'обслуживание', 'номинальный', 'франчайзи', 'альянс заключить',
       'предложение', 'совместный брендинг', 'приложение сотрудничество',
       'франчайзинговый', 'разработка', 'дистрибьютор', 'рекламный',
       'перевозчик', 'партнёр', 'колумбия', 'производственный',
       'приложение интеллектуальный', 'граница', 'кобрендинг',
       'сберегательный', 'иметь офис', 'акция агентский', 'дама',
       'полезный'], dtype=object)

In [41]:
print(metrics.classification_report(y_train, gs_RF.predict(X_train)))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        26
           1       1.00      0.90      0.95        10
           2       0.88      1.00      0.93        14
           3       0.89      1.00      0.94        16
           4       1.00      0.94      0.97        18
           5       1.00      0.95      0.97        19
           6       0.75      1.00      0.86         9
           7       1.00      0.64      0.78        14
           8       0.93      1.00      0.96        26
           9       0.96      1.00      0.98        25
          10       1.00      1.00      1.00        12
          11       0.82      1.00      0.90        14
          12       0.94      0.77      0.85        22
          13       0.91      1.00      0.95        10
          14       0.92      0.89      0.91        27
          15       0.90      1.00      0.95        26
          16       0.95      1.00      0.98        21
          17       0.91    

In [42]:
for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv = np.array(X_train)[train_index]
    y_train_cv = y_train[train_index]
    
    X_test_cv = np.array(X_train)[test_index]
    y_test_cv = y_train[test_index]
    
    model = Pipeline([('vect', CountVectorizer(max_df=0.85, ngram_range=(1, 2))), 
                      ('est', RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                        n_estimators=600, random_state=21))])
    
    model.fit(X_train_cv, y_train_cv)
    y_pred = model.predict(X_test_cv)
    
    print(metrics.classification_report(y_test_cv, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.50      0.25      0.33         4
           2       0.83      1.00      0.91         5
           3       0.50      0.60      0.55         5
           4       1.00      1.00      1.00         6
           5       0.86      0.86      0.86         7
           6       0.50      1.00      0.67         3
           7       1.00      0.40      0.57         5
           8       1.00      1.00      1.00         8
           9       0.88      0.88      0.88         8
          10       0.80      1.00      0.89         4
          11       0.71      1.00      0.83         5
          12       0.50      0.12      0.20         8
          13       0.75      1.00      0.86         3
          14       0.64      0.78      0.70         9
          15       0.88      0.88      0.88         8
          16       0.86      0.86      0.86         7
          17       1.00    

# Обучение и выгрузка итоговой модели

In [44]:

model = Pipeline([('vect', CountVectorizer(max_df=0.85, ngram_range=(1, 2))), 
                  ('est', RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                    n_estimators=600, random_state=21))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.00      0.00      0.00         2
           2       0.67      1.00      0.80         4
           3       0.80      1.00      0.89         4
           4       1.00      0.80      0.89         5
           5       1.00      1.00      1.00         5
           6       0.40      1.00      0.57         2
           7       0.00      0.00      0.00         3
           8       0.86      1.00      0.92         6
           9       0.75      1.00      0.86         6
          10       0.75      1.00      0.86         3
          11       0.60      1.00      0.75         3
          12       0.67      0.33      0.44         6
          13       0.25      0.33      0.29         3
          14       0.83      0.71      0.77         7
          15       0.67      0.29      0.40         7
          16       1.00      0.60      0.75         5
          17       1.00    

In [45]:
model = Pipeline([('vect', CountVectorizer(max_df=0.85, ngram_range=(1, 2))), 
                  ('est', RandomForestClassifier(class_weight='balanced', max_depth=4, min_samples_leaf=2,
                    n_estimators=600, random_state=21))])

model.fit(filtered_sentence, Y)

In [47]:
with open('../models/CUAD_baseline_RF_14_01.pkl', 'wb') as f:
    pickle.dump(model, f)