In [57]:
import numpy as np
from datasets import load_from_disk
import pandas as pd

In [58]:
import os

# Location of the prepared dataset on disk. Set the PREPARED_DATASET_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original local path so existing runs are unaffected.
DATASET_DIR = os.environ.get(
    "PREPARED_DATASET_DIR",
    "C:/Users/shash/Documents/NII/ssh/prepared-actual",
)

dataset = load_from_disk(DATASET_DIR)
# Make indexing into the splits return pandas objects.
dataset.set_format(type='pandas')

In [59]:
# Pull every split out in full (slice [:]); the dataset format was set to
# 'pandas' when it was loaded, so each result is displayed as a DataFrame.
df_train, df_test, df_valid = (
    dataset[split_name][:] for split_name in ('train', 'test', 'validation')
)

In [60]:
# Preview the test split (text, one 0/1 column per rubric, language code).
df_test

Unnamed: 0,text,sport,economics,society,science,culture,politics,auto,accidents,lang,__index_level_0__
0,"طالب عضو تكتل ""لبنان القوي"" النائب سيمون ابي ر...",0,0,0,0,0,1,0,0,ar,94395
1,فولفو أعلنت في يوليو الماضي سحب سيارات في مختل...,0,0,0,0,0,0,1,0,ar,73171
2,El Departamento de Cultura del Gobierno Vasco ...,0,0,0,0,1,0,0,0,es,332252
3,En el año más inestable que ha vivido el gremi...,0,0,1,0,0,0,0,0,es,307444
4,يشهد اليوم الأحد، 8 مواجهات قوية فى خامس أيام ...,1,0,0,0,0,0,0,0,ar,9768
...,...,...,...,...,...,...,...,...,...,...,...
14655,У майбутньому воротар хотів би спробувати свої...,1,0,0,0,0,0,0,0,uk,700618
14656,قام المهندس ممدوح رسلان، رئيس الشركة القابضة ل...,0,1,0,0,0,0,0,0,ar,65308
14657,أجازت الهيئة الصحية السويسرية للكانتونات خفض م...,0,0,1,0,0,0,0,0,ar,79975
14658,باحثة مصرية تنجح في توسيع مجال التشخيص المعملي...,0,0,1,0,0,0,0,0,ar,74843


In [61]:
# Preview the training split (same columns as the test split).
df_train

Unnamed: 0,text,sport,economics,society,science,culture,politics,auto,accidents,lang,__index_level_0__
0,"Мова йде про ""тред"" з порталу 4chan від 2016 р...",0,0,1,0,0,0,0,0,uk,686186
1,"Kuşadası Belediye Başkanı Ömer Günel, Kuşadası...",0,0,0,0,0,1,0,0,tr,613022
2,"Aile ve Sosyal Hizmetler Bakanı Derya Yanık, ""...",0,0,0,0,0,0,0,1,tr,660655
3,"Футболисты ""Сочи"" разгромили дома ""Ростов"", ко...",1,0,0,0,0,0,0,0,ru,493234
4,Відома українська співачка та модель Даша Аста...,0,0,0,0,1,0,0,0,uk,687559
...,...,...,...,...,...,...,...,...,...,...,...
646496,"Au marché deNoëldeMontpellier(Hérault), imposs...",0,0,0,0,1,0,0,0,fr,433864
646497,Разговора президента России Владимира Путина с...,0,0,0,0,0,1,0,0,ru,557077
646498,Gap is considering closing all of its UK store...,0,0,0,0,1,0,0,0,en,227796
646499,"München (dpa) - Der Spielfilm ""Ich bin dein Me...",0,0,0,0,1,0,0,0,de,163112


In [62]:
from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.feature_extraction.text import CountVectorizer

# Width of the hashed feature space produced for every document.
N_HASH_FEATURES = 10000

# Shared text vectorizer used for all splits below.
vectorizer = HashingVectorizer(n_features=N_HASH_FEATURES)

In [63]:
# Maps the short language code stored in the 'lang' column to the full
# language name that is later prepended to each text.
lang_dict = dict(
    ru='russian',
    en='english',
    fr='french',
    de='german',
    es='spanish',
    uk='ukrainian',
    ar='arabic',
    tr='turkish',
)

def lang_fullname(df_col, lang_dict):
    """Translate language codes into full language names.

    Parameters
    ----------
    df_col : iterable of str
        Language codes, typically a DataFrame column (``pd.Series``).
    lang_dict : dict
        Mapping from language code to full language name.

    Returns
    -------
    pd.DataFrame
        Single-column frame (column label ``0``) holding the mapped names.
        When ``df_col`` is a ``pd.Series`` its index is preserved, so the
        result aligns row-for-row when assigned back to the source frame;
        other iterables get the default 0..n-1 index.

    Raises
    ------
    KeyError
        If a code in ``df_col`` is missing from ``lang_dict``.
    """
    full_names = [lang_dict[code] for code in df_col]

    # Keep the caller's row labels: column assignment aligns on index, so a
    # fresh 0..n-1 index would turn rows with other labels into NaN.
    row_index = df_col.index if isinstance(df_col, pd.Series) else None

    return pd.DataFrame(full_names, index=row_index)
    
    

# Replace the language code with the full language name and prepend it to
# the text ("<language> : <text>") in every split. The test split is
# included so that all three splits are vectorized from the same text format.
# NOTE: this mutates the frames in place; running the cell a second time
# raises KeyError because 'lang' no longer holds the short codes.
for split_frame in (df_train, df_valid, df_test):
    split_frame['lang'] = lang_fullname(split_frame['lang'], lang_dict)
    split_frame['text'] = split_frame['lang'] + ' : ' + split_frame['text']

In [64]:
# Inspect the training texts after the language-name prefix was prepended.
df_train['text']

0         ukrainian : Мова йде про "тред" з порталу 4cha...
1         turkish : Kuşadası Belediye Başkanı Ömer Günel...
2         turkish : Aile ve Sosyal Hizmetler Bakanı Dery...
3         russian : Футболисты "Сочи" разгромили дома "Р...
4         ukrainian : Відома українська співачка та моде...
                                ...                        
646496    french : Au marché deNoëldeMontpellier(Hérault...
646497    russian : Разговора президента России Владимир...
646498    english : Gap is considering closing all of it...
646499    german : München (dpa) - Der Spielfilm "Ich bi...
646500    spanish : Sobre la carretera Torreón-San Pedro...
Name: text, Length: 646501, dtype: object

In [65]:
# Fit on the training texts only; the other splits are only transformed.
# HashingVectorizer is stateless, so the resulting matrices are the same as
# before, but this stays correct if a fitted vectorizer (e.g. CountVectorizer)
# is swapped in: re-fitting per split would give each split its own vocabulary.
X_train_vector = vectorizer.fit_transform(df_train['text'])
X_test_vector = vectorizer.transform(df_test['text'])
X_valid_vector = vectorizer.transform(df_valid['text'])

In [66]:
# Rubric (label column) treated as the positive class in this run.
target_rubric = 'science'

# Binary targets for every split, as plain Python lists.
y_train, y_test, y_valid = (
    list(split_frame[target_rubric])
    for split_frame in (df_train, df_test, df_valid)
)

In [67]:
# Undersample the negative class: keep every positive example and only the
# first NEGATIVES_PER_POSITIVE * (#positives) negatives, in dataset order.
NEGATIVES_PER_POSITIVE = 2  # a 1:2 positive-to-negative ratio works well

threshold = y_train.count(1)  # number of positive examples
max_zeros = threshold * NEGATIVES_PER_POSITIVE
count_zeros = 0
list_indexes = []  # positions (0-based) of the rows kept for training

for i, label in enumerate(y_train):
    if label == 1:
        list_indexes.append(i)
    elif count_zeros < max_zeros:
        count_zeros += 1
        list_indexes.append(i)

# list_indexes holds positions, so select with .iloc; label-based lookup
# would pick the wrong rows if the frame's index were not 0..n-1.
y_cut = [y_train[i] for i in list_indexes]
x_cut = vectorizer.transform(df_train['text'].iloc[list_indexes])

In [68]:
# Number of training examples kept after undersampling.
len(y_cut)

231228

In [69]:
from sklearn.ensemble import RandomForestClassifier
import time

# Train a random forest on the undersampled training set and report the
# wall-clock training time. perf_counter() is a monotonic clock, so the
# measured duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
clf = RandomForestClassifier(random_state=42, n_jobs=-1, bootstrap=False)
clf.fit(x_cut, y_cut)
end_time = time.perf_counter()
print('{:.2f} min - time to learning'.format((end_time - start_time) / 60))

9.82 min - time to learning


In [47]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9919    0.9883    0.9901     61022
           1     0.9312    0.9516    0.9413     10186

    accuracy                         0.9830     71208
   macro avg     0.9616    0.9699    0.9657     71208
weighted avg     0.9832    0.9830    0.9831     71208

14.73 sec - time to inference
Target rubric is sport


In [50]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9679    0.9669    0.9674     61280
           1     0.7968    0.8022    0.7995      9928

    accuracy                         0.9439     71208
   macro avg     0.8824    0.8845    0.8834     71208
weighted avg     0.9441    0.9439    0.9440     71208

12.16 sec - time to inference
Target rubric is economics


In [53]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9569    0.9699    0.9634     61026
           1     0.8038    0.7385    0.7698     10182

    accuracy                         0.9368     71208
   macro avg     0.8804    0.8542    0.8666     71208
weighted avg     0.9351    0.9368    0.9357     71208

9.79 sec - time to inference
Target rubric is society


In [13]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9763    0.9780    0.9772     61023
           1     0.8666    0.8580    0.8623     10185

    accuracy                         0.9608     71208
   macro avg     0.9215    0.9180    0.9197     71208
weighted avg     0.9606    0.9608    0.9607     71208

11.67 sec - time to inference
Target rubric is culture


In [20]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9743    0.9647    0.9695     61117
           1     0.7984    0.8457    0.8214     10091

    accuracy                         0.9479     71208
   macro avg     0.8863    0.9052    0.8954     71208
weighted avg     0.9493    0.9479    0.9485     71208

11.44 sec - time to inference
Target rubric is politics


In [30]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9956    0.9887    0.9921     66297
           1     0.8602    0.9412    0.8989      4911

    accuracy                         0.9854     71208
   macro avg     0.9279    0.9649    0.9455     71208
weighted avg     0.9863    0.9854    0.9857     71208

8.47 sec - time to inference
Target rubric is auto


In [37]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9903    0.9794    0.9848     63778
           1     0.8387    0.9172    0.8762      7430

    accuracy                         0.9730     71208
   macro avg     0.9145    0.9483    0.9305     71208
weighted avg     0.9744    0.9730    0.9735     71208

10.10 sec - time to inference
Target rubric is accidents


In [61]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9811    0.9805    0.9808     62913
           1     0.8530    0.8569    0.8549      8295

    accuracy                         0.9661     71208
   macro avg     0.9171    0.9187    0.9179     71208
weighted avg     0.9662    0.9661    0.9662     71208

11.01 sec - time to inference
Target rubric is science


In [14]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9707    0.9894    0.9799     63270
           1     0.9085    0.7792    0.8389      8564

    accuracy                         0.9643     71834
   macro avg     0.9396    0.8843    0.9094     71834
weighted avg     0.9633    0.9643    0.9631     71834

1.81 sec - time to inference
Target rubric is science


In [16]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9811    0.9811    0.9811     63270
           1     0.8605    0.8606    0.8605      8564

    accuracy                         0.9667     71834
   macro avg     0.9208    0.9208    0.9208     71834
weighted avg     0.9667    0.9667    0.9667     71834

1.42 sec - time to inference
Target rubric is science


In [29]:
# with ru :
# NOTE(review): presumably the run where each text was prefixed with the
# two-letter language code (e.g. "ru : ") — confirm against the experiment log.
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9815    0.9811    0.9813     63270
           1     0.8608    0.8630    0.8619      8564

    accuracy                         0.9670     71834
   macro avg     0.9211    0.9221    0.9216     71834
weighted avg     0.9671    0.9670    0.9671     71834

1.35 sec - time to inference
Target rubric is science


In [56]:
# with RUSSIAN :
# NOTE(review): presumably the run where each text was prefixed with the
# upper-case language name (e.g. "RUSSIAN : ") — confirm against the
# experiment log.
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9818    0.9804    0.9811     63270
           1     0.8569    0.8657    0.8613      8564

    accuracy                         0.9668     71834
   macro avg     0.9194    0.9231    0.9212     71834
weighted avg     0.9669    0.9668    0.9668     71834

1.39 sec - time to inference
Target rubric is science


In [70]:
# with russian :
# NOTE(review): presumably the run where each text was prefixed with the
# lower-case language name (e.g. "russian : ") — confirm against the
# experiment log.
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9818    0.9804    0.9811     63270
           1     0.8569    0.8657    0.8613      8564

    accuracy                         0.9668     71834
   macro avg     0.9194    0.9231    0.9212     71834
weighted avg     0.9669    0.9668    0.9668     71834

1.40 sec - time to inference
Target rubric is science


In [12]:
from xgboost import XGBClassifier
import time

# Train a gradient-boosted tree classifier on the undersampled training set.
# NOTE: this rebinds `clf`, replacing whichever model was trained before.
# perf_counter() is monotonic, so the duration is immune to clock changes.
start_time = time.perf_counter()
clf = XGBClassifier(random_state=42, n_jobs=-1, max_depth=6)
clf.fit(x_cut, y_cut)
end_time = time.perf_counter()

print('{:.2f} min - time to learning'.format((end_time - start_time) / 60))

1.94 min - time to learning


In [13]:
from sklearn.svm import LinearSVC
import time

# Train a linear SVM on the undersampled training set.
# NOTE: this rebinds `clf`, replacing whichever model was trained before.
# perf_counter() is monotonic, so the duration is immune to clock changes.
start_time = time.perf_counter()
clf = LinearSVC(random_state=42)
clf.fit(x_cut, y_cut)
end_time = time.perf_counter()

print('{:.2f} min - time to learning'.format((end_time - start_time) / 60))

0.08 min - time to learning


In [15]:
from sklearn.linear_model import LogisticRegression
import time

# Train a logistic regression on the undersampled training set.
# NOTE: this rebinds `clf`, replacing whichever model was trained before.
# perf_counter() is monotonic, so the duration is immune to clock changes.
start_time = time.perf_counter()
clf = LogisticRegression(random_state=42, n_jobs=-1)
clf.fit(x_cut, y_cut)
end_time = time.perf_counter()

print('{:.2f} min - time to learning'.format((end_time - start_time) / 60))

0.21 min - time to learning


In [16]:
import time
from sklearn.metrics import classification_report

# Predict on the validation features with the current `clf`, timing only the
# predict call. perf_counter() is monotonic, unlike time.time().
start_time = time.perf_counter()
y_pred = clf.predict(X_valid_vector)
end_time = time.perf_counter()

# Per-class precision/recall/F1 against the validation labels.
print(classification_report(y_valid, y_pred, digits=4))
print('{:.2f} sec - time to inference'.format(end_time - start_time))
print('Target rubric is', target_rubric)

              precision    recall  f1-score   support

           0     0.9787    0.9644    0.9715     63270
           1     0.7625    0.8453    0.8017      8564

    accuracy                         0.9502     71834
   macro avg     0.8706    0.9048    0.8866     71834
weighted avg     0.9530    0.9502    0.9513     71834

0.04 sec - time to inference
Target rubric is science


In [75]:
import os
from joblib import dump, load

# Make sure the output directory exists; dump() does not create it.
os.makedirs('models_ml', exist_ok=True)

# Persist the current classifier.
# NOTE(review): the file name is hard-coded and independent of `target_rubric`
# and of the model type currently bound to `clf` — confirm they match before
# running this cell.
dump(clf, 'models_ml/accidents_rf.joblib')

['models_ml/accidents_rf.joblib']

In [219]:
# Restore a previously saved classifier into `clf`, replacing the current one.
# joblib files are pickle-based: only load files from a trusted source.
clf = load('models_ml/auto_rf.joblib')

In [233]:
# Training rows that are English and labelled with the 'auto' rubric.
# The 'lang' column holds the short code ('en') when the data is freshly
# loaded and the full name ('english') once the language-prefix cell has run,
# so accept both; comparing with 'en' alone selects nothing after that cell.
english_labels = ['en', 'english']
df_train.loc[df_train['lang'].isin(english_labels) & (df_train['auto'] == 1)]

Unnamed: 0,text,sport,economics,society,science,culture,politics,auto,accidents,lang,__index_level_0__
12733,Hyundai Motor India on Monday teased the new i...,0,0,0,0,0,0,1,0,en,274895
15328,Motown is going back to basics. With less mone...,0,0,0,0,0,0,1,0,en,274648
20813,Trends in India's domestic automobile market a...,0,0,0,0,0,0,1,0,en,275535
21869,Volkswagen plans to make British luxury carmak...,0,0,0,0,0,0,1,0,en,274900
28041,Simple Energy was in the news recently when it...,0,0,0,0,0,0,1,0,en,275601
...,...,...,...,...,...,...,...,...,...,...,...
625998,The European Union has approved a plan that in...,0,0,0,0,0,0,1,0,en,275023
633569,The Indian subsidiary of Italian premium scoot...,0,0,0,0,0,0,1,0,en,275532
637533,With the festival season just round the corner...,0,0,0,0,0,0,1,0,en,274870
638770,Tesla CEO Elon Musk has confirmed that the EV ...,0,0,0,0,0,0,1,0,en,274740
