# 1. Dataset for emotion prediction

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

'0': sadness
               '1': joy
               '2': love
               '3': anger
               '4': fear
               '5': surprise

In [None]:
#!pip install datasets

In [3]:
from datasets import load_dataset
dataset = load_dataset('json', data_files={'train': 'train.jsonl','test':'validation.jsonl'})

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
#!py -m pip install nltk

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

vectorizer = CountVectorizer(max_features=500, min_df=4, max_df=0.7, stop_words=stopwords.words('english'))
X_train_vec = vectorizer.fit_transform(dataset['train']['text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Svetlana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
vectorizer.get_feature_names_out()

array(['able', 'absolutely', 'accepted', 'aching', 'actually', 'admit',
       'afraid', 'agitated', 'ago', 'almost', 'alone', 'already', 'also',
       'always', 'amazed', 'amazing', 'amp', 'angry', 'annoyed',
       'another', 'anxious', 'anymore', 'anyone', 'anything',
       'apprehensive', 'around', 'ashamed', 'ask', 'asked', 'assured',
       'away', 'awful', 'awkward', 'baby', 'back', 'bad', 'beaten',
       'beautiful', 'become', 'bed', 'began', 'believe', 'beloved',
       'best', 'better', 'big', 'bit', 'blank', 'blessed', 'blog', 'body',
       'book', 'books', 'bothered', 'brave', 'burdened', 'call', 'calm',
       'came', 'cannot', 'cant', 'care', 'caring', 'cause', 'certain',
       'change', 'child', 'children', 'class', 'close', 'cold', 'come',
       'comes', 'comfortable', 'coming', 'completely', 'confident',
       'confused', 'content', 'control', 'convinced', 'cool', 'could',
       'couldnt', 'cranky', 'creative', 'curious', 'cute', 'dangerous',
       'day', 'day

In [8]:
X_train_vec[:1].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
X_train_vec[:1].toarray().sum()

2

In [10]:
dataset['train']['text'][:1]

['i didnt feel humiliated']

In [11]:
vocabulary = vectorizer.get_feature_names_out()

In [12]:
len(vocabulary)

500

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
tfidf = TfidfTransformer()

In [15]:
X_train_idf = tfidf.fit_transform(X_train_vec)

In [16]:
X_train_idf[:1].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [17]:
X_train_idf[:1].toarray().sum()

1.2295066506768701

In [18]:
X_test_vec = vectorizer.transform(dataset['test']['text'])

In [19]:
X_test_idf = tfidf.transform(X_test_vec)

In [20]:
X_test_idf[:1].toarray().sum()

2.52371815232768

In [21]:
X_train = X_train_idf.toarray()
X_test = X_test_idf.toarray()

In [22]:
X_train[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [23]:
X_test[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [137]:
X_train, X_test, y_train, y_test = X_train,X_test,dataset['train']['label'],\
                            dataset['test']['label']

# 2. AdaBoost

In [None]:
# обучение на объектах, неверно классифицированных на предыдущем шаге

In [114]:
cur_tree = DecisionTreeClassifier(random_state=42, max_depth = 6)
abc = AdaBoostClassifier(estimator = cur_tree,n_estimators=100,
                                 learning_rate=1.0) # DecisionTreeClassifier <-default
model = abc.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.5625


In [100]:
import pickle

filename = "./models/AdaBoost_100_6.pickle"
pickle.dump(model, open(filename, "wb"))

# 3. GradientBoosting

In [None]:
# обучение в направлн

In [102]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators = 100,
                                         max_depth = 6, random_state=42)

model = gbc.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.645


In [104]:
filename = "./models/GBC_100_6.pickle"

pickle.dump(model, open(filename, "wb"))
#loaded_model = pickle.load(open(filename, "rb"))

In [62]:
X_train_0, X_train_1, y_train_0, y_train_1 = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

# 3. Blending

In [63]:
from sklearn.ensemble import RandomForestClassifier

N = 10
y_pred_1 = []
crf = []
for n in range(1,N+1):
    crf.append(RandomForestClassifier(n_estimators = 2,max_depth=2, random_state=n))
    crf[-1].fit(X_train_0,y_train_0)
    y_pred_1.append(crf[-1].predict(X_train_1).reshape(len(X_train_1),1))y_pred_1t = np.array(y_pred_1).transpose()[0]
#y_pred_1 = hstack(y_pred_1)
len(y_pred_1t[0])

In [64]:
y_pred_1t = np.array(y_pred_1).transpose()[0]
#y_pred_1 = hstack(y_pred_1)
len(y_pred_1t[0])

10

In [65]:
len(y_pred_1t)

5280

In [66]:
clf_final = RandomForestClassifier(n_estimators = 10,max_depth=6, random_state=42)

In [67]:
clf_final.fit(y_pred_1t,y_train_1)

In [68]:
def make_blending_prediction(basic_clfs, final_clf, data):
    y0 = []
    for c in basic_clfs:
        y0.append(c.predict(data))
    y0_t = np.array(y0).transpose()
    return final_clf.predict(y0_t)

y_test_pred = make_blending_prediction(crf,clf_final, X_test)
metrics.accuracy_score(y_test, y_test_pred)

0.405

In [69]:
metrics.accuracy_score(y_test, crf[0].predict(X_test))

0.362

# 4. Stacking

In [154]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestClassifier

N = 10
y_pred_1 = []
crf_stack = []
kf = KFold(n_splits=N, random_state=None, shuffle=False)

x_test_2 = []
y_test_2 = []

pre_prediction = np.zeros((len(X_train), N))

for i, (train_index, test_index) in enumerate(kf.split(X_train)):
    X_train_0, X_test_1 = np.array(X_train)[train_index], np.array(X_train)[test_index]
    y_train_0, y_test_1 = np.array(y_train)[train_index], np.array(y_train)[test_index]
    
    crf_stack.append(RandomForestClassifier(n_estimators = 2,max_depth=2, random_state=i))
    crf_stack[-1].fit(X_train_0,y_train_0)
    pre_prediction[test_index,i]=crf_stack[-1].predict(X_test_1)

In [155]:
clf_stack_final = RandomForestClassifier(n_estimators = 10,max_depth=6, random_state=42)

In [156]:
clf_stack_final.fit(pre_prediction,y_train)

In [157]:
def make_stacking_prediction(basic_clfs, final_clf, data):
    y0 = []
    for c in basic_clfs:
        y0.append(c.predict(data))
    y0_t = np.array(y0).transpose()
    return final_clf.predict(y0_t)

y_test_pred = make_stacking_prediction(crf_stack,clf_stack_final, X_test)
metrics.accuracy_score(y_test, y_test_pred)

0.413

In [158]:
metrics.accuracy_score(y_test, crf_stack[0].predict(X_test))

0.368

In [None]:
# Задача. Варьируя параметры ансамбля в подходам блэндинг и стэкинг, обучить оптимальный классификатор.
# Визуализировать графики зависимости предсказания от параметров модели.