In [32]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [33]:
df: pd.DataFrame | None = None
with open('../lesson_1/data.pickle', 'rb') as f:
    df = pickle.load(f)

In [34]:
count_vectorizer = CountVectorizer(
    ngram_range=(1, 1), analyzer='word', binary=False, max_df=0.9, max_features=1000, stop_words='english'
)
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1), analyzer='word', binary=False, max_df=0.9, max_features=1000, stop_words='english'
)

In [35]:
df['tweet_stemmed'] = df['tweet_stemmed'].apply(lambda x: ' '.join(x))
df['tweet_lemmatized'] = df['tweet_lemmatized'].apply(lambda x: ' '.join(x))

In [36]:
bag_of_words_stemmed_cv = count_vectorizer.fit_transform(df['tweet_stemmed'])
feature_names_stemmed_cv = count_vectorizer.get_feature_names_out()
df_stemmed_cv = pd.DataFrame(bag_of_words_stemmed_cv.toarray(), columns = feature_names_stemmed_cv)
df_stemmed_cv.head()

Unnamed: 0,abl,absolut,abt,abus,accept,account,act,action,activ,actor,...,year,yesterday,yo,yoga,young,youtub,yoyou,yoyour,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
bag_of_words_lemmatized_cv = count_vectorizer.fit_transform(df['tweet_lemmatized'])
feature_names_lemmatized_cv = count_vectorizer.get_feature_names_out()
df_lemmatized_cv = pd.DataFrame(bag_of_words_lemmatized_cv.toarray(), columns = feature_names_lemmatized_cv)
df_lemmatized_cv.head()

Unnamed: 0,abl,absolut,abt,abus,accept,account,act,action,activ,actor,...,yesterday,yo,yoga,york,young,youtub,yoyou,yoyour,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
bag_of_words_stemmed_tfidf = tfidf_vectorizer.fit_transform(df['tweet_stemmed'])
feature_names_stemmed_tfidf = tfidf_vectorizer.get_feature_names_out()
df_stemmed_tfidf = pd.DataFrame(bag_of_words_stemmed_tfidf.toarray(), columns = feature_names_stemmed_tfidf)
df_stemmed_tfidf.head()

Unnamed: 0,abl,absolut,abt,abus,accept,account,act,action,activ,actor,...,year,yesterday,yo,yoga,young,youtub,yoyou,yoyour,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
bag_of_words_lemmatized_tfidf = tfidf_vectorizer.fit_transform(df['tweet_lemmatized'])
feature_names_lemmatized_tfidf = tfidf_vectorizer.get_feature_names_out()
df_lemmatized_tfidf = pd.DataFrame(bag_of_words_lemmatized_tfidf.toarray(), columns = feature_names_lemmatized_tfidf)
df_lemmatized_tfidf.head()

Unnamed: 0,abl,absolut,abt,abus,accept,account,act,action,activ,actor,...,yesterday,yo,yoga,york,young,youtub,yoyou,yoyour,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Загружаем данные
data = open('../data/corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# создаем df
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels
trainDF.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [41]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [43]:
results = {
    'vectorizer': [],
    'ngram': [],
    'features': [],
    'accuracy': []
}

In [44]:
%%time
models = ['count', 'tfidf']
ngrams = [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]
features = [100, 200, 500, 1000, 2000, 5000, 10000]

CPU times: user 0 ns, sys: 35 µs, total: 35 µs
Wall time: 51.5 µs


In [46]:
for model in models:
    for ngram in ngrams:
        for value in features:
            vectorizers={
                'count': CountVectorizer(
                    ngram_range=ngram,
                    analyzer='word',
                    binary=False,
                    max_df=0.9,
                    max_features=value,
                    stop_words='english'
                ),
                'tfidf': TfidfVectorizer(
                    ngram_range=ngram,
                    analyzer='word',
                    binary=False,
                    max_df=0.9,
                    max_features=value,
                    stop_words='english'
                )
            }


            vectorizer = vectorizers[model]
            vectorizer.fit(trainDF['text'])
            xtrain_vec =  vectorizer.transform(train_x)
            xvalid_vec =  vectorizer.transform(valid_x)

            classifier = linear_model.LogisticRegression()
            classifier.fit(xtrain_vec, train_y)
            predictions = classifier.predict(xvalid_vec)

            results['vectorizer'].append(model)
            results['ngram'].append(ngram)
            results['features'].append(value)
            results['accuracy'].append(accuracy_score(valid_y, predictions))

In [47]:
pd.DataFrame(results).sort_values(by='accuracy', ascending=False)

Unnamed: 0,vectorizer,ngram,features,accuracy
62,tfidf,"(1, 3)",10000,0.8724
55,tfidf,"(1, 2)",10000,0.8716
48,tfidf,"(1, 1)",10000,0.8696
61,tfidf,"(1, 3)",5000,0.8676
54,tfidf,"(1, 2)",5000,0.8672
...,...,...,...,...
79,tfidf,"(3, 3)",500,0.5408
78,tfidf,"(3, 3)",200,0.5288
36,count,"(3, 3)",200,0.5280
35,count,"(3, 3)",100,0.5236


Наилучшие результаты показывает векторайзер tfidf при большем количестве фичей. Так же стоит обратить внимание на то что лучшие показатели наблюдаются при использовании ngram в сочетании от 1 и до 3. Думаю при использовании ngram более 3-х показатели вырастут.