In [1]:
from nltk.corpus import movie_reviews
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## Week 1

In [2]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
negfeats = [' '.join(movie_reviews.words(fileids=idx)) for idx in negids]
posfeats = [' '.join(movie_reviews.words(fileids=idx)) for idx in posids]

In [4]:
reviews = posfeats + negfeats
y = np.array([1] * len(posfeats) + [0] * len(negfeats))
print(f'Number of samples: {len(y)}')

Number of samples: 2000


In [10]:
def write_answer(answer, filename):
    with open(filename, 'w') as file:
        file.write(str(answer))

In [5]:
write_answer(len(y), '11.txt')

In [6]:
class_1_proportion = sum(y) / len(y)
print(f'Proportion of class 1: {class_1_proportion}')

Proportion of class 1: 0.5


In [9]:
write_answer(class_1_proportion, '12.txt')

In [10]:
count_vect = CountVectorizer()
X = count_vect.fit_transform(reviews)
print(f'Number of features {X.shape[1]}')

Number of features 39659


In [11]:
write_answer(X.shape[1], '13.txt')

In [12]:
# Строит частоты
X.todense()[0, np.where(count_vect.get_feature_names_out() == 'film')[0]]

matrix([[5]], dtype=int64)

In [13]:
pipe = Pipeline([('Vectorizer', CountVectorizer()), ('Estimator', LogisticRegression())])
cv_score_accuracy = np.mean(cross_val_score(pipe, reviews, y, scoring="accuracy"))
print(f'Cross validation accuracy score for box cv and lr: {cv_score_accuracy}')

Cross validation accuracy score for box cv and lr: 0.8424999999999999


In [14]:
write_answer(cv_score_accuracy, '14.txt')

In [15]:
cv_score_auc = np.mean(cross_val_score(pipe, reviews, y, scoring="roc_auc"))
print(f'Cross validation accuracy score for box cv and lr: {cv_score_auc}')

Cross validation accuracy score for box cv and lr: 0.9163599999999998


In [16]:
write_answer(cv_score_auc, '15.txt')

In [17]:
pipe.fit(reviews, y)

Pipeline(steps=[('Vectorizer', CountVectorizer()),
                ('Estimator', LogisticRegression())])

In [18]:
important_features_idx = np.argsort(np.abs(pipe['Estimator'].coef_[0]))[::-1]

In [19]:
important_features = np.array(pipe['Vectorizer'].get_feature_names())[important_features_idx]

In [20]:
print(f'Top 5 important features: {" ".join(important_features[:5])}')

Top 5 important features: bad unfortunately worst fun nothing


In [21]:
write_answer(" ".join(important_features[:2]), '16.txt')

## Week2

In [5]:
pipe1 = Pipeline([('Vectorizer', CountVectorizer()), ('Estimator', LogisticRegression())])
pipe2 = Pipeline([('Vectorizer', TfidfVectorizer()), ('Estimator', LogisticRegression())])
n_folds = 5
pipe1_cvscore = cross_val_score(pipe1, reviews, y, cv=n_folds)
pipe2_cvscore = cross_val_score(pipe2, reviews, y, cv=n_folds)
print(f'Pipeline1 mean score: {pipe1_cvscore.mean()}, standart deviation: {pipe1_cvscore.std()}')
print(f'Pipeline2 mean score: {pipe2_cvscore.mean()}, standart deviation: {pipe2_cvscore.std()}')

Pipeline1 mean score: 0.8424999999999999, standart deviation: 0.021794494717703363
Pipeline2 mean score: 0.8205, standart deviation: 0.003999999999999995


In [26]:
write_answer('0.8424999999999999 0.021794494717703363 0.8205 0.003999999999999995', '21.txt')

In [37]:
print(f'Params names: {pipe1.get_params().keys()}')

Params names: dict_keys(['memory', 'steps', 'verbose', 'Vectorizer', 'Estimator', 'Vectorizer__analyzer', 'Vectorizer__binary', 'Vectorizer__decode_error', 'Vectorizer__dtype', 'Vectorizer__encoding', 'Vectorizer__input', 'Vectorizer__lowercase', 'Vectorizer__max_df', 'Vectorizer__max_features', 'Vectorizer__min_df', 'Vectorizer__ngram_range', 'Vectorizer__preprocessor', 'Vectorizer__stop_words', 'Vectorizer__strip_accents', 'Vectorizer__token_pattern', 'Vectorizer__tokenizer', 'Vectorizer__vocabulary', 'Estimator__C', 'Estimator__class_weight', 'Estimator__dual', 'Estimator__fit_intercept', 'Estimator__intercept_scaling', 'Estimator__l1_ratio', 'Estimator__max_iter', 'Estimator__multi_class', 'Estimator__n_jobs', 'Estimator__penalty', 'Estimator__random_state', 'Estimator__solver', 'Estimator__tol', 'Estimator__verbose', 'Estimator__warm_start'])


In [32]:
params = {'Vectorizer__min_df': [10, 50]}
grid_count_vect = GridSearchCV(pipe1, params, cv=n_folds)
grid_count_vect.fit(reviews, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Vectorizer', CountVectorizer()),
                                       ('Estimator', LogisticRegression())]),
             param_grid={'Vectorizer__min_df': [10, 50]})

In [35]:
grid_count_vect.cv_results_

{'mean_fit_time': array([1.16106958, 1.02700443]),
 'std_fit_time': array([0.04663813, 0.04403845]),
 'mean_score_time': array([0.19773579, 0.18773408]),
 'std_score_time': array([0.01041684, 0.00547273]),
 'param_Vectorizer__min_df': masked_array(data=[10, 50],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'Vectorizer__min_df': 10}, {'Vectorizer__min_df': 50}],
 'split0_test_score': array([0.82  , 0.7875]),
 'split1_test_score': array([0.8525, 0.8275]),
 'split2_test_score': array([0.835 , 0.8125]),
 'split3_test_score': array([0.855, 0.82 ]),
 'split4_test_score': array([0.84  , 0.8225]),
 'mean_test_score': array([0.8405, 0.814 ]),
 'std_test_score': array([0.01268858, 0.01410674]),
 'rank_test_score': array([1, 2])}

In [36]:
write_answer('0.8405 0.814', '22.txt')

In [6]:
pipe3 = Pipeline([('Vectorizer', CountVectorizer()), ('Estimator', LinearSVC())])
pipe4 = Pipeline([('Vectorizer', CountVectorizer()), ('Estimator', SGDClassifier())])
# params = {'Estimator': [LinearSVC(), SGDClassifier(random_state=42)]}
# grid_estimators = GridSearchCV(pipe1, params, cv=n_folds)
# grid_estimators.fit(reviews, y)
pipe3_cvscore = cross_val_score(pipe3, reviews, y, cv=n_folds)
pipe4_cvscore = cross_val_score(pipe4, reviews, y, cv=n_folds)
print(f'Pipeline3 mean score: {pipe3_cvscore.mean()}, standart deviation: {pipe3_cvscore.std()}')
print(f'Pipeline4 mean score: {pipe4_cvscore.mean()}, standart deviation: {pipe4_cvscore.std()}')

Pipeline3 mean score: 0.8325000000000001, standart deviation: 0.0162788205960997
Pipeline4 mean score: 0.827, standart deviation: 0.020211382931407736


In [8]:
pipe4_cvscore # Худшее = 0.7875

array([0.7875, 0.835 , 0.8325, 0.845 , 0.835 ])

In [11]:
write_answer('0.7875', '23.txt')

In [75]:
params = {'Vectorizer__stop_words': [nltk.corpus.stopwords.words('english'), 'english']}
grid_stopwords = GridSearchCV(pipe1, params, cv=n_folds)
grid_stopwords.fit(reviews, y)
print(grid_stopwords.cv_results_)

{'mean_fit_time': array([1.61364355, 1.59900355]), 'std_fit_time': array([0.09827916, 0.08445906]), 'mean_score_time': array([0.18718324, 0.18978548]), 'std_score_time': array([0.00681331, 0.01259703]), 'param_Vectorizer__stop_words': masked_array(data=[list(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in'

In [76]:
write_answer('0.84 0.8365', '24.txt')

In [78]:
vect1 = CountVectorizer(ngram_range=(1, 2), analyzer='word')
vect2 = CountVectorizer(ngram_range=(3, 5), analyzer='char_wb')
params = {'Vectorizer': [vect1, vect2]}
grid_ngrams = GridSearchCV(pipe1, params, cv=n_folds)
grid_ngrams.fit(reviews, y)
print(grid_ngrams.cv_results_)

{'mean_fit_time': array([18.14022183, 14.968784  ]), 'std_fit_time': array([1.92764282, 0.54493946]), 'mean_score_time': array([0.55553145, 2.59568973]), 'std_score_time': array([0.05094433, 0.40728407]), 'param_Vectorizer': masked_array(data=[CountVectorizer(ngram_range=(1, 2)),
                   CountVectorizer(analyzer='char_wb', ngram_range=(3, 5))],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'Vectorizer': CountVectorizer(ngram_range=(1, 2))}, {'Vectorizer': CountVectorizer(analyzer='char_wb', ngram_range=(3, 5))}], 'split0_test_score': array([0.82, 0.82]), 'split1_test_score': array([0.8575, 0.84  ]), 'split2_test_score': array([0.845 , 0.8175]), 'split3_test_score': array([0.87 , 0.825]), 'split4_test_score': array([0.885 , 0.8225]), 'mean_test_score': array([0.8555, 0.825 ]), 'std_test_score': array([0.02215852, 0.00790569]), 'rank_test_score': array([1, 2])}


In [79]:
write_answer('0.8555 0.825', '25.txt')