# GeekBrains
## Машинное обучение в бизнесе
## ДЗ Урока 6 (Задача look-alike)
## Виталий Казанцев

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

__TASK 1__  
https://www.kaggle.com/datasets/jcblaise/imdb-sentiments

In [2]:
df = pd.read_csv('../data/train.csv')
df.sample(5)

Unnamed: 0,text,sentiment
650,The number of times I've had tears in my eyes ...,0
1153,"I already loved ""How the Grinch Stole Christma...",0
18400,This very low budget comedy caper movie succee...,1
22125,I really wanted to like this movie because the...,1
1124,Definitely one of the lesser of the Astaire/Ro...,0


In [3]:
df.shape

(25000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       25000 non-null  object
 1   sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


__TASK 2__  
feature engineering в выбранных мною данных не нужен, а вот препроцессинг провести стоит

In [5]:
import re
corpus = []
for i in range(0, 25000):
    review = re.sub(r'\W', ' ', str(df["text"][i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)

In [6]:
vectorizer = TfidfVectorizer(max_features = 2000, min_df = 3, max_df = 0.6, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

In [7]:
train_x, test_x, train_y, test_y = train_test_split(X, df['sentiment'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

__TASK 3__

In [8]:
classifier = LogisticRegression()
classifier.fit(train_x,train_y)

LogisticRegression()

In [9]:
def model_quality(model, train_x, train_y, test_x, test_y):
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)
    
    f1 = f1_score(test_y, y_pred)
    roc = roc_auc_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='binary')
    rec = recall_score(test_y, y_pred, average='binary')
    
    return {'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]}

In [10]:
metrics = []
metrics.append({'normal': model_quality(classifier, train_x, train_y, test_x, test_y)})
metrics

[{'normal': {'f1': [0.8667736757624398],
   'roc-auc': [0.8672430728446006],
   'precision': [0.8743523316062176],
   'recall': [0.8593252705283259]}}]

__TASK 4__

In [11]:
pos_frac = 0.2
sdf = df.copy()
pos_mask = (df['sentiment'] == 1)
pos_ind = df[pos_mask].sample(frac=pos_frac).index
unlab_ind = df[~df.index.isin(pos_ind)].index
    
# Помечаем данные признаком is_labeled - Positive = 1, Unlabeled = 0
sdf.loc[pos_ind, 'is_labeled'] = 1
sdf.loc[unlab_ind, 'is_labeled'] = 0
sdf['is_labeled'] = sdf['is_labeled'].astype(int)

In [12]:
sdf.head(5)

Unnamed: 0,text,sentiment,is_labeled
0,For a movie that gets no respect there sure ar...,0,0
1,Bizarre horror movie filled with famous faces ...,0,0
2,"A solid, if unremarkable film. Matthau, as Ein...",0,0
3,It's a strange feeling to sit alone in a theat...,0,0
4,"You probably all already know this by now, but...",0,0


__TASK 5__

In [13]:
def get_rns_samples(rns_df):
    """
    Создает тренировочную и тестовую выборки для RNS на основе признака is_labeled.
    """
    rns_df = rns_df.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]
    
    return train_samples, test_samples

In [14]:
train_samples, test_samples = get_rns_samples(sdf)

In [15]:
train_samples.shape, test_samples.shape

((5000, 3), (20000, 3))