In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from string import punctuation
import re
from nltk.tokenize import word_tokenize

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Задание 1 

In [5]:
positive = pd.read_csv('positive.csv', sep=';', usecols=[3], names=['text'])
positive['label'] = ['positive'] * len(positive)
negative = pd.read_csv('negative.csv', sep=';', usecols=[3], names=['text'])
negative['label'] = ['negative'] * len(negative)
df = positive.append(negative)

In [6]:
def del_sub(text, pattern):
    r = re.findall(pattern, text)
    for i in r:
        text = re.sub(i, '', text)
    return text

df["text"] = np.vectorize(del_sub)(df['text'], "@[\w]*")
df["text"] = np.vectorize(del_sub)(df['text'], "RT")

In [7]:
def low_words(tweet):
    
    for word in tweet.split():
        tweet = tweet.lower()
    return tweet

df['text'] = df['text'].apply(low_words)

In [8]:
df.head()

Unnamed: 0,text,label
0,"хоть я и школота, но поверь, у нас то же само...",positive
1,"да, все-таки он немного похож на него. но мой ...",positive
2,: ну ты идиотка) я испугалась за тебя!!!,positive
3,": ""кто то в углу сидит и погибает от голода, ...",positive
4,"вот что значит страшилка :d\nно блин,посмотре...",positive


In [9]:
noise = stopwords.words('russian') + list(punctuation)

In [10]:
corpus = [token for tweet in df.text for token in word_tokenize(tweet) if token not in noise]
print(len(corpus))
corpus[:10]

1710589


['школота',
 'поверь',
 'самое',
 'd',
 'общество',
 'профилирующий',
 'предмет',
 'типа',
 'все-таки',
 'немного']

In [11]:
freq_dict = Counter(corpus)
freq_dict_sorted = sorted(freq_dict.items(), key=lambda x: -x[1])

In [12]:
len(freq_dict_sorted)

235596

In [13]:
high_freq = freq_dict_sorted[:47119]       # 0-20%
aver_freq = freq_dict_sorted[47119:141357] # 20-60%
low_freq = freq_dict_sorted[141357:]       # 60-100%

print(len(high_freq), len(aver_freq), len(low_freq))

47119 94238 94239


In [14]:
high = []
for i in range(47119):
    token = high_freq[i][0]
    high.append(token)  

aver = []
for i in range(94238):
    token = aver_freq[i][0]
    aver.append(token)
    
low = []
for i in range(94239):
    token = low_freq[i][0]
    low.append(token)

### 1) обучение на токенах с высокой частотой

In [15]:
df_1 = df

In [16]:
def match_words1(words):
    
    for word in words.split():
        if word  in high:
            words = word
            break
        if word not in high:
            words = ''
            
    return words

df_1['text'] = df['text'].apply(match_words1)

In [17]:
df_1 = df_1.loc[df_1['text'] != '']
df_1.head()

Unnamed: 0,text,label
0,самое,positive
1,все-таки,positive
2,испугалась,positive
3,углу,positive
4,значит,positive


In [18]:
X = df_1.text
y = df_1.label

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [20]:
vec = CountVectorizer(ngram_range=(3, 3))
bow = vec.fit_transform(X_train)
clf = LogisticRegression(random_state=16)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         0
    positive       1.00      0.51      0.67     54617

    accuracy                           0.51     54617
   macro avg       0.50      0.25      0.34     54617
weighted avg       1.00      0.51      0.67     54617



### 2) обучение на токенах со средней частотой

In [22]:
df_2 = df

In [42]:
def match_words2(words):
    
    for word in words.split():
        if word  in aver:
            words = word
            break
        if word not in aver:
            words = ''
            
    return words

df_2['text'] = df['text'].apply(match_words2)

In [43]:
df_2 = df_2.loc[df_2['text'] != '']
df_2.head()

Unnamed: 0,text,label
0,профилирующий,positive
4,страшилка,positive
6,"ох,900",positive
9,сиднея,positive
15,аспирантуру,positive


In [44]:
X = df_2.text
y = df_2.label

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
vec = CountVectorizer(ngram_range=(3, 3))
bow = vec.fit_transform(X_train)
clf = LogisticRegression(random_state=16)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.00      1.00      0.00         2
    positive       1.00      0.66      0.80     14268

    accuracy                           0.66     14270
   macro avg       0.50      0.83      0.40     14270
weighted avg       1.00      0.66      0.80     14270



### 3) обучение на токенах с низкой частотой

In [15]:
df_3 = df

In [16]:
def match_words3(words):
    
    for word in words.split():
        if word  in low:
            words = word
            break
        if word not in low:
            words = ''
            
    return words

df_3['text'] = df['text'].apply(match_words3)

In [17]:
df_3 = df_3.loc[df_3['text'] != '']
df_3.head()

Unnamed: 0,text,label
60,т.е.,positive
71,знала.,positive
135,",а",positive
292,автомобиль.,positive
534,:сс,positive


In [18]:
X = df_3.text
y = df_3.label

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [20]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(X_train)
clf = LogisticRegression(random_state=16)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.99      0.69      0.81      9470
    positive       0.01      0.35      0.02        80

    accuracy                           0.69      9550
   macro avg       0.50      0.52      0.42      9550
weighted avg       0.98      0.69      0.81      9550



#### Как видно из показателей, лучшие результаты получились на обучении на токенах с низкой и средней частотой.