In [53]:
import pandas as pd
import numpy as np
import os
import json
import re
import csv
import string
import nltk
import ast
import re
import warnings

In [55]:
from tqdm import tqdm_notebook as tqdm
from pymystem3 import Mystem
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [3]:
# Directory holding the datasets and dictionaries; resolved relative to the parent
# of the current working directory (os.pardir).
RESOURCES_PATH = os.path.join(os.pardir, 'resources')

In [4]:
# Fetch the NLTK resources used further down: word_tokenize, pos_tag(lang='rus')
# and stopwords.words("russian") are all called later in this notebook.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_ru')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/itukh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /home/itukh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/itukh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load all_train.csv from the resources directory into a DataFrame.
data = pd.read_csv(os.path.join(RESOURCES_PATH, 'all_train.csv'))

In [6]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,Text,Score,FinalScore1,Category
0,Оригинал взят у в Заложен пятый фрегат проекта...,"[1, 0, 0]",0,Post
1,Бодхисаттва устранения всяческой ущербности (С...,[0],0,Post
2,Выпущено два минометных снаряда. Несколько мин...,[-1],-1,Post
3,В пригороде Дамаска Джобар во время преследова...,[0],0,Post
4,Тема пенсионных реформ оказалась настолько жив...,"[0, 0, 0]",0,Post


In [7]:
# Column dtypes and non-null counts of the raw dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29442 entries, 0 to 29441
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         29442 non-null  object
 1   Score        29442 non-null  object
 2   FinalScore1  29442 non-null  int64 
 3   Category     29442 non-null  object
dtypes: int64(1), object(3)
memory usage: 920.2+ KB


In [8]:
# Number of non-null values in every column, per Category value.
data.groupby("Category").count()

Unnamed: 0_level_0,Text,Score,FinalScore1
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Comment,12738,12738,12738
Post,16704,16704,16704


In [9]:
# Preview the cached feature table when it already exists. The cache is only created
# further down by TextualFeaturesExtracter, so reading it unconditionally would abort
# a Restart & Run All on a checkout without the file (FileNotFoundError).
processed_data_path = os.path.join(RESOURCES_PATH, 'processed_data.csv')
pd.read_csv(processed_data_path).head() if os.path.exists(processed_data_path) else None

Unnamed: 0,Text,Score,FinalScore1,Category,! freq,) freq,:) freq,=) freq,( freq,:( freq,...,adjectives percent,verbs percent,emotional verbs,obscene words,average word length,exclamation mark count,question mark count,lemmas,average word sentiment,is post
0,Оригинал взят у в Заложен пятый фрегат проекта...,"[1, 0, 0]",0,Post,0,0,0,0,0,0,...,0.152542,0.084746,0,0,6.559322,0,0,"['оригинал', 'взять', 'закладывать', 'пятый', ...",0.0,True
1,Бодхисаттва устранения всяческой ущербности (С...,[0],0,Post,0,0,0,0,0,0,...,0.128655,0.134503,0,0,5.725146,0,0,"['бодхисаттва', 'устранение', 'всяческий', 'ущ...",0.020833,True
2,Выпущено два минометных снаряда. Несколько мин...,[-1],-1,Post,0,0,0,0,0,0,...,0.145455,0.090909,0,1,5.872727,0,0,"['выпускать', 'минометный', 'снаряд', 'несколь...",-0.05,True
3,В пригороде Дамаска Джобар во время преследова...,[0],0,Post,0,0,0,0,0,0,...,0.087432,0.114754,0,2,5.885246,0,0,"['пригород', 'дамаск', 'джобар', 'время', 'пре...",-0.03937,True
4,Тема пенсионных реформ оказалась настолько жив...,"[0, 0, 0]",0,Post,0,0,0,0,0,0,...,0.075,0.1625,0,0,5.0875,0,0,"['тема', 'пенсионный', 'реформа', 'оказываться...",0.02381,True


In [10]:
def load_obscene_words():
    """Load the obscene-word vocabulary from ``obscene_words.txt``.

    The file is read from ``RESOURCES_PATH``; commas are replaced by spaces, the text is
    lower-cased and split with ``word_tokenize``.

    Returns:
        set: the unique tokens, without 'на', 'не' and 'сила', which are dropped
        explicitly.
    """
    # Explicit encoding: the list is Cyrillic text and the platform default codec is
    # not UTF-8 everywhere. NOTE(review): assumes the file is stored as UTF-8 - confirm.
    with open(os.path.join(RESOURCES_PATH, 'obscene_words.txt'), 'r', encoding='utf-8') as word_file:
        words = word_file.read().replace(',', ' ').lower()
    obscene = set(word_tokenize(words))
    # difference_update tolerates an absent entry, whereas set.remove raises KeyError.
    obscene.difference_update({'на', 'не', 'сила'})
    return obscene

In [11]:
def load_emotional_verbs():
    """Return the set of verbs flagged '+' in ``verbs_emotional.csv``.

    Each selected 'verb' cell is lower-cased and reduced to its first token as produced
    by ``word_tokenize``.
    """
    verbs_table = pd.read_csv(os.path.join(RESOURCES_PATH, 'verbs_emotional.csv'))
    is_trusted = verbs_table['emotional'] == '+'
    first_tokens = verbs_table.loc[is_trusted, 'verb'].apply(
        lambda verb: word_tokenize(verb.lower())[0])
    return set(first_tokens.to_numpy())

In [12]:
def load_words_sentiment_dictionary(dictionary_path=None):
    """Read the word -> sentiment rating dictionary from a ';'-separated CSV file.

    Args:
        dictionary_path: path of the CSV file. Defaults to
            ``words_all_full_rating_utf_8.csv`` under ``RESOURCES_PATH``.

    Returns:
        dict: maps the 'Words' column to the 'average rate' column converted to float.
        If a word occurs more than once, the last row wins.
    """
    if dictionary_path is None:
        dictionary_path = os.path.join(RESOURCES_PATH, 'words_all_full_rating_utf_8.csv')
    # Explicit UTF-8 (as the default file name advertises) instead of the platform
    # default codec; newline='' is what the csv module expects from its input file.
    with open(dictionary_path, encoding='utf-8', newline='') as dict_file:
        sentiment_reader = csv.DictReader(dict_file, delimiter=';', quotechar='"')
        return {row['Words']: float(row['average rate']) for row in sentiment_reader}

def get_word_sentiment(sentiment_dict, word):
    """Look up ``word`` in ``sentiment_dict``; words that are not present rate 0.0."""
    return sentiment_dict.get(word, 0.0)

In [13]:
def has_uppercase_word(text):
    """Tell whether any space-separated chunk of ``text`` is upper-case and longer than one character."""
    return any(len(chunk) > 1 and chunk.isupper() for chunk in text.split(' '))

def get_pos_percent(pos_predicate, poses):
    """Return the share of tagged tokens whose tag satisfies ``pos_predicate``.

    Args:
        pos_predicate: callable applied to the second element of every pair.
        poses: sequence of ``(token, tag)`` pairs, or a string holding the Python
            literal of such a sequence.

    Returns:
        float: matching pairs divided by the total number of pairs; 0.0 when there
        are no pairs at all (previously a ZeroDivisionError).
    """
    parsed_poses = poses
    if isinstance(poses, str):
        try:
            parsed_poses = ast.literal_eval(poses)
        except (ValueError, SyntaxError, TypeError):
            # Not a parsable literal: keep the raw value, as the old bare except did,
            # but without swallowing KeyboardInterrupt / SystemExit.
            pass
    if len(parsed_poses) == 0:
        return 0.0
    pos_count = sum(1 for p in parsed_poses if pos_predicate(p[1]))
    return pos_count / len(parsed_poses)

def get_dictionary_count(poses, dictionary):
    """Count how many items of ``poses`` are contained in ``dictionary``.

    Args:
        poses: sequence of items (this notebook passes token or lemma lists), or a
            string holding the Python literal of such a sequence.
        dictionary: container supporting ``in``.

    Returns:
        int: number of items found in ``dictionary``; repeated items count every time.
    """
    parsed_poses = poses
    if isinstance(poses, str):
        try:
            parsed_poses = ast.literal_eval(poses)
        except (ValueError, SyntaxError, TypeError):
            # Not a parsable literal: keep the raw value, as the old bare except did,
            # but without swallowing KeyboardInterrupt / SystemExit.
            pass
    return sum(1 for item in parsed_poses if item in dictionary)

def get_average_word_length(tokens):
    """Return the mean length of the items in ``tokens``.

    Args:
        tokens: sequence of strings, or a string holding the Python literal of such
            a sequence.

    Returns:
        float: mean of ``len(token)``; 0.0 for an empty sequence (``np.mean`` of an
        empty list would give NaN together with a RuntimeWarning).
    """
    parsed_tokens = tokens
    if isinstance(tokens, str):
        try:
            parsed_tokens = ast.literal_eval(tokens)
        except (ValueError, SyntaxError, TypeError):
            # Not a parsable literal: keep the raw value, as the old bare except did,
            # but without swallowing KeyboardInterrupt / SystemExit.
            pass
    lengths = [len(token) for token in parsed_tokens]
    if not lengths:
        return 0.0
    return np.mean(lengths)

def get_lemmas(text, mystem, stop_words):
    """Lemmatize ``text`` and drop stop words, whitespace and punctuation tokens.

    Args:
        text: raw text; it is lower-cased before lemmatization.
        mystem: object exposing ``lemmatize(str)`` that returns a list of string tokens.
        stop_words: container of tokens to discard.

    Returns:
        list: the remaining tokens, in their original order and unstripped.
    """
    punctuation = set(string.punctuation)
    lemmas = []
    for token in mystem.lemmatize(text.lower()):
        if token in stop_words:
            continue
        # Character-wise test: the former ``token.strip() not in string.punctuation``
        # was a substring check, so runs such as '!!!' or '...' were kept as lemmas.
        # all() is True for an empty string, so whitespace-only tokens are dropped too.
        if all(char in punctuation for char in token.strip()):
            continue
        lemmas.append(token)
    return lemmas

def get_sentiment(sentiment_dict, lemmas):
    """Return the mean sentiment rating of ``lemmas``.

    Args:
        sentiment_dict: mapping word -> rating, looked up through ``get_word_sentiment``.
        lemmas: sequence of words, or a string holding the Python literal of such
            a sequence.

    Returns:
        float: mean rating over all lemmas; 0.0 for an empty sequence (``np.mean`` of
        an empty list would give NaN together with a RuntimeWarning).
    """
    parsed_lemmas = lemmas
    if isinstance(lemmas, str):
        try:
            parsed_lemmas = ast.literal_eval(lemmas)
        except (ValueError, SyntaxError, TypeError):
            # Not a parsable literal: keep the raw value, as the old bare except did,
            # but without swallowing KeyboardInterrupt / SystemExit.
            pass
    ratings = [get_word_sentiment(sentiment_dict, word) for word in parsed_lemmas]
    if not ratings:
        return 0.0
    return np.mean(ratings)

class TextualFeaturesExtracter:
    """Computes per-text features for the dataset and caches them as a CSV file.

    On construction the lexical resources are loaded and ``_build_data`` runs: if
    ``processed_data.csv`` is missing under ``RESOURCES_PATH`` the features are computed
    from ``data`` and written there; the file is then read back and split into the
    feature matrix ``X``, the labels ``y`` (column 'FinalScore1') and the parsed
    'Score' lists.

    NOTE(review): when the cache file already exists it is used as-is, whatever ``data``
    was passed in, while ``get_texts`` still returns the texts of ``data`` - make sure
    the cache was produced from the same dataset.
    """

    def __init__(self, data):
        """Copy ``data``, load the dictionaries and build (or load) the features.

        Args:
            data: DataFrame with at least the 'Text', 'Score', 'FinalScore1' and
                'Category' columns read by ``_build_data``.
        """
        self.data = data.copy()
        self.X = None
        self.y = None
        self.scores = None
        self.emotional_verbs = load_emotional_verbs()
        self.obscene_words = load_obscene_words()
        self.sentiment_dict = load_words_sentiment_dictionary()
        self.mystem = Mystem()
        self.stop_words = set(stopwords.words("russian"))
        self._build_data()

    def _build_data(self):
        """Compute the feature columns unless cached, then load X, y and scores from the cache."""
        processed_data_path = os.path.join(RESOURCES_PATH, 'processed_data.csv')
        if not os.path.exists(processed_data_path):
            self.data['! freq'] = self.data['Text'].apply(lambda x: x.count('!'))

            # ':‑)' and ':‑(' do not appear in the train dataset.
            # Raw strings: '\)' and '\(' are invalid escape sequences in ordinary
            # literals and trigger a warning on recent Python versions.
            for emoticon in [r'\) ', r':\)', r'=\)', r'\( ', r':\(', r'=\(']:
                emoticon_str = emoticon.replace('\\', '')
                pattern = re.compile(emoticon)
                self.data[f'{emoticon_str} freq'] = self.data['Text'].apply(
                    lambda x, pattern=pattern: len(pattern.findall(x)))

            self.data['upper case'] = self.data['Text'].apply(has_uppercase_word)
            self.data['tokens'] = self.data['Text'].apply(lambda text: word_tokenize(text.lower()))
            self.data['pos'] = self.data['tokens'].apply(lambda tokens: pos_tag(tokens, lang='rus'))

            self.data['lemmas'] = self.data['Text'].apply(
                lambda text: get_lemmas(text, self.mystem, self.stop_words))

            self.data['nouns percent'] = self.data['pos'].apply(
                lambda poses: get_pos_percent(lambda p: p == 'S', poses))
            self.data['adjectives percent'] = self.data['pos'].apply(
                lambda poses: get_pos_percent(lambda p: p[0] == 'A' and (len(p) == 1 or p[1] == '='), poses))
            self.data['verbs percent'] = self.data['pos'].apply(
                lambda poses: get_pos_percent(lambda p: p == 'V', poses))
            # Same value as '! freq' above; both columns are kept so the cached schema
            # and the feature matrix stay unchanged.
            self.data['exclamation mark count'] = self.data['Text'].apply(
                lambda text: text.count('!'))
            self.data['question mark count'] = self.data['Text'].apply(
                lambda text: text.count('?'))
            self.data['emotional verbs'] = self.data['lemmas'].apply(
                lambda p: get_dictionary_count(p, self.emotional_verbs))
            self.data['obscene words'] = self.data['tokens'].apply(
                lambda p: get_dictionary_count(p, self.obscene_words))
            self.data['average word length'] = self.data['tokens'].apply(
                get_average_word_length)
            self.data['average word sentiment'] = self.data['lemmas'].apply(
                lambda lemmas: get_sentiment(self.sentiment_dict, lemmas))
            self.data['is post'] = self.data['Category'].apply(
                lambda c: c == 'Post')

            # Let pandas write the file with an explicit encoding: going through
            # open(..., 'w') used the platform default codec, which need not match
            # the UTF-8 that pd.read_csv assumes when the cache is read back.
            self.data.to_csv(processed_data_path, index=False, encoding='utf-8')

        self.processed_data = pd.read_csv(processed_data_path)

        # 'Score' cells look like "[1, 0, 0]", which json.loads turns into lists.
        self.scores = self.processed_data['Score'].apply(json.loads)
        self.y = self.processed_data['FinalScore1'].to_numpy()
        # drop() already returns a new frame, so no extra copy is needed.
        self.X = self.processed_data.drop(columns=['Text', 'FinalScore1', 'Score',
                                                   'tokens', 'lemmas', 'pos', 'Category'])

    def get_Xy(self):
        """Return the feature DataFrame and the label array as ``(X, y)``."""
        return self.X, self.y

    def get_scores(self):
        """Return the Series of parsed 'Score' values."""
        return self.scores

    def get_texts(self):
        """Return the 'Text' column of the input data as a NumPy array."""
        return self.data['Text'].to_numpy()

In [14]:
# Build (or load from the CSV cache) the feature matrix, the labels and the raw texts.
builder = TextualFeaturesExtracter(data)
X, y = builder.get_Xy()
texts = builder.get_texts()

In [15]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,! freq,) freq,:) freq,=) freq,( freq,:( freq,=( freq,upper case,nouns percent,adjectives percent,verbs percent,emotional verbs,obscene words,average word length,exclamation mark count,question mark count,average word sentiment,is post
0,0,0,0,0,0,0,0,True,0.389831,0.152542,0.084746,0,0,6.559322,0,0,0.0,True
1,0,0,0,0,0,0,0,False,0.25731,0.128655,0.134503,0,0,5.725146,0,0,0.020833,True
2,0,0,0,0,0,0,0,False,0.4,0.145455,0.090909,0,1,5.872727,0,0,-0.05,True
3,0,0,0,0,0,0,0,True,0.453552,0.087432,0.114754,0,2,5.885246,0,0,-0.03937,True
4,0,0,0,0,0,0,0,False,0.175,0.075,0.1625,0,0,5.0875,0,0,0.02381,True


In [16]:
# Seed shared by the train/test split and the classifiers below.
RANDOM_STATE = 23923

In [17]:
# Silence every warning from here on.
# NOTE(review): this would also hide any convergence warnings from the models
# below - consider a narrower filter.
warnings.filterwarnings('ignore')

In [18]:
# Shuffled 80/20 split; features, labels and raw texts are split together so that
# their rows stay aligned.
X_train, X_test, y_train, y_test, texts_train, texts_test = train_test_split(
    X, y, texts,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Basic Logistic Regression Model

In [19]:
# Fit logistic regression on the hand-crafted features only, then report accuracy
# and macro F1 for the train and the test split.
clf = LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)
clf.fit(X_train, y_train)
for accuracy_label, f1_label, split_X, split_y in (
        ('Train accuracy:', 'Train macro F1:', X_train, y_train),
        ('Test accuracy:', 'Test macro F1:', X_test, y_test)):
    print(accuracy_label, clf.score(split_X, split_y))
    print(f1_label, f1_score(split_y, clf.predict(split_X), average='macro'))

Train accuracy: 0.5878232072347471
Train macro F1: 0.4318736251053181
Test accuracy: 0.5919510952623536
Test macro F1: 0.4327507860941841


## Basic Logistic Regression Model + TF-IDF

In [45]:
# TF-IDF over the raw texts: the vocabulary is fitted on the training texts only
# and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(min_df=10, max_df=0.5, max_features=1000)
X_texts_train = vectorizer.fit_transform(texts_train)
X_texts_test = vectorizer.transform(texts_test)

In [63]:
# Stack the hand-crafted features next to the dense TF-IDF matrix.
# dtype=float is requested explicitly: the feature frame mixes bool and numeric
# columns, and to_numpy() without a dtype then falls back to an object array, which
# would turn the whole stacked matrix into boxed Python objects.
X_train_combined = np.hstack((X_train.to_numpy(dtype=float), X_texts_train.toarray()))
X_test_combined = np.hstack((X_test.to_numpy(dtype=float), X_texts_test.toarray()))

In [64]:
# Fit logistic regression on the combined (hand-crafted + TF-IDF) features, then
# report accuracy and macro F1 for the train and the test split.
clf = LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)
clf.fit(X_train_combined, y_train)
for accuracy_label, f1_label, split_X, split_y in (
        ('Train accuracy:', 'Train macro F1:', X_train_combined, y_train),
        ('Test accuracy:', 'Test macro F1:', X_test_combined, y_test)):
    print(accuracy_label, clf.score(split_X, split_y))
    print(f1_label, f1_score(split_y, clf.predict(split_X), average='macro'))

Train accuracy: 0.6523585105931304
Train macro F1: 0.5276122600375901
Test accuracy: 0.6130073017490236
Test macro F1: 0.4797736508285406
