In [1]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

import sqlite3
import sqlalchemy

In [3]:
class Config:
    seed = 42
    sql_file = "../data/db.sql"
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    test_size = 0.3
    
config = Config()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True

In [5]:
init_random_seed(config.seed)

In [62]:
column_names = ["id", "tdate", "tmane", "ttext", "ttype", "trep", "tfav", "tstcount", "tfol", "tfrien", "listcount"]
positive_df = pd.read_csv(config.positive_file, sep=";", names=column_names, index_col=False)
negative_df = pd.read_csv(config.negative_file, sep=";", names=column_names, index_col=False)

In [63]:
positive_df.head()

Unnamed: 0,id,tdate,tmane,ttext,ttype,trep,tfav,tstcount,tfol,tfrien,listcount
0,408906692374446080,1386325927,pleease_shut_up,"@first_timee хоть я и школота, но поверь, у на...",1,0,0,0,7569,62,61
1,408906692693221377,1386325927,alinakirpicheva,"Да, все-таки он немного похож на него. Но мой ...",1,0,0,0,11825,59,31
2,408906695083954177,1386325927,EvgeshaRe,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1,0,1,0,1273,26,27
3,408906695356973056,1386325927,ikonnikova_21,"RT @digger2912: ""Кто то в углу сидит и погибае...",1,0,1,0,1549,19,17
4,408906761416867842,1386325943,JumpyAlex,@irina_dyshkant Вот что значит страшилка :D\nН...,1,0,0,0,597,16,23


In [64]:
positive_df.shape

(114911, 11)

In [8]:
%%time

con = sqlite3.connect("../data/full_dataset.db")

# Load the data into a DataFrame
df = pd.read_sql_query("SELECT * from sentiment", con)

CPU times: user 1min 41s, sys: 6.86 s, total: 1min 48s
Wall time: 1min 48s


In [9]:
df.head()

Unnamed: 0,id,tdate,tname,ttext,ttype,trep,trtw,tfav,tstcount,tfoll,tfrien,listcount
0,408906691322073088,1386325926,AndreyAF,Грустный суворовец с пятым айфоном.,2,0,0,0,25914,175,113,23
1,408906691338854400,1386325927,kydijamipam,RT @juvivenofara: ненавижу 10рублевый металл.....,2,0,2,0,372,67,63,0
2,408906691347226624,1386325927,zgamaset,RT @tourskidki_ru: Тур во Вьетнам (Нячанг) на ...,2,0,3,0,2738,2779,2979,3
3,408906691414331392,1386325927,my_wedding_ua,рппр,2,0,0,0,1,0,2,0
4,408906691430739968,1386325927,Ronisan12,"Добралась до ночлега, не хоромы, но переночева...",2,0,0,0,1347,12,6,0


In [10]:
df.shape

(17639674, 12)

In [13]:
df.to_pickle("full_dataset.pkl")

In [14]:
df = pd.read_pickle("full_dataset.pkl")

In [15]:
df.head()

Unnamed: 0,id,tdate,tname,ttext,ttype,trep,trtw,tfav,tstcount,tfoll,tfrien,listcount
0,408906691322073088,1386325926,AndreyAF,Грустный суворовец с пятым айфоном.,2,0,0,0,25914,175,113,23
1,408906691338854400,1386325927,kydijamipam,RT @juvivenofara: ненавижу 10рублевый металл.....,2,0,2,0,372,67,63,0
2,408906691347226624,1386325927,zgamaset,RT @tourskidki_ru: Тур во Вьетнам (Нячанг) на ...,2,0,3,0,2738,2779,2979,3
3,408906691414331392,1386325927,my_wedding_ua,рппр,2,0,0,0,1,0,2,0
4,408906691430739968,1386325927,Ronisan12,"Добралась до ночлега, не хоромы, но переночева...",2,0,0,0,1347,12,6,0


In [16]:
df.shape

(17639674, 12)