In [87]:
import json
import os, re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier,LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [88]:
from sklearn.preprocessing import LabelEncoder

## Load data and preprocess

In [89]:
df = pd.read_csv('case_dataset_1.csv')

In [90]:
df.shape

(7931, 4)

In [91]:
df = df.dropna()

In [92]:
df.shape

(7922, 4)

In [93]:
transformed_df = df.copy()

In [94]:
le = LabelEncoder()
le.fit(transformed_df.theme)
transformed_df['theme'] = le.transform(transformed_df.theme)

le = LabelEncoder()
le.fit(transformed_df.category)
transformed_df['category'] = le.transform(transformed_df.category)

le = LabelEncoder()
le.fit(transformed_df.sphere)
transformed_df['sphere'] = le.transform(transformed_df.sphere)

In [95]:
transformed_df.head()

Unnamed: 0,sphere,category,theme,text
0,7,55,125,обращаются жильцы можем заверить что и др...
1,33,139,201,обращаюсь лица моей бабушки прожива...
2,27,124,119,связи коронавирусом в нашей стране и в нашем ...
3,9,45,192,больнице тракторной отсутствует горячая вода ...
4,28,63,129,работаю начальником отделения почтовой связи ...


In [96]:
from collections import Counter
Counter(transformed_df.category)

Counter({55: 1361,
         139: 55,
         124: 61,
         45: 1084,
         63: 10,
         20: 42,
         59: 250,
         16: 37,
         30: 1186,
         54: 37,
         8: 300,
         57: 155,
         114: 2,
         7: 55,
         22: 163,
         137: 434,
         122: 141,
         31: 63,
         36: 74,
         117: 217,
         37: 21,
         56: 135,
         97: 4,
         73: 114,
         62: 27,
         98: 78,
         106: 25,
         72: 116,
         24: 41,
         6: 46,
         39: 57,
         21: 206,
         44: 140,
         53: 51,
         32: 9,
         125: 16,
         84: 36,
         132: 19,
         83: 120,
         89: 80,
         119: 18,
         35: 8,
         40: 54,
         17: 26,
         2: 50,
         23: 31,
         9: 15,
         74: 6,
         52: 39,
         68: 19,
         112: 2,
         49: 6,
         33: 41,
         10: 24,
         48: 4,
         71: 13,
         86: 4,
         81: 1,

## Models for comparison

In [97]:
rs = 42
clf = LogisticRegression(random_state=rs)   
clf2 = RandomForestClassifier(random_state=rs, n_jobs =-1)
clf3 = SGDClassifier()
clf4 = SVC(random_state =rs)
clf5 = DecisionTreeClassifier(random_state=rs)
clf6 = ExtraTreeClassifier()
clf7 = GradientBoostingClassifier(random_state=rs)
clflist = [clf2, clf3]#, clf3]#, clf4, clf5, clf6, clf7]

### Utils

In [98]:
def calculate_scores(X_train, y_train, X_test, y_test):
    scores = []
    for classif in clflist:
        clf = Pipeline([

        ('vect', CountVectorizer(ngram_range=(1,3), analyzer='word', max_features=10000)),

        ('tfidf', TfidfTransformer(sublinear_tf=True)),

        ('clf', classif)])

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        print(classif)
        score = accuracy_score(y_test, predictions)
        print("F1-measure: {0:6.4f}".format(score))
        scores.append(score)
    return scores

## Calc metric for different targets

In [99]:
X = df["text"]
y = df["sphere"]
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.5, random_state=42)
scores1 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.6703
SGDClassifier()
F1-measure: 0.7236


In [100]:
X = df["text"]
y = df["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scores2 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.5549
SGDClassifier()
F1-measure: 0.5991


In [101]:
X = df["text"]
y = df["theme"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scores3 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.3805
SGDClassifier()
F1-measure: 0.4188


### Trasformed data 

In [102]:
X = transformed_df["text"]
y = transformed_df["sphere"]
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.5, random_state=42)
scores1 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.6703
SGDClassifier()
F1-measure: 0.7289


In [103]:
X = transformed_df["text"]
y = transformed_df["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scores2 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.5549
SGDClassifier()
F1-measure: 0.5963


In [104]:
X = transformed_df["text"]
y = transformed_df["theme"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
scores3 = calculate_scores(X_train, y_train, X_test, y_test)

RandomForestClassifier(n_jobs=-1, random_state=42)
F1-measure: 0.3805
SGDClassifier()
F1-measure: 0.4176


## Find best model

In [105]:
final_score = []
for idx in range(len(clflist)):
    final_score.append(scores1[idx] + scores2[idx] + scores3[idx])

In [106]:
print(clflist[np.argmax(final_score)])

SGDClassifier()


So the best model is: SGDClassifier()

In [107]:
final_score[np.argmax(final_score)] / 3

0.5809139106286292

## Preprocess for dirty data

In [108]:
df1 = df.copy()

In [109]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
preprocess = lambda text: ' '.join(tokenizer.tokenize(text.lower()))

text = 'How to be a grown-up at work: replace "I don\'t want to do that" with "Ok, great!".'
print("before:", text)
print("after:", preprocess(text))

before: How to be a grown-up at work: replace "I don't want to do that" with "Ok, great!".
after: how to be a grown-up at work : replace " i don't want to do that " with " ok , great ! " .


In [110]:
for i, text in enumerate(df1['text']):
    df1['text'][i] = preprocess(text)

In [111]:
# Any necessary preprocessing if needed
# YOUR CODE HERE
import nltk
from nltk import word_tokenize

In [112]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [113]:
for i, text in enumerate(df1['text']):
    df1['text'][i] = word_tokenize(text, language='russian')

In [114]:
df1.head()

Unnamed: 0,sphere,category,theme,text
0,Жилищно-коммунальное хозяйство,Многоквартирные дома,Нарушения в вопросах оплаты услуг ЖКХ,"[обращаются, жильцы, можем, заверить, что, и, ..."
1,Энергетика,Электроэнергетика,Организация электроснабжения населенных пункто...,"[обращаюсь, лица, моей, бабушки, проживающие, ..."
2,"Торговля, товары и услуги",Торговля,Нарушение санитарных требований к организациям...,"[связи, коронавирусом, в, нашей, стране, и, в,..."
3,Здравоохранение,Качество оказания медицинской помощи (в том чи...,Оказание медицинской помощи в стационаре,"[больнице, тракторной, отсутствует, горячая, в..."
4,Трудовые отношения,Нарушения в сфере охраны труда,Нарушения в сфере охраны труда,"[работаю, начальником, отделения, почтовой, св..."
