# Feedback classification task
### Libraries used

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

### Paths to files

In [2]:
# Seed NumPy's global RNG up front; the later shuffled split passes no explicit
# random_state, so it presumably relies on this seed for repeatability.
np.random.seed(500)

# Tab-separated input files, given relative to the notebook's working directory.
path_to_train_data = 'data/products_sentiment_train.tsv'
path_to_test_data = 'data/products_sentiment_test.tsv'

In [5]:
import nltk

# Fetch the NLTK resources used by the preprocessing step: the tokenizer model,
# the POS tagger, the stop-word list and WordNet (for lemmatization).
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ilyabasharov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ilyabasharov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ilyabasharov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ilyabasharov/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Text preprocessing: removing uninformative tokens and lemmatizing

In [6]:
def processing_data(path):
    """Load a tab-separated feedback file and build a lemmatized text column.

    Steps: read the file, drop rows whose ``text`` is missing, lowercase and
    tokenize each text, then keep only alphabetic, non-stop-word tokens and
    lemmatize each of them with a WordNet part of speech derived from the
    first letter of its POS tag (J -> adjective, V -> verb, R -> adverb,
    anything else -> noun).

    Parameters
    ----------
    path : str
        Path to a UTF-8, tab-separated file that has a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with rows lacking ``text`` removed and the index
        renumbered from 0. ``text`` holds the lowercase token list of each
        row and ``text_final`` holds ``str()`` of the list of kept lemmas.
    """
    data = pd.read_csv(path, encoding='utf-8', sep="\t")

    # Calling dropna(inplace=True) on the `text` column alone never removes rows
    # from the frame, so a missing text would still reach `.lower()` and crash.
    # Drop at frame level and renumber so row positions and labels stay aligned.
    data = data.dropna(subset=['text']).reset_index(drop=True)

    data['text'] = [word_tokenize(entry.lower()) for entry in data['text']]

    # Map the first letter of the POS tag to a WordNet POS; default to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Loop-invariant helpers: build them once instead of once per row / per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    final_texts = []
    for tokens in data['text']:
        lemmas = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in stop_words and word.isalpha()
        ]
        # Stored as the string form of the list, as the vectorizer expects text.
        final_texts.append(str(lemmas))

    # Single column assignment; also guarantees the column exists for an empty frame.
    data['text_final'] = final_texts

    return data

# Run the labelled and the unlabelled file through the same preprocessing
# (train first, then test).
train_data, test_data = map(processing_data, (path_to_train_data, path_to_test_data))

### Splitting into train and test sets

In [5]:
# Hold out 1% of the labelled rows as a small sanity-check set; the rows are
# shuffled before splitting and no explicit random_state is passed.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    train_data['text_final'],
    train_data['class'],
    test_size=0.01,
    shuffle=True,
)

### Encoding labels and computing TF-IDF features

In [6]:
# Learn the label -> integer mapping from the training labels only.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
# Reuse the fitted mapping for the hold-out labels. Refitting here (fit_transform)
# could assign different integers to the same classes, e.g. when the tiny
# hold-out set happens to contain only one class, which would corrupt the
# accuracy computed against the model's predictions.
Test_Y = Encoder.transform(Test_Y)

# Build a TF-IDF vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all labelled rows, including the
# hold-out rows in Test_X, so the hold-out accuracy is not fully independent
# of the fitted features. Confirm this is intended.
Tfidf_vect = TfidfVectorizer(max_features=5000).fit(train_data['text_final'])

# Project every split into the same feature space.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
test_data_X = Tfidf_vect.transform(test_data['text_final'])
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### Predicting with an SVM or an SGD classifier

In [7]:
# Baseline linear SVM with probability estimates enabled, trained on
# log1p-transformed TF-IDF features and scored on the hold-out split.
SVM = svm.SVC(
    C=0.01,
    kernel='linear',
    gamma='auto',
    probability=True,
    tol=1e-8,
)
SVM.fit(np.log1p(Train_X_Tfidf), Train_Y)

predictions_SVM = SVM.predict(np.log1p(Test_X_Tfidf))
svm_accuracy_percent = accuracy_score(predictions_SVM, Test_Y) * 100
print("SVM Accuracy Score -> ", svm_accuracy_percent)

SVM Accuracy Score ->  75.0


In [8]:
from sklearn.model_selection import GridSearchCV

# 30 candidate values of C, evenly spaced on a log scale from e^-3 to e^4.
param_grid = dict(C=np.exp(np.linspace(-3, 4, 30)))

# 5-fold cross-validated search over C, scored by negative log loss,
# using all available cores.
search_params = GridSearchCV(
    estimator=SVM,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
)
search_params.fit(np.log1p(Train_X_Tfidf), Train_Y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='linear', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=1e-08, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([4.97870684e-02, 6.33791276e-02, 8.06818...
       9.01722662e-01, 1.14789638e+00, 1.46127646e+00, 1.86021050e+00,
       2.36805505e+00, 3.01454310e+00, 3.83752484e+00, 4.88518373e+00,
       6.21885749e+00, 7.91662926e+00, 1.00778992e+01, 1.28292040e+01,
       1.63316256e+01, 2.07902216e+01, 2.64660312e+01, 3.36913585e+01,
       4.28892276e+01, 5.45981500e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_log_loss', verbose=0)

In [9]:
# Show the C value that achieved the best cross-validated score in the search.
search_params.best_params_

{'C': 0.7083424709523614}

In [14]:
# Refit the linear SVM with the C value reported by the grid search
# (the `best_params_` output of the recorded run).
SVM = svm.SVC(C=0.7083424709523614, kernel='linear', gamma='auto', probability=True, tol=0.00000001)
SVM.fit(np.log1p(Train_X_Tfidf), Train_Y)
# The model is trained on log1p-transformed TF-IDF features, so the test matrix
# must go through the same transform before scoring; feeding the raw TF-IDF
# values would evaluate the model on a different feature scale.
result = SVM.predict_proba(np.log1p(test_data_X))

In [11]:
from sklearn.linear_model import SGDClassifier

# Alternative linear model trained with SGD on the same log1p-transformed
# TF-IDF features, scored on the same hold-out split as the SVM.
classifier = SGDClassifier(
    loss='huber',
    penalty='l2',
    alpha=1e-5,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=10000000,
    tol=1e-5,
    shuffle=True,
    epsilon=0.01,
    n_jobs=-1,
    learning_rate='adaptive',
    eta0=0.01,
)
classifier.fit(np.log1p(Train_X_Tfidf), Train_Y)

predictions_classifier = classifier.predict(np.log1p(Test_X_Tfidf))
sgd_accuracy_percent = accuracy_score(predictions_classifier, Test_Y) * 100
print("classifier Accuracy Score -> ", sgd_accuracy_percent)

classifier Accuracy Score ->  75.0


### Obtaining class membership probabilities

In [12]:
# Class-membership probabilities for the unlabelled test set. SVM is fitted on
# log1p-transformed TF-IDF features, so the same transform is applied here;
# passing the raw TF-IDF matrix would score on a mismatched feature scale.
result = SVM.predict_proba(np.log1p(test_data_X))

### Submitting predictions to Kaggle

In [15]:
# Write the submission file: a header line, then one "row position,probability"
# line per test row. Column 1 of `result` is the probability of the second
# class in the classifier's class order (presumably the positive class; verify).
with open('new_solution.csv', 'w') as fout:
    print("Id,y", file=fout)
    for row_id, class_probabilities in enumerate(result):
        print(row_id, class_probabilities[1], sep=',', file=fout)