<a href="https://colab.research.google.com/github/GaoangLiu/ipynb/blob/master/Quora_Insincere_Questions_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Load packages 
import math
import re
import os
import timeit
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import logging
import time

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
logging.basicConfig(format='[%(asctime)s %(levelname)8s] %(message)s', level=logging.INFO, datefmt='%m-%d %H:%M:%S')

# Get data: download the zip archive and unpack it into the working directory.
# NOTE(review): later cells read train.csv, test.csv and sample_submission.csv,
# presumably all shipped inside quora.zip — confirm.
# Remove CSV files already present in the working directory.
! rm *.csv
# Download the archive and store it as quora.zip.
! wget -O quora.zip bwg.140714.xyz:8000/quora.zip 
# Extract the archive.
! unzip quora.zip 
# List the working directory to show what is now available.
! ls 

In [0]:
# Base class for classifier
class Classifier():
    """Base class for the bag-of-words text classifiers in this notebook.

    Subclasses override ``build_model`` to assign an estimator to
    ``self.model``. ``pipeline`` then loads the CSV files, vectorizes the
    ``question_text`` column, fits and evaluates the model, and writes a
    submission file.

    Attributes:
        train, test: DataFrames populated by ``load_data``.
        model: estimator assigned by ``build_model`` (``None`` until then).
        vector_train, vector_test: matrices populated by ``countvectorize``.
    """

    def __init__(self):
        self.train = None
        self.test = None
        self.model = None
        self.vector_train = None
        self.vector_test = None

    @staticmethod
    def _read_csv(path):
        """Read one CSV file, warning about and skipping rows that cannot be parsed."""
        try:
            # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
            # ``on_bad_lines='warn'`` is its replacement (skip the row, emit a warning).
            return pd.read_csv(path, engine='python', encoding='utf-8', on_bad_lines='warn')
        except TypeError:
            # pandas older than 1.3 does not accept ``on_bad_lines``.
            return pd.read_csv(path, engine='python', encoding='utf-8', error_bad_lines=False)

    def load_data(self, train_file='train.csv', test_file='test.csv'):
        """Load the train and test CSV files into ``self.train`` and ``self.test``.

        Args:
            train_file: path of the training CSV.
            test_file: path of the test CSV.

        Returns:
            None. The DataFrames are stored on the instance.
        """
        self.train = self._read_csv(train_file)
        self.test = self._read_csv(test_file)
        logging.info('CSV data loaded')

    def countvectorize(self):
        """Turn ``question_text`` of both splits into token-count matrices.

        The vocabulary is learned from the training questions only, and the
        same vocabulary is applied to the test questions. Results are stored
        in ``self.vector_train`` and ``self.vector_test``.
        """
        vectorizer = CountVectorizer()
        self.vector_train = vectorizer.fit_transform(self.train.question_text)
        self.vector_test = vectorizer.transform(self.test.question_text)
        logging.info("Train & test text tokenized")

    def build_model(self):
        """Hook for subclasses: assign an estimator to ``self.model``.

        The base implementation does nothing and leaves ``self.model`` as is.
        """
        pass

    def run_model(self):
        """Fit ``self.model``, report validation metrics and predict the test set.

        The training matrix is split 80/20 (fixed ``random_state``); the model
        is fitted on the larger part and evaluated on the held-out part.

        Returns:
            Predicted labels for ``self.vector_test``, one per test row, so the
            result lines up with the rows of the submission file.
        """
        # Choose your own classifier: self.model and run it
        logging.info(f"{self.__class__.__name__} starts running.")
        labels = self.train.target
        x_train, x_val, y_train, y_val = train_test_split(
            self.vector_train, labels, test_size=0.2, random_state=2090)
        self.model.fit(x_train, y_train)
        val_preds = self.model.predict(x_val)

        logging.info(f"Accuracy score: {accuracy_score(y_val, val_preds)}")
        logging.info("Confusion matrix: ")
        print(confusion_matrix(y_val, val_preds))
        print("Classificaiton report:\n",
              classification_report(y_val, val_preds, target_names=["Sincere", "Insincere"]))

        # The validation predictions above are only used for reporting; callers
        # write the returned values into the submission, which has one row per
        # test question, so the test matrix is what must be predicted here.
        return self.model.predict(self.vector_test)

    def save_predictions(self, y_preds):
        """Write ``y_preds`` into the ``prediction`` column of a submission CSV.

        Reads ``sample_submission.csv`` from the working directory as the
        template and writes ``submission_<ClassName>.csv`` next to it.

        Args:
            y_preds: one prediction per row of ``sample_submission.csv``.
        """
        out_path = f"submission_{self.__class__.__name__}.csv"
        sub = pd.read_csv("sample_submission.csv")
        sub['prediction'] = y_preds
        sub.to_csv(out_path, index=False)
        # Log the file that was actually written.
        logging.info(f'Prediction exported to {out_path}')

    def pipeline(self):
        """Run load -> vectorize -> build -> fit/predict -> save and log the elapsed time."""
        # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
        s_time = time.perf_counter()
        self.load_data()
        self.countvectorize()
        self.build_model()
        self.save_predictions(self.run_model())
        logging.info(f"Program running for {time.perf_counter() - s_time} seconds")

class C_Bayes(Classifier):
    """Classifier variant that uses a multinomial naive Bayes estimator."""

    def build_model(self):
        """Create a default ``MultinomialNB``, store it on ``self.model`` and return it."""
        estimator = MultinomialNB()
        self.model = estimator
        return estimator

# Logistic Regression 
class C_LR(Classifier):
    """Classifier variant that uses logistic regression."""

    def build_model(self):
        """Create the ``LogisticRegression`` estimator, store it on ``self.model`` and return it."""
        settings = dict(n_jobs=10, solver='lbfgs', C=0.1, verbose=1)
        estimator = LogisticRegression(**settings)
        self.model = estimator
        return estimator

class C_SVM(Classifier):
    """Classifier variant that uses ``svm.SVC`` on a random subsample of the training rows."""

    # Upper bound on the number of training rows kept by ``load_data``.
    SAMPLE_SIZE = 100000
    # Seed for the subsample so repeated runs train on the same rows
    # (same value the notebook uses for its train/validation split).
    RANDOM_STATE = 2090

    def load_data(self, train_file='train.csv', test_file='test.csv'):
        """Load both CSV files, then keep at most ``SAMPLE_SIZE`` random training rows.

        Loading is delegated to ``Classifier.load_data`` so the CSV-reading
        options live in one place. The test set is kept in full.

        Args:
            train_file: path of the training CSV.
            test_file: path of the test CSV.
        """
        super().load_data(train_file, test_file)
        # DataFrame.sample raises when asked for more rows than exist, so cap
        # the request at the number of rows actually loaded.
        n_rows = min(self.SAMPLE_SIZE, len(self.train))
        self.train = self.train.sample(n_rows, random_state=self.RANDOM_STATE)

    def build_model(self):
        """Create a default ``svm.SVC``, store it on ``self.model`` and return it."""
        self.model = svm.SVC()
        return self.model

class C_Ensemble(Classifier):
    """Combine naive Bayes, logistic regression and SVC with an any-positive vote."""

    # Number of training rows the SVC is fitted on (at most).
    SVC_SAMPLE_SIZE = 10000
    # Seed for the SVC subsample so repeated runs are comparable.
    RANDOM_STATE = 2090

    def ensemble(self):
        """Fit the three models, vote on the test rows and save the submission.

        Each model is fitted and evaluated through ``run_model`` and then asked
        to predict the test matrix. A test row is labelled 1 when at least one
        model predicts 1. Before the SVC is fitted, the data is reloaded,
        subsampled to ``SVC_SAMPLE_SIZE`` rows and re-vectorized.
        """
        s_time = time.perf_counter()
        self.load_data()
        self.countvectorize()

        nb = MultinomialNB()
        lr = LogisticRegression(n_jobs=10, solver='saga', C=0.1, verbose=1)
        svc = svm.SVC()

        # One vote counter per test row. This must be an ndarray: ``+=`` on a
        # Python list would append the predictions instead of adding them.
        votes = np.zeros(self.test.shape[0], dtype=int)
        for m in (nb, lr, svc):
            self.model = m
            if m is svc:
                # SVC is fitted on a small subsample; the vectorizer is refitted
                # so train and test matrices share the subsample's vocabulary.
                self.load_data()
                n_rows = min(self.SVC_SAMPLE_SIZE, len(self.train))
                self.train = self.train.sample(n_rows, random_state=self.RANDOM_STATE)
                self.countvectorize()
            # Fit the model and print its validation report.
            self.run_model()
            # Predict the test rows explicitly so every vote lines up with
            # ``self.test`` no matter which split run_model's return value covers.
            votes += np.asarray(self.model.predict(self.vector_test), dtype=int)

        all_preds = (votes > 0).astype(int)
        self.save_predictions(all_preds)
        logging.info(f"Program running for {time.perf_counter() - s_time} seconds")


class Helper():
    """Utilities for tuning a fitted binary classifier."""

    def locate_threshold(self, model, x_val, y_val, steps=100):
        """Scan decision thresholds and report the one with the highest F1 score.

        The positive-class probability (column 1 of
        ``model.predict_proba(x_val)``) is compared against ``i / steps`` for
        ``i = 0 .. steps - 1``. Every threshold's F1 score is printed together
        with its change from the previous threshold, and the F1-vs-threshold
        curve is plotted.

        Args:
            model: fitted estimator exposing ``predict_proba``.
            x_val: validation features passed to ``predict_proba``.
            y_val: true labels for ``x_val``.
            steps: number of evenly spaced thresholds in [0, 1) to evaluate.

        Returns:
            ``(best_threshold, best_f1)``. When several thresholds reach the
            same best score, the highest of them is returned.

        Raises:
            ValueError: if ``steps`` is smaller than 1.
        """
        if steps < 1:
            raise ValueError("steps must be a positive integer")

        # Extract the positive-class column once; each threshold is then a
        # single vectorized comparison instead of a Python loop over all rows.
        pos_probs = np.asarray(model.predict_proba(x_val))[:, 1]
        best_threshold = best_f1 = pre_f1 = 0
        history = []

        for i in range(steps):
            threshold = i / steps
            y2_preds = (pos_probs >= threshold).astype(int)
            cur_f1 = f1_score(y_val, y2_preds)
            # Record the threshold itself (not the loop index) so the plot's
            # x-axis matches its 'Threshold' label.
            history.append((threshold, cur_f1))
            symbol = '+' if cur_f1 >= pre_f1 else '-'
            print("Threshold {:6.4f}, f1_score: {:<0.8f}  {} {:<0.6f} ".format(threshold, cur_f1, symbol, abs(cur_f1 - pre_f1)))
            pre_f1 = cur_f1

            if cur_f1 >= best_f1:
                best_f1 = cur_f1
                best_threshold = threshold

        print(f"Best f1 score {best_f1}, best threshold {best_threshold}")
        plt.xlabel('Threshold')
        plt.ylabel('f1_score')
        plt.plot(*zip(*history))
        return best_threshold, best_f1

## CNN
We've tried linear models; the best private `f1_score` we got is 0.62166. Now we try a neural network.

In [0]:
class C_NN(Classifier):
    """Placeholder for a neural-network classifier; no model is built yet."""

    def build_model(self):
        """Stub: does nothing, leaves ``self.model`` untouched and returns ``None``."""
        return None


# Logistic Regression 
Besides NB, we also tried LR to maximize the `f1_score`.

2020.05.09: 
* With `threshold = 0.21`, we got public score = 0.60627 and private score = 0.62161. Both were the best scores we had obtained at that point.
<img class="center" src="https://i.loli.net/2020/05/09/eE4KqloW52FDSdc.png" alt="LR max f1_score" width=850>
* With `threshold=0.25`, the public score decreased (`public_score=0.60619`), but the private score increased (`private_score=0.62166`).


In [0]:
# Load and vectorize the data through the C_LR helper; the estimator itself is
# created below rather than via c.build_model().
c = C_LR()
c.load_data()
# Optional: subsample for quicker experiments, e.g. c.train = c.train.sample(100000)
c.countvectorize()
labels = c.train.target
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(c.vector_train, labels, test_size=0.2, random_state=2090)

# NOTE(review): this uses solver='saga' while C_LR.build_model uses 'lbfgs' — confirm which is intended.
model = LogisticRegression(n_jobs=10, solver='saga', C=0.1, verbose=1)
model.fit(x_train, y_train)
y_preds = model.predict(x_val)

# Report validation metrics. `model`, `x_val` and `y_val` are reused by the threshold search in the next cell.
print(f"Accuracy score: {accuracy_score(y_val, y_preds)}")
print(f"Confusion matrix: ") 
print(confusion_matrix(y_val, y_preds))
print("Classificaiton report:\n", classification_report(y_val, y_preds, target_names=["Sincere", "Insincere"]))

In [0]:
# Find the threshold that maximizes f1_score on the validation split of the LR model above
Helper().locate_threshold(model, x_val, y_val)


# Naive Bayes
In this contest, submissions are evaluated on the **F1 Score** between the predicted and the observed targets, so our ultimate goal is to maximize the `f1_score`.

2020.05.09: 
* Naive Bayes achieved its maximum score `f1_score = 0.56456695` when the `threshold` is set to `0.726`. This results in a public score = 0.56452 and a private score = 0.56706. The public score is the best we've got with NB, but the private score is second to 0.56889, which we got when the `threshold` was set to 0.6.



In [0]:
# Load and vectorize the data, then build the naive Bayes estimator (stored on b.model).
b = C_Bayes()
b.load_data()
b.countvectorize()
b.build_model()
labels = b.train.target
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(b.vector_train, labels, test_size=0.2, random_state=2090)
b.model.fit(x_train, y_train)
y_preds = b.model.predict(x_val)

# Report validation metrics. `b.model`, `x_val` and `y_val` are reused by the threshold search in the next cell.
logging.info(f"Accuracy score: {accuracy_score(y_val, y_preds)}")
logging.info(f"Confusion matrix: ") 
print(confusion_matrix(y_val, y_preds))
print("Classificaiton report:\n", classification_report(y_val, y_preds, target_names=["Sincere", "Insincere"]))



In [0]:
# Find the threshold that maximizes f1_score on the validation split of the NB model above
Helper().locate_threshold(b.model, x_val, y_val)