# Classification Notebook

In [1]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import pickle, spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

## Import Dataset

In [2]:
data = pd.read_csv('datasets/small_data_sample.csv')

#rename text_filtered to text
data.rename(columns = {'text_filtered':'text'}, inplace = True)

## Fit and eval model with input data

In [3]:
def train_test_model(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    results = {}
    results['accuracy'] = accuracy_score(Y_test, y_pred)
    results['precision'] = precision_score(Y_test, y_pred, average='weighted')
    results['recall'] = recall_score(Y_test, y_pred, average='weighted')
    results['f1'] = f1_score(Y_test, y_pred, average='weighted')
    results['confusion_matrix'] = confusion_matrix(Y_test, y_pred)
    results['model'] = model
    results['y_pred'] = y_pred
    return results

## CountVectorizer

In [5]:
def countVectorizerAux(data, ngram_range_max):
    vectorizer = CountVectorizer(analyzer='word', lowercase=False, stop_words='english', ngram_range=(1, ngram_range_max))
    X = vectorizer.fit_transform(data['text']).toarray()
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
    results = {}
    results['X_train'] = X_train
    results['X_test'] = X_test
    results['y_train'] = y_train
    results['y_test'] = y_test
    return results

## TfidfVectorizer

In [6]:
def tfidfVectorizerAux(data, ngram_range_max):
    vectorizer = TfidfVectorizer(analyzer='word', lowercase=True, stop_words='english', ngram_range=(1, ngram_range_max))
    X = vectorizer.fit_transform(data['text']).toarray()
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
    results = {}
    results['X_train'] = X_train
    results['X_test'] = X_test
    results['y_train'] = y_train
    results['y_test'] = y_test
    return results

## Results

### Results from preprocessing.ipynb exploration

In [16]:
# load small_data_sample.csv
small_data_sample = pd.read_csv('datasets/small_data_sample.csv')

In [8]:
# import pickled data
with open('datasets/pickle/train_features.pkl', 'rb') as f:
    train_features = pickle.load(f)

with open('datasets/pickle/train_features_embeddes.pkl', 'rb') as f:
    train_features_embeddes = pickle.load(f)

with open('datasets/pickle/train_labels.pkl', 'rb') as f:
    train_labels = pickle.load(f)
    
with open('datasets/pickle/test_features.pkl', 'rb') as f:
    test_features = pickle.load(f)
    
with open('datasets/pickle/test_features_embeddes.pkl', 'rb') as f:
    test_features_embeddes = pickle.load(f)
    
with open('datasets/pickle/test_labels.pkl', 'rb') as f:
    test_labels = pickle.load(f)

In [9]:
def differences(model, train_features, test_features):
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model)
    ])

    pipeline.fit(train_features, train_labels)

    # Predict the test data
    preds = pipeline.predict(test_features)

    feat_dict = {}
    feat_dict['actual'] = test_labels
    feat_dict['predicted'] = preds
    df_results = pd.DataFrame(feat_dict)

    return df_results

#### LR All features

In [17]:
results = differences(LogisticRegression(C=0.1, penalty='l2', solver='liblinear'), train_features, test_features)
results.to_csv('datasets/exploration/lr.csv')
results.head()

Unnamed: 0,actual,predicted
0,3,2
1,1,1
2,2,2
3,0,0
4,2,3


### SVM Embeddigns

In [18]:
results = differences(SVC(C=1.0, kernel='rbf'), train_features_embeddes, test_features_embeddes)
results.to_csv('datasets/exploration/svc.csv')
print('done')

done


### CountVectorized

In [7]:
countVectorizerResults = countVectorizerAux(data, 1)

#### NB 

In [8]:
results_nb_simple = train_test_model(
    MultinomialNB(), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.906375 0.9063295925819387 0.906375 0.9062452150820486
[[1816   59   95   54]
 [  23 1926   10    8]
 [  46   20 1704  187]
 [  72   13  162 1805]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

#### DTC

In [9]:
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.762125 0.7618556038210078 0.762125 0.761844000015797
[[1562  148  183  131]
 [ 135 1667   64  101]
 [ 181   80 1418  278]
 [ 148  113  341 1450]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

#### LR

In [11]:
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear', C=0.1), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.903875 0.9036895782766056 0.903875 0.9036574332288047
[[1803   58   99   64]
 [  30 1920    8    9]
 [  62   22 1684  189]
 [  68   22  138 1824]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

#### RF

In [12]:
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.87175 0.8717332045095789 0.87175 0.8713411485902552
[[1743   85  114   82]
 [  30 1884   20   33]
 [  61   48 1653  195]
 [  87   72  199 1694]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

#### XGB

In [8]:
results_nb_simple = train_test_model(
    XGBClassifier(max_depth=9, n_estimators=300), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

0.895125 0.8949652181059014 0.895125 0.8949741593935056
[[1713   68   90   59]
 [  32 1886   27   22]
 [  66   30 1803  169]
 [  78   29  169 1759]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

#### KNN

In [None]:
results_nb_simple = train_test_model(
    KNeighborsClassifier(n_neighbors=3, p=2, weights='distance'), 
    countVectorizerResults['X_train'], 
    countVectorizerResults['y_train'], 
    countVectorizerResults['X_test'], 
    countVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

### TFIDF Vectorizer

In [None]:
tfidfVectorizerResults = countVectorizerAux(data, 1)

#### NB

In [None]:
results_nb_simple = train_test_model(
    MultinomialNB(), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

#### DTC

In [None]:
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

#### LR

In [None]:
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear', c=0.1), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

#### RF

In [None]:
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

#### XGB

In [None]:
results_nb_simple = train_test_model(
    XGBClassifier(max_depth=9, n_estimators=300), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

#### KNN

In [None]:
results_nb_simple = train_test_model(
    KNeighborsClassifier(n_neighbors=3, p=2, weights='distance'), 
    tfidfVectorizerResults['X_train'], 
    tfidfVectorizerResults['y_train'], 
    tfidfVectorizerResults['X_test'], 
    tfidfVectorizerResults['y_test'],
    )
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])