# Classification Notebook

In [None]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


## Import Dataset

In [None]:
# Load the filtered dataset and expose its 'text_filtered' column under the name 'text',
# which is the column name the vectorizer helpers below read.
data = pd.read_csv('datasets/small_data_filtered.csv').rename(
    columns={'text_filtered': 'text'}
)

## Fit and eval model with input data

In [None]:
def train_test_model(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place and returned inside the result.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels the predictions are compared against.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` (the last three weighted-averaged), ``'confusion_matrix'``,
        ``'model'`` (the fitted estimator) and ``'y_pred'`` (test predictions).
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    # These three scorers are all called the same way, with weighted averaging.
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    report = {'accuracy': accuracy_score(Y_test, predictions)}
    for metric_name, scorer in weighted_scorers:
        report[metric_name] = scorer(Y_test, predictions, average='weighted')
    report['confusion_matrix'] = confusion_matrix(Y_test, predictions)
    report['model'] = model
    report['y_pred'] = predictions
    return report

In [None]:
def display_confusion_matrix(matrix):
    """Render ``matrix`` as an annotated heatmap and show the figure.

    Args:
        matrix: Confusion matrix of integer counts (cells are formatted
            with ``'d'``).
    """
    text_style = {'fontsize': 20}

    # One 10x10 figure; the heatmap is drawn onto its single axes.
    _, axes = plt.subplots(figsize=(10, 10))
    sns.heatmap(matrix, ax=axes, annot=True, fmt='d')

    # Title and axis captions all use the same font size.
    axes.set_title('Confusion Matrix', **text_style)
    axes.set_xlabel('Predicted', **text_style)
    axes.set_ylabel('Actual', **text_style)

    plt.show()


## CountVectorizer

In [None]:
def countVectorizerAux(data, ngram_range_max, random_state=42):
    """Build train/test bag-of-words count features from ``data``.

    The rows are split 80/20 first and the ``CountVectorizer`` is fitted on
    the training texts only; the test texts are then transformed with that
    fitted vocabulary. Fitting on the full corpus before splitting would let
    the held-out rows shape the feature space (train/test leakage).

    Args:
        data: Table-like object with a ``'text'`` column (documents) and a
            ``'label'`` column (targets).
        ngram_range_max: Largest n-gram size; word n-grams of size 1 up to
            this value are counted.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible between runs (and identical for every call made on
            the same ``data``). Pass ``None`` to get a fresh random split.

    Returns:
        dict with the keys ``'X_train'`` and ``'X_test'`` (dense count
        arrays) and ``'y_train'`` and ``'y_test'`` (the matching labels).
    """
    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.20, random_state=random_state
    )

    # NOTE(review): lowercase=False is combined with the built-in English stop
    # list; capitalised stop words are presumably not filtered - confirm this
    # is intended.
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
        stop_words='english',
        ngram_range=(1, ngram_range_max),
    )

    # Learn the vocabulary from the training texts only, then reuse it as-is
    # for the test texts. Dense arrays are kept so existing callers see the
    # same array type as before.
    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

## TfidfVectorizer

In [None]:
def tfidfVectorizerAux(data, ngram_range_max, random_state=42):
    """Build train/test TF-IDF features from ``data``.

    The rows are split 80/20 first and the ``TfidfVectorizer`` is fitted on
    the training texts only; the test texts are then transformed with the
    fitted vocabulary and IDF weights. Fitting on the full corpus before
    splitting would let the held-out rows influence both the vocabulary and
    the IDF weights (train/test leakage).

    Args:
        data: Table-like object with a ``'text'`` column (documents) and a
            ``'label'`` column (targets).
        ngram_range_max: Largest n-gram size; word n-grams of size 1 up to
            this value are used.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible between runs (and identical for every call made on
            the same ``data``). Pass ``None`` to get a fresh random split.

    Returns:
        dict with the keys ``'X_train'`` and ``'X_test'`` (dense TF-IDF
        arrays) and ``'y_train'`` and ``'y_test'`` (the matching labels).
    """
    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.20, random_state=random_state
    )

    vectorizer = TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        stop_words='english',
        ngram_range=(1, ngram_range_max),
    )

    # Learn vocabulary and IDF weights from the training texts only, then
    # apply them unchanged to the test texts. Dense arrays are kept so
    # existing callers see the same array type as before.
    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

## Results

### CountVectorized

#### NB 

##### Unigram

In [None]:
# Naive Bayes on unigram count features.
countVectorizerResults = countVectorizerAux(data, ngram_range_max=1)
print(countVectorizerResults['X_train'])

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Naive Bayes on count features built from unigrams and bigrams.
countVectorizerResults = countVectorizerAux(data, ngram_range_max=2)

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Naive Bayes on count features built from unigrams up to trigrams.
countVectorizerResults = countVectorizerAux(data, ngram_range_max=3)

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

#### DTC

##### Unigram

In [None]:
# Decision tree on unigram count features.
vectorizerResults = countVectorizerAux(data, ngram_range_max=1)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Decision tree on count features built from unigrams and bigrams.
vectorizerResults = countVectorizerAux(data, ngram_range_max=2)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Decision tree on count features built from unigrams up to trigrams.
vectorizerResults = countVectorizerAux(data, ngram_range_max=3)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

### TFIDF Vectorized

#### NB

##### Unigram

In [None]:
# Naive Bayes on unigram TF-IDF features.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=1)

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Naive Bayes on TF-IDF features built from unigrams and bigrams.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=2)

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Naive Bayes on TF-IDF features built from unigrams up to trigrams.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=3)

# The split pieces are passed in the order train_test_model expects.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

### DTC (TFIDF Vectorized)

#### Unigram

In [None]:
# Decision tree on unigram TF-IDF features.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=1)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

#### Bigram

In [None]:
# Decision tree on TF-IDF features built from unigrams and bigrams.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=2)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])

#### Trigram

In [None]:
# Decision tree on TF-IDF features built from unigrams up to trigrams.
vectorizerResults = tfidfVectorizerAux(data, ngram_range_max=3)

# NOTE: despite its name, `results_nb_simple` holds the decision-tree results in this cell.
results_nb_simple = train_test_model(
    DecisionTreeClassifier(),
    *(vectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)

# Scalar metrics on one line, then the raw matrix, then its heatmap.
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])
display_confusion_matrix(results_nb_simple['confusion_matrix'])