# Classification Notebook

In [None]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import pickle, spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer

## Import Dataset

In [None]:
# Read the sampled dataset and expose its 'text_filtered' column as 'text'.
data = (
    pd.read_csv('datasets/small_data_sampled.csv')
    .rename(columns={'text_filtered': 'text'})
)

## Fit and eval model with input data

In [None]:
def train_test_model(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, Y_train : training features and labels passed to ``model.fit``.
    X_test, Y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` (the last three
        weighted-averaged), ``'confusion_matrix'``, the fitted ``'model'`` and
        the test predictions under ``'y_pred'``.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    def weighted(metric):
        # Precision, recall and F1 all use the same weighted averaging.
        return metric(Y_test, predictions, average='weighted')

    return {
        'accuracy': accuracy_score(Y_test, predictions),
        'precision': weighted(precision_score),
        'recall': weighted(recall_score),
        'f1': weighted(f1_score),
        'confusion_matrix': confusion_matrix(Y_test, predictions),
        'model': model,
        'y_pred': predictions,
    }

In [None]:
def display_confusion_matrix(matrix):
    """Show ``matrix`` as an annotated seaborn heatmap on a 10x10 figure.

    The x axis is labelled 'Predicted', the y axis 'Actual', and cell values
    are printed with the integer format ``'d'``.
    """
    label_size = 20
    _, axes = plt.subplots(figsize=(10, 10))
    sns.heatmap(matrix, annot=True, fmt='d', ax=axes)

    # Axis labels and title all share the same font size.
    labelled = (
        (axes.set_xlabel, 'Predicted'),
        (axes.set_ylabel, 'Actual'),
        (axes.set_title, 'Confusion Matrix'),
    )
    for apply_label, text in labelled:
        apply_label(text, fontsize=label_size)
    plt.show()


## CountVectorizer

In [None]:
def countVectorizerAux(data, ngram_range_max, test_size=0.20, random_state=None):
    """Split ``data`` into train/test sets and build n-gram count features.

    The rows are split first and the vectorizer vocabulary is learned from the
    training texts only; the test texts are then transformed with that fitted
    vocabulary. Fitting on all rows before splitting would let information
    from the held-out rows shape the feature space.

    Parameters
    ----------
    data : DataFrame
        Must provide a ``'text'`` column (documents) and a ``'label'`` column.
    ngram_range_max : int
        Upper n-gram bound; features cover n-grams of size 1..``ngram_range_max``.
    test_size : float, default 0.20
        Passed to ``train_test_split``.
    random_state : int or None, default None
        Passed to ``train_test_split``; set an int for a reproducible split.

    Returns
    -------
    dict
        ``'X_train'`` / ``'X_test'`` as dense arrays and ``'y_train'`` /
        ``'y_test'`` as the matching label subsets.
    """
    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=test_size, random_state=random_state
    )
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
        stop_words='english',
        ngram_range=(1, ngram_range_max),
    )
    # Learn the vocabulary on the training texts only, then reuse it for the test texts.
    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

## TfidfVectorizer

In [None]:
def tfidfVectorizerAux(data, ngram_range_max, test_size=0.20, random_state=None):
    """Split ``data`` into train/test sets and build n-gram TF-IDF features.

    The rows are split first and the vectorizer (vocabulary and IDF weights)
    is fitted on the training texts only; the test texts are then transformed
    with that fitted vectorizer. Fitting on all rows before splitting would
    let the held-out rows influence the IDF weights.

    Parameters
    ----------
    data : DataFrame
        Must provide a ``'text'`` column (documents) and a ``'label'`` column.
    ngram_range_max : int
        Upper n-gram bound; features cover n-grams of size 1..``ngram_range_max``.
    test_size : float, default 0.20
        Passed to ``train_test_split``.
    random_state : int or None, default None
        Passed to ``train_test_split``; set an int for a reproducible split.

    Returns
    -------
    dict
        ``'X_train'`` / ``'X_test'`` as dense arrays and ``'y_train'`` /
        ``'y_test'`` as the matching label subsets.
    """
    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=test_size, random_state=random_state
    )
    vectorizer = TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        stop_words='english',
        ngram_range=(1, ngram_range_max),
    )
    # Fit vocabulary and IDF on the training texts only, then reuse them for the test texts.
    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

## Results

### Results from preprocessing.ipynb exploration

In [None]:
# Load datasets/small_data_sample.csv into `small_data_sample`.
# NOTE(review): the `data` frame loaded earlier reads a differently named file
# ('datasets/small_data_sampled.csv') — confirm both files are intended.
small_data_sample = pd.read_csv('datasets/small_data_sample.csv')

In [None]:
# Load the pickled train/test feature sets and labels from datasets/pickle/.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were generated by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


train_features = _load_pickle('datasets/pickle/train_features.pkl')
train_features_embeddes = _load_pickle('datasets/pickle/train_features_embeddes.pkl')
train_features_text_and_tokens = _load_pickle('datasets/pickle/train_features_text_and_tokens.pkl')
train_features_entities = _load_pickle('datasets/pickle/train_features_entities.pkl')
train_features_no_er = _load_pickle('datasets/pickle/train_features_no_er.pkl')
train_labels = _load_pickle('datasets/pickle/train_labels.pkl')

test_features = _load_pickle('datasets/pickle/test_features.pkl')
test_features_embeddes = _load_pickle('datasets/pickle/test_features_embeddes.pkl')
test_features_text_and_tokens = _load_pickle('datasets/pickle/test_features_text_and_tokens.pkl')
test_features_entities = _load_pickle('datasets/pickle/test_features_entities.pkl')
test_features_no_er = _load_pickle('datasets/pickle/test_features_no_er.pkl')
test_labels = _load_pickle('datasets/pickle/test_labels.pkl')

In [None]:
def differences(model, train_features, test_features):
    """Train ``model`` behind a DictVectorizer and return the misclassified test rows.

    The labels are not parameters: the function reads the notebook-level
    ``train_labels`` and ``test_labels`` variables.

    Parameters
    ----------
    model : estimator used as the final 'classifier' step of the pipeline.
    train_features : training samples fed to ``Pipeline.fit``.
    test_features : test samples fed to ``Pipeline.predict``.

    Returns
    -------
    DataFrame
        One row per test sample whose prediction differs from its label, with
        columns 'features', 'actual_labels', 'predicted_labels' and the overall
        'accuracy', 'f1', 'precision' and 'recall' repeated on every row.
    """
    steps = [
        ('vectorizer', DictVectorizer()),
        ('classifier', model),
    ]
    clf = Pipeline(steps)
    clf.fit(train_features, train_labels)

    predicted = clf.predict(test_features)

    # Overall scores on the whole test set (weighted averaging for the per-class metrics).
    overall_accuracy = accuracy_score(test_labels, predicted)
    overall_f1 = f1_score(test_labels, predicted, average='weighted')
    overall_precision = precision_score(test_labels, predicted, average='weighted')
    overall_recall = recall_score(test_labels, predicted, average='weighted')

    report = pd.DataFrame({
        'features': test_features,
        'actual_labels': test_labels,
        'predicted_labels': predicted,
        'accuracy': overall_accuracy,
        'f1': overall_f1,
        'precision': overall_precision,
        'recall': overall_recall,
    })

    # Keep only the rows the classifier got wrong.
    is_wrong = report['actual_labels'] != report['predicted_labels']
    return report[is_wrong]

#### LR All features

In [None]:
# Logistic regression on the full feature set; save the misclassified test rows.
# scikit-learn names the inverse regularisation strength `C` (upper case);
# the lower-case `c` keyword is rejected with a TypeError.
results = differences(LogisticRegression(C=0.1, penalty='l2', solver='liblinear'), train_features, test_features)
# Written relative to the notebook like every other datasets/ path here
# (the previous '/datasets/...' pointed at the filesystem root).
results.to_csv('datasets/exploration/lr.csv')

### SVM Embeddings

In [None]:
# RBF-kernel SVM; save the misclassified test rows.
# scikit-learn names the regularisation parameter `C` (upper case);
# the lower-case `c` keyword is rejected with a TypeError.
# NOTE(review): the section heading mentions embeddings, but the plain
# train_features/test_features are passed rather than the *_embeddes sets — confirm.
results = differences(SVC(C=1.0, kernel='rbf'), train_features, test_features)
# Written relative to the notebook like every other datasets/ path here
# (the previous '/datasets/...' pointed at the filesystem root).
results.to_csv('datasets/exploration/svc.csv')

### CountVectorized

In [None]:
# Count-vectorised train/test splits for n-gram upper bounds of 1, 2 and 3.
countVectorizerResults, countVectorizerResults2, countVectorizerResults3 = (
    countVectorizerAux(data, max_n) for max_n in (1, 2, 3)
)

#### NB 

##### Unigram

In [None]:
# Multinomial Naive Bayes on the split stored in countVectorizerResults.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Multinomial Naive Bayes on the split stored in countVectorizerResults2.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Multinomial Naive Bayes on the split stored in countVectorizerResults3.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(countVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### DTC

##### Unigram

In [None]:
# Decision tree on the split stored in countVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Decision tree on the split stored in countVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(countVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Decision tree on the split stored in countVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(countVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### LR

##### Unigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in countVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in countVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(countVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in countVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(countVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### RF

##### Unigram

In [None]:
# Random forest on the split stored in countVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(countVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Random forest on the split stored in countVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(countVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Random forest on the split stored in countVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(countVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### SVM

##### Unigram

In [None]:
# RBF-kernel SVM on the split stored in countVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
# SVC's regularisation parameter is `C` (upper case); the previous `c=10`
# keyword is rejected by the constructor with a TypeError.
results_nb_simple = train_test_model(
    SVC(C=10, kernel='rbf'),
    countVectorizerResults['X_train'],
    countVectorizerResults['y_train'],
    countVectorizerResults['X_test'],
    countVectorizerResults['y_test'],
)
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# RBF-kernel SVM on the split stored in countVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
# SVC's regularisation parameter is `C` (upper case); the previous `c=10`
# keyword is rejected by the constructor with a TypeError.
results_nb_simple = train_test_model(
    SVC(C=10, kernel='rbf'),
    countVectorizerResults2['X_train'],
    countVectorizerResults2['y_train'],
    countVectorizerResults2['X_test'],
    countVectorizerResults2['y_test'],
)
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# RBF-kernel SVM on the split stored in countVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
# SVC's regularisation parameter is `C` (upper case); the previous `c=10`
# keyword is rejected by the constructor with a TypeError.
results_nb_simple = train_test_model(
    SVC(C=10, kernel='rbf'),
    countVectorizerResults3['X_train'],
    countVectorizerResults3['y_train'],
    countVectorizerResults3['X_test'],
    countVectorizerResults3['y_test'],
)
print(results_nb_simple['accuracy'], results_nb_simple['precision'], results_nb_simple['recall'], results_nb_simple['f1'])
print(results_nb_simple['confusion_matrix'])

### TFIDF Vectorized

In [None]:
# TF-IDF train/test splits for n-gram upper bounds of 1, 2 and 3.
# This cell previously called countVectorizerAux, so the tfidfVectorizerResults*
# variables held plain count features and tfidfVectorizerAux was never used.
tfidfVectorizerResults = tfidfVectorizerAux(data, 1)
tfidfVectorizerResults2 = tfidfVectorizerAux(data, 2)
tfidfVectorizerResults3 = tfidfVectorizerAux(data, 3)

#### NB

##### Unigram

In [None]:
# Multinomial Naive Bayes on the split stored in tfidfVectorizerResults.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(tfidfVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Multinomial Naive Bayes on the split stored in tfidfVectorizerResults2.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(tfidfVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Multinomial Naive Bayes on the split stored in tfidfVectorizerResults3.
results_nb_simple = train_test_model(
    MultinomialNB(),
    *(tfidfVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### DTC

##### Unigram

In [None]:
# Decision tree on the split stored in tfidfVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(tfidfVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Decision tree on the split stored in tfidfVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(tfidfVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Decision tree on the split stored in tfidfVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    DecisionTreeClassifier(max_depth=None, min_samples_split=5),
    *(tfidfVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### LR

##### Unigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in tfidfVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(tfidfVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in tfidfVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(tfidfVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# L2 logistic regression (liblinear) on the split stored in tfidfVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    LogisticRegression(penalty='l2', solver='liblinear'),
    *(tfidfVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

#### RF

##### Unigram

In [None]:
# Random forest on the split stored in tfidfVectorizerResults
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(tfidfVectorizerResults[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Bigram

In [None]:
# Random forest on the split stored in tfidfVectorizerResults2
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(tfidfVectorizerResults2[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])

##### Trigram

In [None]:
# Random forest on the split stored in tfidfVectorizerResults3
# (the result variable keeps the shared name results_nb_simple).
results_nb_simple = train_test_model(
    RandomForestClassifier(max_depth=None, min_samples_split=10),
    *(tfidfVectorizerResults3[part] for part in ('X_train', 'y_train', 'X_test', 'y_test')),
)
print(*(results_nb_simple[metric] for metric in ('accuracy', 'precision', 'recall', 'f1')))
print(results_nb_simple['confusion_matrix'])