In [27]:
import pandas as pd
import numpy as np

# Toggle: True = run on Google Colab with Google Drive mounted, False = run from the current directory.
use_drive = True
if use_drive:
  # Project root inside the mounted Drive; read_data() prepends PATH to the dataset location.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  # Switch the working directory to the project folder, so files opened with
  # relative names in later cells are resolved against it, then list its contents.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  # Local run: the "data/" folder is looked up relative to the current working directory.
  PATH = "./"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CIL 2022
 analyze_data.ipynb			 minimal_preprocessing.ipynb
'Copia de BoW_v2.ipynb'			 new_data
'Copia de Linear_Models.ipynb'		 text_to_csv.ipynb
'Copia de minimal_preprocessing.ipynb'	 wordCloudneg.png
'Copia de text_to_csv.ipynb'		 wordCloudpos.png
 data					'Word embeddings'
 Linear_Models.ipynb


# Choose the Preprocessing

In [28]:
# HYPERPARAMETERS

# Available preprocessing variants. The chosen name is used by read_data() as the
# sub-folder of "data/" from which the train/val files are loaded.
PREPROCESSING_OPTIONS = [
  "raw",
  "minimal_preprocessing",
  "no-stemming_no-lemmatize_no-stopwords_no-spellcorrect",
  "no-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
  "no-stemming_no-lemmatize_with-stopwords_with-spellcorrect",
  "no-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
  "no-stemming_with-lemmatize_with-stopwords_with-spellcorrect",
  "with-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
  "with-stemming_with-lemmatize_no-stopwords_with-spellcorrect",
  "with-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
]

# Must be one entry of PREPROCESSING_OPTIONS (index 1 is "minimal_preprocessing").
PREPROCESSING_CHOICE = PREPROCESSING_OPTIONS[1]

# Choose the Algorithms

In [40]:
# --- Feature extraction -------------------------------------------------------
# Vectorizer: 0 - BoW, 1 - TF-IDF
Vectorizer = 1

# Upper bound of the n-gram range; the vectorizer is built with ngram_range=(1, n_gram).
n_gram = 2
# Only for TF-IDF. The value 1 is special-cased by the feature-extraction cell:
# max_df is then not passed to the vectorizer at all.
max_df = 1
# Passed to the vectorizer as max_features.
max_features = 50000

# --- Feature selection --------------------------------------------------------
# Selection: 0 - None, 1 - F-score, 2 - Chi2, 3 - Mutual information, 4 - Variance Threshold
# mutual information can only be used with BoW
Selection = 2

# Number of features kept by the SelectKBest-based options (1, 2, 3).
k = 5000
# Threshold passed to VarianceThreshold (option 4 only).
v = 0.001

# --- Classifier ---------------------------------------------------------------
# Classifier: 0 - MultinomialNB, 1 - SVM, 2 - LogisticReg
Classifier = 1

# Passed as C to LinearSVC / LogisticRegression.
C = 1


# Load the data

In [41]:
def read_file_and_strip(filename, encoding="utf-8"):
  """Read a text file and return its lines with surrounding whitespace removed.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the locale of the machine running the notebook.

  Returns:
    A NumPy array with one element per line of the file, each stripped of
    leading/trailing whitespace (including the trailing newline).

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file content is not valid in `encoding`.
  """
  with open(filename, encoding=encoding) as file:
    return np.asarray([line.strip() for line in file])

def read_data():
  """Load the train/validation split of the selected preprocessing variant.

  Files are read from PATH + "data/" + PREPROCESSING_CHOICE + "/".

  Returns:
    A tuple (train_sentences, train_labels, val_sentences, val_labels); the
    sentence entries are arrays of stripped lines and the label entries are
    the corresponding lines converted to int.
  """
  dataset_path = PATH + "data/" + PREPROCESSING_CHOICE + "/"

  def load(name):
    # Every file of the split lives directly inside the variant's folder.
    return read_file_and_strip(dataset_path + name)

  return (
    load("train_sentences.txt"),
    load("train_labels.txt").astype(int),
    load("val_sentences.txt"),
    load("val_labels.txt").astype(int),
  )

train_sentences, train_labels, val_sentences, val_labels = read_data()

# Feature Extraction

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Build the vectorizer selected by `Vectorizer` (0 - BoW, anything else - TF-IDF).
# Both variants honour `max_features` and `n_gram`; previously the BoW branch
# silently ignored `n_gram` and always produced unigrams only.
if Vectorizer == 0:
  vectorizer = CountVectorizer(max_features=max_features,
                               ngram_range=(1, n_gram))
else:
  tfidf_options = dict(max_features=max_features, ngram_range=(1, n_gram))
  # scikit-learn reads an integer max_df as an absolute document count, so
  # forwarding the config value 1 would discard every term that occurs in more
  # than one document. The value 1 therefore means "keep the vectorizer default".
  if max_df != 1:
    tfidf_options["max_df"] = max_df
  vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary on the training sentences only, then reuse it for validation.
X_train = vectorizer.fit_transform(train_sentences)
X_val = vectorizer.transform(val_sentences)

y_train = train_labels
y_val = val_labels

# Feature selection

In [43]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, VarianceThreshold

# Map each `Selection` code to a factory for the matching selector. Factories are
# used so that only the chosen selector is ever instantiated; code 0 (or any
# unlisted value) means "no feature selection".
selector_factories = {
  1: lambda: SelectKBest(k=k),                       # F-score (SelectKBest default)
  2: lambda: SelectKBest(chi2, k=k),                 # Chi2
  3: lambda: SelectKBest(mutual_info_classif, k=k),  # Mutual information
  4: lambda: VarianceThreshold(v),                   # Variance threshold
}

make_selector = selector_factories.get(Selection)
selection = make_selector() if make_selector is not None else None

if selection is not None:
  # Fit on the training split only, then apply the same column mask to both splits.
  X_train = selection.fit(X_train, y_train).transform(X_train)
  X_val = selection.transform(X_val)


# Classification

In [44]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Instantiate the classifier selected by `Classifier`:
# 0 - MultinomialNB, 1 - LinearSVC, any other value - LogisticRegression.
if Classifier == 1:
  model = LinearSVC(C=C)
elif Classifier == 0:
  model = MultinomialNB()
else:
  model = LogisticRegression(C=C)

# Train on the (possibly feature-selected) training matrix and predict the
# labels of the validation split.
y_val_pred = model.fit(X_train, y_train).predict(X_val)

# Results

In [45]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

def print_results(y_val, y_val_pred):
  """Print accuracy, recall, precision, F1 and ROC AUC for a set of predictions.

  Args:
    y_val: Ground-truth labels.
    y_val_pred: Predicted labels, compared element-wise against y_val.

  Note:
    The ROC AUC line is computed from the same y_val_pred values as the other
    metrics (predicted labels as passed in, not separate decision scores).
  """
  metrics = (
    ('Acc', accuracy_score),
    ('Recall', recall_score),
    ('Precision', precision_score),
    ('F1', f1_score),
    ('ROC_AUC', roc_auc_score),
  )
  for label, metric in metrics:
    print(f'{label}: {metric(y_val, y_val_pred)}')

print_results(y_val, y_val_pred)


Acc: 0.7982902369092134
Recall: 0.8473831402317219
Precision: 0.7693888902324006
F1: 0.8065047657675929
ROC_AUC: 0.7986719046903011


# Analysis of the classification


In [49]:
# Extracted features: dump the vectorizer vocabulary, separated by single spaces.
# NOTE(review): with n_gram > 1 the feature names themselves contain spaces, so
# the entries of this file cannot be split back unambiguously.
vectorizer.get_feature_names_out().tofile("extracted_features.txt", sep=" ")

# Misclassifications: one CSV row per wrongly classified validation sentence,
# holding the sentence text and the label the model predicted for it.
import csv

# Boolean mask that is True where the prediction differs from the true label.
errors = np.not_equal(y_val, y_val_pred)

# The rows are taken from `val_sentences` (the text), not from `X_val`: X_val
# holds the vectorized feature rows, so the previous loop wrote matrix reprs
# instead of tweets. The sentences are already stripped when loaded, so nothing
# is cut off the end of each sentence here.
# The `with` block guarantees the file is closed even if writing fails.
with open("errors.csv", "w", newline='', encoding="utf-8") as ferrors:
  writer = csv.writer(ferrors)
  for sentence, predicted_label in zip(val_sentences[errors], y_val_pred[errors]):
    writer.writerow([sentence, predicted_label])